# Imports

In [1]:
import os

import pandas as pd
from transformers import TokenClassificationPipeline, DistilBertForTokenClassification, BertTokenizerFast
from tokenizers import Tokenizer, decoders

from evaluation.evaluation import PipelinePredictor, POSEvaluator

# Constants

In [2]:
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '<CLS>'
SEP_TOKEN = '<SEP>'
MASK_TOKEN = '<MASK>'
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN]
SPECIAL_TOKENS

['<UNK>', '<PAD>', '<CLS>', '<SEP>', '<MASK>']

In [3]:
TOKENIZER_PATH = os.path.join('models', 'tokenizers')
WORDPIECE_TOKENIZER_PATH = os.path.join(TOKENIZER_PATH, 'wordpiece_tokenizer')
MODEL_PATH = os.path.join('models', 'results')

# Loading Model

In [4]:
model = DistilBertForTokenClassification.from_pretrained(os.path.join(MODEL_PATH ,'checkpoint-31000'))
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(6000, 64, padding_idx=1)
      (position_embeddings): Embedding(200, 64)
      (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=64, out_features=64, bias=True)
            (k_lin): Linear(in_features=64, out_features=64, bias=True)
            (v_lin): Linear(in_features=64, out_features=64, bias=True)
            (out_lin): Linear(in_features=64, out_features=64, bias=True)
          )
          (sa_layer_norm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Lin

In [5]:
tokenizer = Tokenizer.from_file(os.path.join(WORDPIECE_TOKENIZER_PATH, 'wordpiece_6000.json'))
tokenizer.decoder = decoders.WordPiece(prefix="##")
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer, unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    cls_token=CLS_TOKEN,
    sep_token=SEP_TOKEN,
    mask_token=MASK_TOKEN, model_input_names=['input_ids', 'attention_mask'])

In [6]:
recognizer = TokenClassificationPipeline(model=model.to('cpu'), tokenizer=tokenizer)

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [7]:
predictor =  PipelinePredictor(recognizer)

In [8]:
TEST_PATH = os.path.join('data', 'test')
EVALUATIONS_PATH = os.path.join('evaluation', 'evaluations')

In [9]:
test_df = pd.read_orc(os.path.join(TEST_PATH, 'test_1.orc'), dtype_backend='pyarrow')

In [13]:
import json
import os
from collections import defaultdict
from dataclasses import dataclass
from typing import Protocol, Tuple, List, Dict
import torch.nn as nn
from transformers import TokenClassificationPipeline

import pandas as pd
from sklearn.metrics import classification_report
from hmmlearn import hmm

EVALUATION_PATH = 'evaluation/evaluations'


class Predictor(Protocol):

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        pass


@dataclass
class HMMPredictor:
    model: hmm.CategoricalHMM

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        data['predictions'] = data.mod_words.apply(lambda l: self.model.predict(l))
        return data
    
@dataclass
class PipelinePredictor:
    pipeline: TokenClassificationPipeline
    
    def _combine_entities_with_average(self, subword_entities, sent):
        grouped_entities = []

        entities = []
        current_entity = ""
        current_tokens = defaultdict(int)
        for entity in subword_entities:
            token = entity['word']
            tag = entity['entity']
            score = entity['score']
            
            if token.startswith('##'):
                current_entity += token[2:]
                current_tokens[tag] += score
            else:
                if current_entity:
                    entities.append((current_entity, max(current_tokens, key=lambda x: current_tokens[x])))
                    
                current_entity = token
                current_tokens.clear()
                current_tokens[tag] = score
                
        if current_entity:
            entities.append((current_entity, max(current_tokens, key=lambda x: current_tokens[x])))
        
        if len(sent) != len(entities):
            print(sent)
            print(entities)
            print()
        return entities

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        data['predictions'] = data.mod_words.apply(lambda l: [tag for word, tag in self._combine_entities_with_average(self.pipeline(" ".join(l)), l)])
        return data


@dataclass
class POSEvaluator:
    df: pd.DataFrame
    model_name: str
    path: os.PathLike

    def __evaluate_category(self, data: pd.DataFrame, category: str) -> Dict[str, Dict[str, int]]:

        groupped = data.groupby(category)
        results = {}
        # print(dict(list(groupped)))

        for type, data in dict(list(groupped)).items():
            y_true = [p for l in data.pos_tags.tolist() for p in l]
            y_pred = data.predictions.explode().tolist()
                
            results[type] = classification_report(y_true, y_pred, output_dict=True)
                
        return results

    def evaluate(self, predictor: Predictor, categories: List[str], worst: int = 3) -> None:
        result_df = predictor.predict(self.df)

        eval = {}
        for i, category in enumerate(categories):
            eval[category] = self.__evaluate_category(result_df, category)

        with open(os.path.join(self.path, self.model_name, 'eval.json'), "w") as f:
            json.dump(eval, f, indent=3, ensure_ascii=False)


In [14]:
evaluator = POSEvaluator(test_df, 'transformers', EVALUATIONS_PATH)

In [15]:
predictor =  PipelinePredictor(recognizer)

In [16]:
evaluator.evaluate(predictor, ['starts_with', 'unique_pos'])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


['მეფემ' 'აღარ' 'იცოდა' ',' 'როგორი' 'მადლი' 'გადაეხადა' 'წერეთლისათვის'
 'და' 'დიდძალი' 'ყმა' 'და' 'მამული' 'უბოძა' 'ზემო-ქართლში' '.']
[('მეფემ', 'N'), ('აღარ', 'Adv'), ('იცოდა', 'V'), (',', 'Punct'), ('როგორი', 'Pron'), ('მადლი', 'N'), ('გადაეხადა', 'V'), ('წერეთლისათვის', 'N'), ('და', 'Cj'), ('დიდძალი', 'N'), ('ყმა', 'N'), ('და', 'Cj'), ('მამული', 'N'), ('უბოძა', 'N'), ('ზემო', 'N'), ('-', 'N'), ('ქართლში', 'N'), ('.', 'Punct')]

['ასე' 'მოუხსენებია' 'ოდესღაც' 'ვიღაც' 'ბერს' 'და' 'შემდეგ' 'დონ-კიხოტის'
 'მთქმელს' 'სერვანტესსაც' 'გაუმეორებია' '…' 'სთქვა' 'მწერლობასა' 'შინა'
 'განსწავლულმა' 'ჭავჭავაძემ' 'და' 'ვახშმის' 'დასაჭედავად' 'გავიდა' '.']
[('ასე', 'Adv'), ('მოუხსენებია', 'V'), ('ოდესღაც', 'Adv'), ('ვიღაც', 'Pron'), ('ბერს', 'N'), ('და', 'Cj'), ('შემდეგ', 'Adv'), ('დონ', 'N'), ('-', 'N'), ('კიხოტის', 'N'), ('მთქმელს', 'N'), ('სერვანტესსაც', 'N'), ('გაუმეორებია', 'V'), ('…', 'Punct'), ('სთქვა', 'V'), ('მწერლობასა', 'N'), ('შინა', 'A'), ('განსწავლულმა', 'A'), ('ჭავჭავაძემ', 'N'),


KeyboardInterrupt

