# Imports

In [31]:
import os

import pandas as pd
from transformers import TokenClassificationPipeline, DistilBertForTokenClassification, BertTokenizerFast
from tokenizers import Tokenizer, decoders

from evaluation.evaluation import PipelinePredictor, POSEvaluator

# Constants

In [1]:
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
CLS_TOKEN = '<CLS>'
SEP_TOKEN = '<SEP>'
MASK_TOKEN = '<MASK>'
SPECIAL_TOKENS = [UNK_TOKEN, PAD_TOKEN, CLS_TOKEN, SEP_TOKEN, MASK_TOKEN]
SPECIAL_TOKENS

['<UNK>', '<PAD>', '<CLS>', '<SEP>', '<MASK>']

In [4]:
TOKENIZER_PATH = os.path.join('models', 'tokenizers')
WORDPIECE_TOKENIZER_PATH = os.path.join(TOKENIZER_PATH, 'wordpiece_tokenizer')
MODEL_PATH = os.path.join('models', 'results')

# Loading Model

In [6]:
model = DistilBertForTokenClassification.from_pretrained(os.path.join(MODEL_PATH ,'checkpoint-31000'))
model

DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(6000, 64, padding_idx=1)
      (position_embeddings): Embedding(200, 64)
      (LayerNorm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=64, out_features=64, bias=True)
            (k_lin): Linear(in_features=64, out_features=64, bias=True)
            (v_lin): Linear(in_features=64, out_features=64, bias=True)
            (out_lin): Linear(in_features=64, out_features=64, bias=True)
          )
          (sa_layer_norm): LayerNorm((64,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Lin

In [13]:
tokenizer = Tokenizer.from_file(os.path.join(WORDPIECE_TOKENIZER_PATH, 'wordpiece_6000.json'))
tokenizer.decoder = decoders.WordPiece(prefix="##")
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer, unk_token=UNK_TOKEN,
    pad_token=PAD_TOKEN,
    cls_token=CLS_TOKEN,
    sep_token=SEP_TOKEN,
    mask_token=MASK_TOKEN, model_input_names=['input_ids', 'attention_mask'])

In [118]:
recognizer = TokenClassificationPipeline(model=model.to('cpu'), tokenizer=tokenizer)

In [134]:
predictor =  PipelinePredictor(recognizer)

In [120]:
TEST_PATH = os.path.join('data', 'test')
EVALUATIONS_PATH = os.path.join('evaluation', 'evaluations')

In [121]:
test_df = pd.read_orc(os.path.join(TEST_PATH, 'test_1.orc'), dtype_backend='pyarrow')

In [147]:
import json
import os
from dataclasses import dataclass
from typing import Protocol, Tuple, List, Dict
import torch.nn as nn
from transformers import TokenClassificationPipeline

import pandas as pd
from sklearn.metrics import classification_report
from hmmlearn import hmm

EVALUATION_PATH = 'evaluation/evaluations'


class Predictor(Protocol):

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        pass


@dataclass
class HMMPredictor:
    model: hmm.CategoricalHMM

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        data['predictions'] = data.mod_words.apply(lambda l: self.model.predict(l))
        return data
    
@dataclass
class PipelinePredictor:
    pipeline: TokenClassificationPipeline
    
    def _combine_entities_with_average(self, entities):
        grouped_entities = {}

        for entry in entities:
            word = entry['word']
            entity = entry['entity']
            score = entry['score']

            if word not in grouped_entities:
                grouped_entities[word] = {'entities': [], 'total_score': 0, 'count': 0}

            grouped_entities[word]['entities'].append({'entity': entity, 'score': score})
            grouped_entities[word]['total_score'] += score
            grouped_entities[word]['count'] += 1

        combined_entities = []

        for word, values in grouped_entities.items():
            average_score = values['total_score'] / values['count']
            best_entity = max(values['entities'], key=lambda x: x['score'])

            combined_entities.append({'word': word, 'entity': best_entity['entity'], 'average_score': average_score})

        return combined_entities

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        data['predictions'] = data.mod_words.apply(lambda l: [word['entity'] for word in self._combine_entities_with_average(self.pipeline(" ".join(l)))])
        return data


@dataclass
class POSEvaluator:
    df: pd.DataFrame
    model_name: str
    path: os.PathLike

    def __evaluate_category(self, data: pd.DataFrame, category: str) -> Dict[str, Dict[str, int]]:

        groupped = data.groupby(category)
        results = {}
        # print(dict(list(groupped)))

        for type, data in dict(list(groupped)).items():
            y_true = [p for l in data.mod_words.tolist() for p in l]
            y_pred = data.predictions.explode().tolist()
            results[type] = classification_report(y_true, y_pred, output_dict=True)
            
            if len(y_true) != len(y_pred):
                print(data.mod_words.tolist())
                print(data.predictions.tolist())
        return results

    def evaluate(self, predictor: Predictor, categories: List[str], worst: int = 3) -> None:
        result_df = predictor.predict(self.df)

        eval = {}
        for i, category in enumerate(categories):
            eval[category] = self.__evaluate_category(result_df, category)

        with open(os.path.join(self.path, self.model_name, 'eval.json'), "w") as f:
            json.dump(eval, f, indent=3, ensure_ascii=False)


In [148]:
evaluator = POSEvaluator(test_df[:10], 'transformer', EVALUATIONS_PATH)

In [149]:
evaluator.evaluate(predictor, ['starts_with', 'unique_pos'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['predictions'] = data.mod_words.apply(lambda l: [word['entity'] for word in self._combine_entities_with_average(self.pipeline(" ".join(l)))])


ValueError: Found input variables with inconsistent numbers of samples: [10, 18]