In [None]:
import sys
# Add ../src/ to the module search path so that modules located there can be
# imported by the cells below.
sys.path.append('../src/')

In [None]:
import helper as h
import imp

In [None]:
# Re-import the helper module so edits made to it on disk are picked up without
# restarting the kernel. `importlib.reload` is used because the `imp` module is
# deprecated and no longer exists in Python 3.12+.
import importlib

importlib.reload(h)

In [None]:
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel

In [None]:
import pandas as pd

In [None]:
# Load the annotated corpus; the cells below read its 'source' and 'ner' columns.
df = pd.read_json('../data/df_final_5.json')

In [None]:
# Display the loaded frame for a quick visual check.
df

In [None]:
# Collect every distinct entity label that occurs in the corpus.
entity_labels = (
    df['ner']
    .explode()
    .dropna()  # a row with an empty entity list explodes to NaN, which has no 'label'
    .map(lambda ent: ent['label'])
    .unique()
)
# 'O' (token outside any entity) goes first so it always gets class id 0; the
# remaining labels are sorted to make the id assignment deterministic.
label_names = ['O', *sorted(entity_labels)]
# ClassLabel provides the str <-> int conversion used when building token labels.
cl = ClassLabel(names=label_names)

In [None]:
# Hold out 20% of the rows for validation. The fixed seed keeps the split
# identical across kernel restarts, so runs stay comparable and the saved
# `val_ids` always describe the same rows.
val_ids = df.sample(frac=0.2, random_state=42).index.tolist()
ner_frame = df[['source', 'ner']]
dataset = DatasetDict({
    # Both splits are built the same way; preserve_index=False keeps the pandas
    # index from being added as an extra column, so the splits share one schema.
    'train': Dataset.from_pandas(ner_frame.drop(index=val_ids), preserve_index=False),
    'validation': Dataset.from_pandas(ner_frame.loc[val_ids], preserve_index=False),
})

In [None]:
# Display the train/validation splits built above.
dataset

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer of the same checkpoint that is fine-tuned further below.
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

In [None]:
# Display the tokenizer configuration.
tokenizer

In [None]:
def _align_entity_labels(word_ids_list, offsets_list, ents):
    """Return one label id per token for a single tokenized sample.

    Args:
        word_ids_list: per-token word ids; ``None`` marks a special token.
        offsets_list: per-token ``(start, end)`` character offsets into the text.
        ents: entity dicts with 'startOffset', 'endOffset' and 'label' keys;
            may be empty or ``None``. 'endOffset' is treated as exclusive, the
            same convention the token offsets are compared with.

    Returns:
        A list with -100 for special tokens, the entity's class id for tokens
        lying completely inside an entity span, and the id of 'O' otherwise.
    """
    # Sort so the single forward-moving cursor below is valid even if the
    # annotations are not stored in text order.
    spans = sorted(ents or [], key=lambda ent: ent['startOffset'])
    outside_id = cl.str2int('O')
    adjusted_label_ids = []
    i_ent = 0

    for wid, (wstart, wend) in zip(word_ids_list, offsets_list):
        if wid is None:
            # Special tokens get -100 so they are excluded from loss and metrics.
            adjusted_label_ids.append(-100)
            continue
        # Skip every entity that ends at or before this token. `>=` together
        # with the loop lets a token that starts exactly where the previous
        # entity ends be matched against the next (adjacent) entity.
        while i_ent < len(spans) and wstart >= spans[i_ent]['endOffset']:
            i_ent += 1
        inside = (
            i_ent < len(spans)
            and wstart >= spans[i_ent]['startOffset']
            and wend <= spans[i_ent]['endOffset']
        )
        if inside:
            adjusted_label_ids.append(cl.str2int(spans[i_ent]['label']))
        else:
            adjusted_label_ids.append(outside_id)

    return adjusted_label_ids


def tokenize_adjust_labels(all_samples_per_split):
    """Tokenize a batch of texts and attach token-level entity labels.

    Meant to be used with ``dataset.map(..., batched=True)``: the tokenizer
    output alone is not a datasets object, so the new keys (including the
    aligned "labels") are returned here and added to each split by ``map``.

    Args:
        all_samples_per_split: batch with a "source" list of texts and a "ner"
            list holding the entity annotations of each text.

    Returns:
        The tokenizer output (including "offset_mapping") extended with a
        "labels" entry that holds one label-id list per sample.
    """
    tokenized_samples = tokenizer.batch_encode_plus(all_samples_per_split["source"], is_split_into_words=False,
                                                    return_offsets_mapping=True)
    total_adjusted_labels = [
        _align_entity_labels(
            tokenized_samples.word_ids(batch_index=k),
            tokenized_samples['offset_mapping'][k],
            all_samples_per_split['ner'][k],
        )
        for k in range(len(tokenized_samples["input_ids"]))
    ]
    tokenized_samples["labels"] = total_adjusted_labels
    return tokenized_samples

# Tokenize every split in batches; map() adds the keys returned by
# tokenize_adjust_labels (including "labels") to each split.
tokenized_dataset = dataset.map(tokenize_adjust_labels, batched=True)

In [None]:
# Sanity check: show one training example token by token next to its label.
item = tokenized_dataset['train'][5]
source = item['source']
# Recover each token's surface text by slicing the source with its offsets.
tokens = [source[start:end] for start, end in item['offset_mapping']]
pd.DataFrame({
    'tokens': tokens,
    # -100 marks tokens that carry no label; show them as an empty string.
    'labels': ['' if lab == -100 else cl.int2str(lab) for lab in item['labels']],
})

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below; built from the same tokenizer that
# produced the inputs.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import numpy as np
from datasets import load_metric
# seqeval metric used by compute_metrics below.
# NOTE(review): `load_metric` is reported as deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package — confirm against the
# installed version before upgrading.
metric = load_metric("seqeval")
def compute_metrics(p):
    """Turn raw model outputs into a flat dict of seqeval scores.

    Args:
        p: pair ``(predictions, labels)``; the class is taken as the argmax
            over axis 2 of ``predictions``, and positions whose label is -100
            are ignored.

    Returns:
        Dict with the four "overall_*" scores plus one "<name>_f1" entry for
        every other key reported by the metric.
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=2)

    # Keep only positions with a real label (-100 marks ignored tokens) and
    # translate class ids back to label names.
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(predicted, gold):
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_names[pred_id] for pred_id, _ in kept])
        true_labels.append([label_names[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    overall_keys = ("overall_precision", "overall_recall", "overall_f1", "overall_accuracy")
    flattened_results = {key: results[key] for key in overall_keys}
    # Every remaining entry is a per-label score dict; keep only its F1.
    for name, per_label in results.items():
        if name in flattened_results:
            continue
        flattened_results[name + "_f1"] = per_label["f1"]

    return flattened_results

In [None]:
import time

In [None]:
# Store the label names in the model config so that saved checkpoints report
# entity names instead of generic LABEL_<n> placeholders at inference time.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained("DeepPavlov/rubert-base-cased",
                                                        num_labels=len(label_names),
                                                        id2label=id2label,
                                                        label2id=label2id)
training_args = TrainingArguments(
    # Each run writes to its own directory, stamped with the start time (minute resolution).
    output_dir=f"../models/rubert_conv_{time.strftime('%y%m%d-%H%M')}",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    logging_steps = 10,
    report_to="wandb",
    run_name = "rent-ner-15",
    # A checkpoint is written every 10 optimisation steps.
    save_strategy='steps',
    save_steps=10,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# wandb.finish()


In [None]:
import json

In [None]:
# Save the validation row ids next to the checkpoints of this run. The
# directory comes from `training_args.output_dir` instead of a fresh
# time.strftime() call: once training has finished the minute has changed, so a
# re-formatted timestamp would point at a directory that was never created.
import os

os.makedirs(training_args.output_dir, exist_ok=True)
with open(os.path.join(training_args.output_dir, 'val_ids.json'), 'w') as f:
    json.dump(val_ids, f)