In [None]:
import torch
import seaborn as sns
import pandas as pd
import transformers
import evaluate
from transformers import pipeline, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk
from collections import Counter, defaultdict
from transformers import DataCollatorWithPadding
from pathlib import Path
import numpy as np
from sklearn.metrics import f1_score, classification_report, accuracy_score
from tqdm.auto import tqdm
import re
from nltk.tokenize import sent_tokenize
from sklearn.linear_model import LinearRegression
sns.set()

In [None]:
#pd.read_csv('../animacy.csv')

In [None]:
# Load the tab-separated annotation file with the generic "csv" loader.
# `load_dataset` is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
dataset = load_dataset("csv", data_files="../LwM-nlp-animacy-annotations-machines19thC.tsv", sep='\t')

In [None]:
# Display the object returned by load_dataset.
dataset

In [None]:
# Keep only the 'train' entry of the loaded object.
# NOTE(review): this rebinds `dataset`, so re-running the cell applies
# ['train'] a second time to the already-unwrapped object; re-run the load
# cell first.
dataset = dataset['train']

## Classify Animacy

In [None]:
def pred_data(example,add_field='Date'):
    """Build the three prefixed input variants for one dataset row.

    Args:
        example: Mapping with a 'Sentence' string and the field named by
            `add_field`.
        add_field: Key whose value is prepended to the sentence.

    Returns:
        A dict with three strings:
        'st_year_sep' -> '[<value>] [SEP] <sentence>'
        'year_sep'    -> '<value> [SEP] <sentence>'
        'year_date'   -> '<value> [DATE] <sentence>'
    """
    value = example[add_field]
    sentence = example['Sentence']

    bracketed = f'[{value}]'
    plain = str(value)

    # Plain concatenation is kept on purpose: a non-string sentence raises
    # instead of being silently formatted into the text.
    return {
        'st_year_sep': bracketed + ' [SEP] ' + sentence,
        'year_sep': plain + ' [SEP] ' + sentence,
        'year_date': plain + ' [DATE] ' + sentence,
    }
    
# Add the 'st_year_sep', 'year_sep' and 'year_date' text columns to every row
# (pred_data is applied per example, using 6 worker processes).
dataset = dataset.map(pred_data , num_proc=6)

In [None]:
# Peek at one generated input string. Index the row first so only that row is
# read, instead of reading the whole column to pick a single value.
dataset[1]['st_year_sep']

In [None]:
# Number of examples in the dataset.
len(dataset)

In [None]:
# Map each distinct 'animacy' value to an integer class id.
# The values are sorted so the mapping does not depend on the order in which
# they first appear in the file; otherwise which class ends up as id 1 (the
# positive class for the binary F1 computed later) would be arbitrary.
# key=str keeps the sort well-defined even if a missing value (None) is mixed
# in with the annotated values.
lab2code = {label: i for i, label in enumerate(sorted(dataset.unique('animacy'), key=str))}
num_labels = len(lab2code)

# Add the integer 'label' column expected by the sequence-classification model.
dataset = dataset.map(lambda x: {'label': lab2code[x['animacy']]})

In [None]:
# Display the dataset after the 'label' column has been added.
dataset

In [None]:
# Hold out 30% of the examples (rounded down) as the test set.
test_size = int(0.3 * len(dataset))
train_test = dataset.train_test_split(test_size=test_size, seed=42)
test_set = train_test['test']

# Split 5% of the remaining training examples off for validation.
# In `train_val`, the 'test' key is the validation split.
train_pool = train_test['train']
val_size = int(0.05 * len(train_pool))
train_val = train_pool.train_test_split(test_size=val_size, seed=42)

In [None]:
# Persist the train/test split to disk. The train/validation split
# (`train_val`) is not saved by this call.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
train_test.save_to_disk('/datadrive_2/animacy_split')

In [None]:
# One entry per model to fine-tune:
# (run name, checkpoint id or local path, separator token, dataset column
#  holding the input text for that model).
checkpoints = [('distilbert','distilbert-base-uncased','[SEP]','year_sep'),
               ('hmd_distilbert','/datadrive_2/bnert-hmd','[SEP]','year_sep'),
               ('bnert-time-st-y','/datadrive_2/bnert-time-st-y','[SEP]','st_year_sep'),
               ('bnert-time-y','/datadrive_2/bnert-time-y','[DATE]','year_date'),
               ('bnert-time-y_masked_25','/datadrive_2/bnert-time-y_masked_25','[DATE]','year_date'),
               ('bnert-time-y_masked_75','/datadrive_2/bnert-time-y_masked_75','[DATE]','year_date'),
               ('bnert-pol-st','/datadrive_2/bnert-pol-st','[SEP]','year_sep'),
               ('bnert-pol','/datadrive_2/bnert-pol','[SEP]','year_sep'),
               ('bnert-comb','/datadrive_2/bnert-combined','[SEP]','year_sep')
              ]

# name -> {'model': ..., 'tokenizer': ..., 'sentences': <input column name>}
model_dict = defaultdict(dict)
for name,checkpoint, st, sent_col in checkpoints:
    # Seed right before each load so that any weights not contained in the
    # checkpoint (e.g. a freshly created classification head) are initialised
    # reproducibly, and independently of the order of `checkpoints`.
    # Same seed value as the data splits and TrainingArguments.
    transformers.set_seed(42)
    model_dict[name]['model'] = AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=num_labels)
    model_dict[name]['tokenizer'] = AutoTokenizer.from_pretrained(checkpoint)
    # The separator token `st` is not stored: the input column already
    # contains it as text.
    #model_dict[name]['special_token'] = st
    model_dict[name]['sentences'] = sent_col

In [None]:
# Display the train/validation splits before the unused columns are dropped.
train_val

In [None]:
# Drop the annotation/metadata columns that are not used for training.
# Only columns that are still present are removed, so the cell can be re-run
# without failing on columns that are already gone.
columns_to_drop = ['Unnamed: 0','SentenceCtxt', 'SentenceId', 'TargetExpression','animacy', 'humanness']
present_columns = [col for col in columns_to_drop if col in train_val['train'].column_names]
if present_columns:
    train_val = train_val.remove_columns(present_columns)
train_val

In [None]:
#def add_text_col(example,source):
#    return {'text' : example[source]}

def preprocess_function(examples, target_col):
    """Tokenize the text stored under `target_col`, with truncation enabled.

    Note: this reads the module-level name `tokenizer`, which must be bound
    to the tokenizer of the model currently being processed before the
    function is called.

    Args:
        examples: Mapping that contains `target_col`.
        target_col: Name of the column holding the text to tokenize.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    text = examples[target_col]
    encoded = tokenizer(text, truncation=True)
    return encoded

In [None]:
# model name -> {'f1_binary', 'f1_macro', 'f1_micro', 'accuracy'} on the test set
result_dict = defaultdict(dict)

for name, mdict in model_dict.items():
    print(f'Creating a model for {name}')
    # `tokenizer` has to stay a module-level name: preprocess_function reads
    # it as a global.
    tokenizer = mdict['tokenizer']
    model = mdict['model']
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    sent_col = mdict['sentences']

    # Tokenize into per-model variables instead of rebinding the shared
    # `train_val` / `test_set`. Every model then starts from the same
    # untokenized splits, nothing produced by a previous tokenizer is carried
    # over, and the cell can be re-run safely.
    tokenized_train_val = train_val.map(preprocess_function, fn_kwargs={'target_col': sent_col})
    tokenized_test = test_set.map(preprocess_function, fn_kwargs={'target_col': sent_col})

    training_args = TrainingArguments(
        output_dir=f"../results_{name}",
        seed=42,
        learning_rate=1e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_val["train"],
        # The 'test' key of the train/validation split is the validation set.
        eval_dataset=tokenized_train_val["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # Save the fine-tuned model together with its tokenizer.
    model.save_pretrained(f'/datadrive_2/bnert-{name}-animacy')
    tokenizer.save_pretrained(f"/datadrive_2/bnert-{name}-animacy")

    # Evaluate on the held-out test set.
    predictions = trainer.predict(tokenized_test)
    preds = np.argmax(predictions.predictions, axis=-1)
    y_true = predictions.label_ids

    # scikit-learn metrics take (y_true, y_pred), in that order.
    # NOTE: average='binary' scores class id 1 and requires a two-class task.
    result_dict[name]['f1_binary'] = f1_score(y_true, preds, average='binary')
    result_dict[name]['f1_macro'] = f1_score(y_true, preds, average='macro')
    result_dict[name]['f1_micro'] = f1_score(y_true, preds, average='micro')
    result_dict[name]['accuracy'] = accuracy_score(y_true, preds)

In [None]:
# Build a table with one row per model name and one column per metric
# (orient='index' uses the outer dict keys as the row index).
results_df = pd.DataFrame.from_dict(result_dict, orient='index')

In [None]:
# Print the results table as LaTeX source, rounded to three decimals.
print(results_df.round(3).to_latex())

# Fin.