In [1]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import evaluate

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 




In [2]:
def passage_uploader(data_label):
    # load OLD and new datasets (THIS IS WHERE YOU WOULD ENTER IN THE DATA YOU WANTED TO TEST!)
    df_old = pd.read_excel("../RA_Cleaning/Culture_Coding_old.xlsx", header=[0,1], index_col=0)
    df_current = pd.read_excel("../RA_Cleaning/Culture_Coding.xlsx", header=[0,1], index_col=0)

    # Remove the runs not of the first from the datasets (this will be made superfluous later but nonetheless is an extra assuredness step)
    # df_old = df_old.loc[df_old[("CODER","Run_Number")]==1] #if this had "Run_Number" column, you would uncomment and run this line
    df_current = df_current.loc[df_current[("CODER","Run_Number")]==1]

    # only get new rows that have NOT been trained/tested on before
    df_new = pd.concat([df_current, df_old])
    df_new = df_new[~df_new.duplicated(subset=("CULTURE","Passage Number"), keep=False)]


    # subdivide into just passage and outcome
    df_small = pd.DataFrame()
    df_small[["passage","label"]] = df_new[[('CULTURE', "Passage"), (data_label, "No_Info")]]
    # Flip the lable of "no_info"
    df_small["label"] = df_small['label'].replace({0:1, 1:0})



    # Create an NLP friendly dataset
    Hraf = Dataset.from_dict(df_small.to_dict(orient= 'list'))
    return Hraf

In [3]:
from transformers import pipeline, AutoTokenizer


def predictor(Hraf, classifier):
    tokenizer_kwargs = {'padding':True,'truncation':True,'max_length':512}
    HrafOutput = []
    for text in Hraf:
        # text = Hraf[text]
        score = classifier(text['passage'], **tokenizer_kwargs)
        score[0]["actual_label"] = text['label']
        score[0]["passage"] = text['passage']

        # change the predicted label into a digit that we can compare
        if score[0]['label'] == 'PRESENT':
            score[0]['label'] = 1
        elif score[0]['label'] == 'ABSENT':
            score[0]['label'] = 0
        else:
            score[0]['label'] = 9


        # score[0][("actual_label", 'passage')] = text['passage'], text['label']
        HrafOutput.append(score[0])

    return HrafOutput

In [4]:
def f1_score(HrafOutput):
    # loading precision
    precision = evaluate.load('precision')
    result = precision.compute(predictions=[x['label'] for x in HrafOutput], references=[x['actual_label'] for x in HrafOutput])
    # print(result)

    # f1 score
    from sklearn.metrics import f1_score, accuracy_score
    accuracy = round(accuracy_score([x['actual_label'] for x in HrafOutput], [x['label'] for x in HrafOutput]),3)
    f1 = round(f1_score([x['actual_label'] for x in HrafOutput], [x['label'] for x in HrafOutput], average = 'binary'),3)
    f1_micro = round(f1_score([x['actual_label'] for x in HrafOutput], [x['label'] for x in HrafOutput], average='micro'),3)
    f1_macro = round(f1_score([x['actual_label'] for x in HrafOutput], [x['label'] for x in HrafOutput], average='macro'),3)
    results = f'Accuracy  {accuracy}\nF1 score (base)  {f1}\nF1 score (micro) {f1_micro}\nF1 score (macro) {f1_macro}'
    return results

In [5]:
def HRAF_inference(training_label, data_label):
    classifier = pipeline("text-classification", model=f"Chantland/HRAF_{training_label}_Demo", use_auth_token="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", tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"))
    Hraf = passage_uploader(data_label)
    HrafOutput = predictor(Hraf, classifier)
    results = f1_score(HrafOutput)
    return results

In [8]:
from transformers import pipeline, AutoTokenizer

# # uncomment based on if you want to do EVENT, CAUSE, or, ACTION
training_label = 'EVENT'
data_label = 'EVENT'
# training_label = 'CAUSE'
# training_label = 'ACTION'

print('EVENT data using EVENT model\n', HRAF_inference(data_label='EVENT', training_label='EVENT'),'\n\n\n')
print('EVENT data using CAUSE model\n', HRAF_inference(data_label='EVENT', training_label='CAUSE'),'\n\n\n')
print('CAUSE data using CAUSE model\n', HRAF_inference(data_label='CAUSE', training_label='CAUSE'),'\n\n\n')
print('CAUSE data using EVENT model\n', HRAF_inference(data_label='CAUSE', training_label='EVENT'),'\n\n\n')

EVENT data using EVENT model
 Accuracy  0.914
F1 score (base)  0.935
F1 score (micro) 0.914
F1 score (macro) 0.905 



EVENT data using CAUSE model
 Accuracy  0.85
F1 score (base)  0.889
F1 score (micro) 0.85
F1 score (macro) 0.829 



CAUSE data using CAUSE model
 Accuracy  0.786
F1 score (base)  0.819
F1 score (micro) 0.786
F1 score (macro) 0.778 



CAUSE data using EVENT model
 Accuracy  0.779
F1 score (base)  0.807
F1 score (micro) 0.779
F1 score (macro) 0.773 





In [None]:
# # uncomment based on if you want to do EVENT, CAUSE, or, ACTION
training_label = 'EVENT'
data_label = 'EVENT'
# training_label = 'CAUSE'
# training_label = 'ACTION'