In [1]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset, concatenate_datasets
import evaluate

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split 


In [2]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub. If the widget below does not work, run
# `huggingface-cli login` in a terminal and paste your own read token there
# (the pasted token may not be echoed; paste it anyway and press Enter).
# SECURITY: an access token used to be written out in this comment and has been
# redacted. Tokens must never be stored in a notebook -- revoke the exposed one
# and keep its replacement outside version control.


notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# load a list of passages and predict them (will take about .25 seconds per passage for me so beware the wait)
def predictor(data, labels, tokenizer_kwargs, classifier, threshold=0.5):
    """Run ``classifier`` on every passage and binarize its score for each label.

    Args:
        data: Iterable of row mappings. Each row must provide ``'passage'``,
            ``'ID'`` and one entry per name in ``labels``.
        labels: Ordered label names; they fix the order of both output vectors.
        tokenizer_kwargs: Keyword arguments forwarded to every ``classifier`` call.
        classifier: Callable invoked as ``classifier(passage, **tokenizer_kwargs)``.
            Element ``[0]`` of its result must be an iterable of dicts with
            ``'label'`` and ``'score'`` keys.
        threshold: Inclusive minimum score for a label to be predicted as 1.
            Defaults to 0.5, the cut-off that was previously hard-coded.

    Returns:
        list[dict]: One dict per row with the keys ``pred_labels`` (0/1 per
        label), ``actual_labels``, ``passage`` and ``ID``.

    Raises:
        KeyError: If a row lacks a required field, or the classifier returns no
            score for one of ``labels``.
    """
    dataOutput = []
    for text in data:
        # Ground truth, in the same order as `labels`.
        actual_labels = [text[label] for label in labels]
        prediction = classifier(text['passage'], **tokenizer_kwargs)

        # Index the scores by label name so lookups do not depend on result order.
        scores = {item['label']: item['score'] for item in prediction[0]}
        pred_labels = [1 if scores[label] >= threshold else 0 for label in labels]

        dataOutput.append({
            "pred_labels": pred_labels,
            "actual_labels": actual_labels,
            "passage": text['passage'],
            "ID": text['ID'],
        })
    return dataOutput

# Get F1 scores
def score(dataOutput, labels):
    """Compute per-label, micro and macro F1 for the output of ``predictor``.

    Args:
        dataOutput: List of dicts, each with ``'actual_labels'`` and
            ``'pred_labels'`` vectors ordered like ``labels``.
        labels: Ordered label names; each one yields a ``<label>_F1`` column.

    Returns:
        pandas.DataFrame: A single row indexed ``'NLP'`` with one ``<label>_F1``
        column per label followed by ``Micro_F1`` and ``Macro_F1``; every value
        is rounded to 3 decimals.
    """
    # scikit-learn is imported here rather than at the top of the notebook.
    from sklearn.metrics import f1_score

    # Convert once instead of on every loop iteration: rows are passages,
    # columns follow the order of `labels`.
    y_true = np.array([row['actual_labels'] for row in dataOutput])
    y_pred = np.array([row['pred_labels'] for row in dataOutput])

    df_score = pd.DataFrame(index=['NLP'], columns=[label + "_F1" for label in labels] + ["Micro_F1", "Macro_F1"])
    for index, label in enumerate(labels):
        f1 = round(f1_score(y_true=y_true[:, index], y_pred=y_pred[:, index]), 3)
        df_score.at['NLP', label + "_F1"] = f1

    # Aggregate scores over the whole label matrix.
    f1_micro = round(f1_score(y_true=y_true, y_pred=y_pred, average='micro'), 3)
    f1_macro = round(f1_score(y_true=y_true, y_pred=y_pred, average='macro'), 3)
    df_score.at['NLP', "Micro_F1"] = f1_micro
    df_score.at['NLP', "Macro_F1"] = f1_macro
    return df_score

## Inference

### Load dataset

In [3]:
import json
loc = ""
# loc = "../HRAF_MultiLabel_ThreeLargeClasses/" #load old threemain class (comment this out unless you specifically are using it)

# Read the test split. The context manager closes the file even when json.load
# raises, which the previous manual open()/close() pair did not guarantee.
with open(loc+"Datasets/test_dataset.json") as f:
    data = json.load(f)
# To load the old threemain class instead, open
# "../HRAF_MultiLabel_ThreeLargeClasses/Datasets/test_dataset.json" above.
Hraf = Dataset.from_dict(data)
Hraf

Dataset({
    features: ['ID', 'passage', 'EVENT_Illness', 'EVENT_Accident', 'EVENT_Other', 'CAUSE_Material_Physical', 'CAUSE_Spirits_Gods', 'CAUSE_Witchcraft_Sorcery', 'CAUSE_Rule_Violation_Taboo', 'ACTION_Physical_Material', 'ACTION_Technical_Specialist', 'ACTION_Divination', 'ACTION_Shaman_Medium_Healer', 'ACTION_Priest_High_Religion'],
    num_rows: 2074
})

### Define Kwargs and Labels

In [4]:
# Options forwarded to the classifier on every call (padding, truncation, max_length).
tokenizer_kwargs = dict(padding=True, truncation=True, max_length=512)

# Options used when a pipeline is constructed.
# device: -1 for CPU, 0 or higher for GPU.
classifier_kwargs = dict(top_k=None, device=0)

# Every dataset column other than the identifier and the text is a label.
labels = [column for column in Hraf.features if column not in ('ID', 'passage')]
labels

['EVENT_Illness',
 'EVENT_Accident',
 'EVENT_Other',
 'CAUSE_Material_Physical',
 'CAUSE_Spirits_Gods',
 'CAUSE_Witchcraft_Sorcery',
 'CAUSE_Rule_Violation_Taboo',
 'ACTION_Physical_Material',
 'ACTION_Technical_Specialist',
 'ACTION_Divination',
 'ACTION_Shaman_Medium_Healer',
 'ACTION_Priest_High_Religion']

## Single Model Inference

Run this or the other model, not both

In [17]:
from transformers import pipeline, AutoTokenizer
import os

# CHANGE these two values to pick the fine-tuned model and the checkpoint to load.
model = "Model_3_LearningRates/Learning_Rate_1e-05_fold_1"
checkpoint_path = "checkpoint-9960"

# Build the pipeline from the local checkpoint directory.
path = os.path.abspath(os.path.join(model, checkpoint_path))
classifier = pipeline("text-classification", model=path, **classifier_kwargs)


# Sample inference: ENTER TEXT IN HERE.
text = '''
“Drinking-tubes made of the leg-bones of swans (Fig. 109) are 190 also used chiefly as a measure of precaution against diseases ‘subject to shunning.’....”
'''
# Classify the sample passage; the scores are displayed as the cell output.
prediction = classifier(text, **tokenizer_kwargs)
prediction

# # Demo other models (COMMENT THIS OUT UNLESS YOU REALLY WANT TO DEMO THIS)
# # Loads an analogous but different model from the online hub, for demonstration only.
# # SECURITY: the access token that was pasted here has been redacted; read it from the
# # environment instead of writing it into the notebook.
# classifier = pipeline("text-classification", top_k=None, model="Chantland/Hraf_MultiLabel", use_auth_token=os.environ["HF_TOKEN"], tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"))
# model = "MultiLabel_ThreeLargeClasses"

[[{'label': 'ACTION_Physical_Material', 'score': 0.9821813106536865},
  {'label': 'EVENT_Illness', 'score': 0.9040574431419373},
  {'label': 'EVENT_Accident', 'score': 0.019755030050873756},
  {'label': 'CAUSE_Spirits_Gods', 'score': 0.01311244536191225},
  {'label': 'CAUSE_Rule_Violation_Taboo', 'score': 0.008514720015227795},
  {'label': 'ACTION_Technical_Specialist', 'score': 0.007323103491216898},
  {'label': 'CAUSE_Material_Physical', 'score': 0.00530574144795537},
  {'label': 'EVENT_Other', 'score': 0.0029064505361020565},
  {'label': 'ACTION_Priest_High_Religion', 'score': 0.0024381738621741533},
  {'label': 'ACTION_Shaman_Medium_Healer', 'score': 0.002224983647465706},
  {'label': 'ACTION_Divination', 'score': 0.0006451642839238048},
  {'label': 'CAUSE_Witchcraft_Sorcery', 'score': 0.0003798917750827968}]]

### Predict The Dataset

In [18]:
# Predict the whole dataset (roughly .25 seconds per passage on the lab mac; your system may differ).
# The pipeline is called one passage at a time and may warn that this is unoptimized;
# passing the whole dataset at once did not appear to be faster, so it stays sequential.
HrafOutput = predictor(
    data=Hraf,
    labels=labels,
    tokenizer_kwargs=tokenizer_kwargs,
    classifier=classifier,
)
print(f"{len(HrafOutput)} passages Predicted")

2074 passages Predicted


In [17]:
# HrafOutput = classifier(Hraf['passage'],**tokenizer_kwargs)
# print(len(HrafOutput), "passages Predicted")

### Calculate "Correctness" Metrics

In [20]:
# Score the predictions: one F1 per label plus micro and macro F1, as a one-row frame.
df_score = score(dataOutput=HrafOutput, labels=labels)
df_score

Unnamed: 0,EVENT_Illness_F1,EVENT_Accident_F1,EVENT_Other_F1,CAUSE_Material_Physical_F1,CAUSE_Spirits_Gods_F1,CAUSE_Witchcraft_Sorcery_F1,CAUSE_Rule_Violation_Taboo_F1,ACTION_Physical_Material_F1,ACTION_Technical_Specialist_F1,ACTION_Divination_F1,ACTION_Shaman_Medium_Healer_F1,ACTION_Priest_High_Religion_F1,Micro_F1,Macro_F1
NLP,0.856,0.355,0.574,0.394,0.681,0.59,0.493,0.628,0.396,0.0,0.479,0.193,0.622,0.47


#### Add correctness to file

In [21]:

# Build this run's one-row score record and append it to the Excel log of past runs.
df_scoresSep = df_score.copy()
# Load the training split (plus the validation split when it exists) to record its size.
# `with` closes each file even if parsing fails; both handles used to be left open.
with open(loc+"Datasets/train_dataset.json") as f:
    data = json.load(f)
train = Dataset.from_dict(data)
if os.path.isfile(loc+"Datasets/validation_dataset.json"):
    with open(loc+"Datasets/validation_dataset.json") as f:
        data = json.load(f)
    valid = Dataset.from_dict(data)
    train = concatenate_datasets([train, valid])
# add lengths of test and training set
df_scoresSep[["test_length", "train_length"]] = (len(Hraf), len(train))
# add date
df_scoresSep.insert(0, "Date", [datetime.today().date()])
if loc == "":
    df_scoresSep['Notes'] = f"model: {model}/{checkpoint_path}, Dataset: {model}"
else:
    df_scoresSep['Notes'] = f"model: {model}/{checkpoint_path}, Dataset: {loc}"
# load Model_Prediction_Performance.xlsx or else create it
if os.path.isfile("Model_Prediction_Performance.xlsx"):
    df_oldScores = pd.read_excel("Model_Prediction_Performance.xlsx", index_col=0)
    df_oldScores_merged = pd.concat([df_scoresSep, df_oldScores])
    # Build the mask from the merged frame's own columns. Masking it with
    # df_scoresSep.columns fails whenever the saved workbook has a different
    # set of columns than the current run (e.g. another label set).
    nonDateCols = df_oldScores_merged.columns[df_oldScores_merged.columns != 'Date']
    if df_oldScores_merged.duplicated(subset=nonDateCols).any(): # don't append the data unless it is new
        print("Duplicated scores found, skipping new addition")
        df_scoresSep = df_oldScores.copy()
    else:
        df_scoresSep = df_oldScores_merged.copy()
        df_scoresSep['Date'] = df_scoresSep['Date'].astype('datetime64[ns]')
        df_scoresSep.to_excel("Model_Prediction_Performance.xlsx")
else:
    df_scoresSep['Date'] = df_scoresSep['Date'].astype('datetime64[ns]')
    df_scoresSep.to_excel("Model_Prediction_Performance.xlsx")
df_scoresSep

Unnamed: 0,Date,EVENT_Illness_F1,EVENT_Accident_F1,EVENT_Other_F1,CAUSE_Material_Physical_F1,CAUSE_Spirits_Gods_F1,CAUSE_Witchcraft_Sorcery_F1,CAUSE_Rule_Violation_Taboo_F1,ACTION_Physical_Material_F1,ACTION_Technical_Specialist_F1,ACTION_Divination_F1,ACTION_Shaman_Medium_Healer_F1,ACTION_Priest_High_Religion_F1,Micro_F1,Macro_F1,test_length,train_length,Notes
NLP,2024-05-29,0.856,0.355,0.574,0.394,0.681,0.59,0.493,0.628,0.396,0.0,0.479,0.193,0.622,0.47,2074,8293,model: Model_3_LearningRates/Learning_Rate_1e-...
NLP,2024-05-29,0.856,0.0,0.473,0.065,0.446,0.0,0.0,0.559,0.0,0.0,0.0,0.0,0.503,0.2,2074,8293,model: Model_2_ReducedCols/Weight_Decay_.01_fo...


## Checkpoint Multi-model Inference

This section runs over MANY models and checkpoints to see which is the strongest. Run it instead of the single-model section above; the two should NOT be run together (simply because they do different things).

In [6]:
# code for running through all checkpoints
# code for running through all checkpoints
import os
import pandas as pd
import re
import json
from transformers import pipeline, AutoTokenizer
def checkpointInfer(path, data, labels, tokenizer_kwargs, classifier_kwargs, folds=True, output_str="output_dir_", modelDestinctifier:str= "ModelDistinctifierUnknown"):
    """Score every checkpoint of every model directory found under ``path``.

    A model directory is any sub-directory of ``path`` whose name starts with
    ``output_str``; its checkpoints are the entries whose names start with
    ``"checkpoint"``. After each model is processed its rows are appended to
    ``{path}/Inference_Test.xlsx`` (sheet ``Sheet1``). Rows already in that
    workbook are kept, so repeating a run appends the same rows again.

    Args:
        path: Directory that contains the model directories.
        data: Rows to predict; forwarded to ``predictor``.
        labels: Ordered label names; forwarded to ``predictor`` and ``score``.
        tokenizer_kwargs: Keyword arguments for each classifier call.
        classifier_kwargs: Keyword arguments for ``pipeline(...)``.
        folds: When True, a ``fold_<n>`` number is parsed from the model
            directory name and stored in a ``Fold`` column.
        output_str: Prefix of the model directory names. The text between this
            prefix and the next underscore is stored as the model's
            hyper-parameter value.
        modelDestinctifier: Name of the column that holds that value.

    Returns:
        pandas.DataFrame: One row per checkpoint with the columns ``Checkpoint``,
        ``Model``, ``<modelDestinctifier>``, ``Fold`` (only when ``folds``) and
        the score columns. Empty when nothing was found.

    Raises:
        ValueError: If a checkpoint name has no ``checkpoint-<n>`` number, or
            ``folds`` is True and a model with checkpoints has no ``fold_<n>``
            in its name. Both are checked before any inference is run.
    """
    results_file = f"{path}/Inference_Test.xlsx"

    # Keep only directories whose name starts with the expected prefix.
    models = [name for name in os.listdir(path) if (name.startswith(output_str) and os.path.isdir(f"{path}/{name}"))]

    model_frames = []
    for model in models:
        checkpoints_dir = [checkpoint for checkpoint in os.listdir(f"{path}/{model}") if checkpoint.startswith("checkpoint")]

        # Hyper-parameter value embedded in the directory name. The prefix is
        # escaped so it is always matched literally.
        unit_match = re.search(f"{re.escape(output_str)}(.*?)_", model)
        if unit_match is None:
            modelDestinctifier_unit = None
        else:
            modelDestinctifier_unit = unit_match.group(1)
            try:
                modelDestinctifier_unit = float(modelDestinctifier_unit)
            except ValueError:
                pass  # not numeric: keep the raw text as the value

        # Parse the fold once per model, and only when there is something to score.
        fold = None
        if folds and checkpoints_dir:
            fold_match = re.search(r"fold_(\d+)", model)
            if fold_match is None:
                raise ValueError(f"folds=True but no 'fold_<number>' found in model directory name {model!r}")
            fold = int(fold_match.group(1))

        checkpoint_frames = []
        for checkpoint in checkpoints_dir:
            # Validate the name before spending time on inference.
            checkpoint_match = re.search(r"checkpoint-(\d+)", checkpoint)
            if checkpoint_match is None:
                raise ValueError(f"No checkpoint number found in {checkpoint!r} (expected 'checkpoint-<number>')")
            checkpoint_num = int(checkpoint_match.group(1))

            # set up the pipeline from local
            model_path = os.path.abspath(f"{path}/{model}/{checkpoint}")
            classifier = pipeline("text-classification", model=model_path, **classifier_kwargs)
            # Get predictions and scores; the 'NLP' index label is dropped.
            dataOutput = predictor(data, labels=labels, tokenizer_kwargs=tokenizer_kwargs, classifier=classifier)
            df_checkpoint = score(dataOutput, labels).reset_index(drop=True)

            # Insertion order yields: Checkpoint, Model, <distinctifier>, Fold, scores...
            df_checkpoint.insert(0, modelDestinctifier, modelDestinctifier_unit)
            if folds:
                df_checkpoint.insert(1, "Fold", fold)
            df_checkpoint.insert(0, "Model", model)
            df_checkpoint.insert(0, "Checkpoint", checkpoint_num)
            checkpoint_frames.append(df_checkpoint)
            print(model, checkpoint, "Complete")

        # Concatenate once per model instead of growing a frame inside the loop.
        df_model = pd.concat(checkpoint_frames) if checkpoint_frames else pd.DataFrame([])
        model_frames.append(df_model)

        # Save after every model so a crash keeps the finished work; earlier
        # workbook content is preserved by appending to it.
        if os.path.exists(results_file):
            old_df = pd.read_excel(results_file, sheet_name="Sheet1", index_col=0)
            df_to_save = pd.concat([old_df, df_model])
        else:
            df_to_save = df_model
        df_to_save.to_excel(results_file, sheet_name="Sheet1")
        print(model, "Successfully Saved")

    return pd.concat(model_frames) if model_frames else pd.DataFrame([])


            




# output_str="output_dir_"

# model = "MultiLabel_ThreeLargeClasses_kfoldsDEMO_WeightInvestigation"
# path =os.path.abspath(f"HRAF_Model_{model}")
# x = [name for name in os.listdir(path) if (name.startswith("output_dir_") and os.path.isdir(f"{path}/{name}"))]
# # x
# modelDestinctifier_unit = re.findall(f"{output_str}(.*?)_",x[1])
# try:
#     modelDestinctifier_unit = float(modelDestinctifier_unit)
# except:
#     pass

In [11]:
#This code will take a LONG time depending on how many models you have. It is recommended to use a GPU
# os.path.join keeps the path relative when `loc` is empty. The previous
# loc+"/Model_3_LearningRates" turned into "/Model_3_LearningRates", an absolute
# path at the filesystem root, whenever loc == "".
path = os.path.join(loc, "Model_3_LearningRates")
output_str = "Learning_Rate_"
modelDestinctifier = "Learning_Rate"

df_allScores = checkpointInfer(path=path, data=Hraf, labels=labels, tokenizer_kwargs=tokenizer_kwargs,  classifier_kwargs=classifier_kwargs, folds=True, output_str=output_str, modelDestinctifier= modelDestinctifier)
df_allScores

Weight_Decay_.01_fold_1 checkpoint-26430 Complete
Weight_Decay_.01_fold_1 Successfully Saved


Unnamed: 0,Checkpoint,Weight_Decay,Fold,EVENT_Illness_F1,EVENT_Accident_F1,EVENT_Other_F1,CAUSE_Just_Happens_F1,CAUSE_Material_Physical_F1,CAUSE_Spirits_Gods_F1,CAUSE_Witchcraft_Sorcery_F1,CAUSE_Rule_Violation_Taboo_F1,CAUSE_Other_F1,ACTION_Physical_Material_F1,ACTION_Technical_Specialist_F1,ACTION_Divination_F1,ACTION_Shaman_Medium_Healer_F1,ACTION_Priest_High_Religion_F1,ACTION_Other_F1,Micro_F1,Macro_F1
0,26430,0.01,1,0.865,0.0,0.494,0.0,0.059,0.591,0.0,0.0,0.0,0.567,0.0,0.0,0.046,0.0,0.0,0.501,0.175


In [32]:
# This code will take a LONG time depending on how many models you have. It is recommended to use a GPU.
path = "HRAF_Model_MultiLabel_ThreeLargeClasses_kfoldsDEMO_WeightInvestigation"

# Score every checkpoint of every "output_dir_*" model under `path`.
df_allScores = checkpointInfer(
    path=path,
    data=Hraf,
    labels=labels,
    tokenizer_kwargs=tokenizer_kwargs,
    classifier_kwargs=classifier_kwargs,
    folds=True,
    output_str="output_dir_",
    modelDestinctifier="Weight_Decay",
)
df_allScores

output_dir_0.0001_fold_1 checkpoint-1516 Complete
output_dir_0.0001_fold_1 checkpoint-2274 Complete
output_dir_0.0001_fold_1 checkpoint-3032 Complete
output_dir_0.0001_fold_1 checkpoint-3790 Complete
output_dir_0.0001_fold_1 checkpoint-758 Complete
output_dir_0.0001_fold_1 Successfully Saved
output_dir_0.0001_fold_2 checkpoint-1516 Complete
output_dir_0.0001_fold_2 checkpoint-2274 Complete
output_dir_0.0001_fold_2 checkpoint-3032 Complete
output_dir_0.0001_fold_2 checkpoint-3790 Complete
output_dir_0.0001_fold_2 checkpoint-758 Complete
output_dir_0.0001_fold_2 Successfully Saved
output_dir_0.0001_fold_3 checkpoint-1516 Complete
output_dir_0.0001_fold_3 checkpoint-2274 Complete
output_dir_0.0001_fold_3 checkpoint-3032 Complete
output_dir_0.0001_fold_3 checkpoint-3790 Complete
output_dir_0.0001_fold_3 checkpoint-758 Complete
output_dir_0.0001_fold_3 Successfully Saved
output_dir_0.0001_fold_4 checkpoint-1516 Complete
output_dir_0.0001_fold_4 checkpoint-2274 Complete
output_dir_0.0001_fol

Unnamed: 0,Checkpoint,Weight_Decay,Fold,EVENT_F1,CAUSE_F1,ACTION_F1,Micro_F1,Macro_F1
0,1516,0.000100,1,0.917,0.816,0.806,0.854,0.846
0,2274,0.000100,1,0.919,0.816,0.8,0.852,0.845
0,3032,0.000100,1,0.914,0.815,0.802,0.85,0.844
0,3790,0.000100,1,0.914,0.812,0.794,0.847,0.84
0,758,0.000100,1,0.91,0.776,0.794,0.837,0.826
...,...,...,...,...,...,...,...,...
0,1456,0.000001,5,0.961,0.923,0.928,0.94,0.937
0,2184,0.000001,5,0.968,0.935,0.936,0.948,0.946
0,2912,0.000001,5,0.972,0.94,0.931,0.95,0.948
0,3640,0.000001,5,0.972,0.942,0.941,0.953,0.951


## Optional File save

In [104]:
# HrafOutput

In [15]:
# Optionally prepare the predictions for export: name each label and attach token ids.
from transformers import AutoTokenizer
import copy

# Work on a deep copy so HrafOutput keeps its list-style label vectors.
HrafOutput_dummy = copy.deepcopy(HrafOutput)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    """Tokenize the ``passage`` field of a batch with truncation enabled."""
    return tokenizer(examples["passage"], truncation=True)

tokenized_Hraf = Hraf.map(preprocess_function, batched=True)

for index, passage in enumerate(HrafOutput_dummy):
    tokenized_row = tokenized_Hraf[index]
    # Predictions and tokenized rows must line up one-to-one.
    assert passage['passage'] == tokenized_row['passage']
    # Turn each positional label vector into a {label name: value} mapping.
    for field in ('pred_labels', 'actual_labels'):
        values = passage[field]
        passage[field] = {key: values[position] for position, key in enumerate(labels)}
    passage['input_ids'] = tokenized_row['input_ids']

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Map:   0%|          | 0/728 [00:00<?, ? examples/s]

In [18]:
import json
# Write the enriched predictions to an unformatted JSON file.
output_path = "Datasets/tokenized_inputs.json"
with open(output_path, "w") as handle:
    json.dump(HrafOutput_dummy, handle)


# # Save to Dataset (uncomment)
# HrafOutput_dummy_dataset = Dataset.from_list(HrafOutput_dummy)
# Dataset.to_json(HrafOutput_dummy_dataset, f"Datasets/tokenized_Hraf")

## Chi-Square

In [41]:
from scipy.stats import chi2_contingency

# Cross-tabulate the EVENT "No_Info" column against the CAUSE "No_Info" column.
# The row axis was previously labelled 'ACTION' although its values come from EVENT.
# NOTE(review): `df` is not created anywhere in the visible notebook; it must be a
# frame with (category, 'No_Info') columns -- confirm where it is loaded.
ct_EVENT_CAUSE = pd.crosstab(df[('EVENT','No_Info')], df[('CAUSE','No_Info')], rownames=['EVENT'], colnames=['CAUSE'])
ct_EVENT_CAUSE

array([[1167,  351],
       [  49,  183]], dtype=int64)

In [119]:
def chi_square_calc(row, col, data=None):
    """Run a chi-square test of independence on two ``No_Info`` columns.

    Args:
        row: First-level column name used for the rows of the contingency table.
        col: First-level column name used for its columns.
        data: Frame holding the ``(row, 'No_Info')`` and ``(col, 'No_Info')``
            columns. Defaults to the notebook-level ``df`` so existing
            two-argument calls keep working.

    Returns:
        str: A report with the statistic rounded to 1 decimal and the p-value
        rounded to 3 decimals (so very small p-values print as ``0.0``).
    """
    frame = df if data is None else data
    cross_tab = pd.crosstab(frame[(row,'No_Info')], frame[(col,'No_Info')], rownames=[row], colnames=[col])
    # Degrees of freedom and expected counts are returned too but not reported.
    stat, p, dof, expected = chi2_contingency(cross_tab)
    results = f"{row} by {col}:\nchi: {round(stat,1)}\np:   {round(p,3)}\n\n"
    return results

# Pairs of categories to test for independence, as (row, column).
group_list = [('EVENT', 'CAUSE'), ('EVENT', 'ACTION'), ('ACTION', 'CAUSE')]
for row, col in group_list:
    report = chi_square_calc(row, col)
    print(report)

EVENT by CAUSE:
chi: 292.4
p:   0.0


EVENT by ACTION:
chi: 103.3
p:   0.0


ACTION by CAUSE:
chi: 0.0
p:   0.857




In [44]:
def chi_sqr(obs):
    """Return the per-cell chi-square contributions of a 2-D contingency table.

    For every cell the expected count is ``row_total * column_total / grand_total``
    and the contribution is ``(observed - expected) ** 2 / expected``. Summing
    the returned array gives the chi-square statistic without any continuity
    correction.

    Args:
        obs: 2-D array-like of observed counts.

    Returns:
        numpy.ndarray: Float array with the same shape as ``obs``.
    """
    # The earlier version read the notebook-level `x` here instead of `obs`,
    # so the argument was ignored when computing the expected counts.
    observed = np.asarray(obs, dtype=float)
    row_totals = observed.sum(axis=1, keepdims=True)
    col_totals = observed.sum(axis=0, keepdims=True)
    expected = row_totals * col_totals / observed.sum()
    return (observed - expected) ** 2 / expected

# Total chi-square statistic: the sum of all per-cell contributions.
# NOTE(review): `x` is not defined anywhere in the visible notebook; presumably it is
# a contingency table as a NumPy array -- confirm and define it before this cell.
print(chi_sqr(x).sum())
