<a href="https://colab.research.google.com/github/Luensmann/Bachelorarbeit/blob/main/Evaluation/Eval_Variome.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from transformers import AutoTokenizer, BertForTokenClassification, AutoModelForTokenClassification
from datasets import load_dataset

In [None]:
# labels

# Single source of truth: a label's position in this list is its class id.
labels = [
    'O',
    'B-disease', 'I-disease',
    'B-body-part', 'I-body-part',
    'B-mutation', 'I-mutation',
    'B-Physiology', 'I-Physiology',
    'B-cohort-patient', 'I-cohort-patient',
    'B-size', 'I-size',
    'B-gender', 'I-gender',
    'B-age', 'I-age',
    'B-Concepts_Ideas', 'I-Concepts_Ideas',
    'B-Disorder', 'I-Disorder',
    'B-gene', 'I-gene',
    'B-Phenomena', 'I-Phenomena',
    'B-ethnicity', 'I-ethnicity',
]

# id -> label and label -> id lookups, derived from `labels` so the three
# structures cannot drift apart when a label is added or renamed.
id2label = dict(enumerate(labels))
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [None]:
# Load the evaluation dataset by its repository id (presumably resolved on the
# Hugging Face Hub by `datasets.load_dataset` - confirm). The bare `dataset`
# expression on the last line shows its summary as the cell output.
dataset = load_dataset("Brizape/Variome_tokenized_split_0404_dev")
dataset

In [None]:
# Input texts of the test split; these are what the model predicts on.
texts = [example["texts"] for example in dataset["test"]]
print(len(texts))

# Gold (true) label ids of the test split, with the first and last position of
# every sequence dropped (presumably the special-token slots - confirm).
gold_id_all = [example["labels"][1:-1] for example in dataset["test"]]
print(len(gold_id_all))

# The same gold annotations, translated from ids into label strings.
gold_label_all = [[id2label[label_id] for label_id in sequence] for sequence in gold_id_all]
print(len(gold_label_all))

In [None]:
# Suffix of the fine-tuned checkpoint; also reused in the names of the exported files.
saveName = "5e-05_0404_ES6_strict_tok"
# Keep the repository id in its own variable so that `model` only ever refers
# to the loaded model object, never to a string.
checkpoint = "Brizape/Variome_" + saveName
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

In [None]:
# gold_id_all - expected outputs in id format
# gold_label_all - expected outputs in label format
# pred_label_all - all predicted labels in label format
# pred_id_all - all predicted labels in id format

# after deleting all mismatches from both lists:
# gold_id - expected outputs in id format
# gold_label - expected outputs in label format
# pred_label - all predicted labels in label format
# pred_id - all predicted labels in id format

# gold_label_norm - normalisation; prefix removed
# pred_label_norm - normalisation; prefix removed

# y_true - for sklearn, no sublists
# y_pred - for sklearn, no sublists

In [None]:
def predict(text):
    """Return the predicted label string for every token of ``text``.

    ``text`` is encoded with the ``tokenizer`` from the enclosing scope
    (no special tokens added, truncated to at most 512 tokens) and scored
    with the ``model`` from the enclosing scope.

    Returns:
        A list with one label string per token, looked up in
        ``model.config.id2label``. An empty list is returned, after printing
        a notice, when the tokenizer yields no tokens.
    """
    encoded = tokenizer(text, add_special_tokens=False, return_tensors="pt", truncation=True, max_length=512)

    # Guard clause: nothing to classify.
    if encoded['input_ids'].numel() == 0:
        print("input_ids is empty")
        return []

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        token_logits = model(**encoded).logits

    # Highest-scoring class per token; index 0 picks the first sequence of the batch.
    best_class_ids = token_logits.argmax(dim=2)[0].tolist()
    return [model.config.id2label[class_id] for class_id in best_class_ids]

In [None]:
# Run the model over every text, echoing the running index as a progress indicator.
pred_label_all = []
for index, text in enumerate(texts):
    print(index, end=' ')
    predicted_labels = predict(text)
    pred_label_all.append(predicted_labels)
# Number of predicted sequences, shown as the cell output.
len(pred_label_all)


In [None]:
# Align the predictions of truncated inputs with the gold labels.
# predict() encodes with add_special_tokens=False, so a 512-long prediction
# holds 512 real tokens and no special-token slots. Slicing [1:-1] would drop
# the first real token and shift every remaining prediction by one position;
# dropping the last two instead yields the same length (510) without the shift.
# NOTE(review): assumes the gold labels of truncated examples cover the first
# 510 tokens (512 including two special tokens) - confirm against the dataset.
for i, pred in enumerate(pred_label_all):
    if len(pred) == 512:
        pred_label_all[i] = pred[:-2]
pred_label_all

In [None]:
# Convert every predicted label string into its class id.
pred_id_all = [[label2id[tag] for tag in sentence] for sentence in pred_label_all]
len(pred_id_all)

In [None]:
# Check that every gold/pred pair has the same length.
# Returns a list with the indices of all mismatches.
def check_len(gold, pred):
    """Return the indices at which the gold and predicted sequences differ in length.

    Args:
        gold: list of gold sequences (one list per example).
        pred: list of predicted sequences, parallel to ``gold``.

    Returns:
        The list of positions ``i`` where ``len(gold[i]) != len(pred[i])``.
        The list is also printed, prefixed with ``"indices to del:"``.

    Pairs are formed with ``zip``, so if one outer list is longer than the
    other, its surplus entries are not checked.
    """
    # Distinct loop names: the parameters `gold` and `pred` are no longer
    # shadowed by the per-example sequences.
    missmatch_index = [
        index
        for index, (gold_seq, pred_seq) in enumerate(zip(gold, pred))
        if len(gold_seq) != len(pred_seq)
    ]
    print("indices to del:" + str(missmatch_index))
    return missmatch_index

In [None]:
# Delete all mismatches from both lists.
def del_missmatches(list1, list2, indices_to_del):
    """Delete the given positions from ``list1`` and ``list2`` in place.

    Args:
        list1: first list to shrink.
        list2: second list to shrink, parallel to ``list1``.
        indices_to_del: positions to remove from both lists.

    Returns:
        None; both lists are modified in place.
    """
    # Deduplicate first: a repeated index would otherwise delete a second,
    # unrelated element. Highest index first, so that the positions still to
    # be deleted are not shifted by earlier deletions.
    for index in sorted(set(indices_to_del), reverse=True):
        del list1[index]
        del list2[index]

In [None]:
# Work on shallow copies so that the *_all lists keep every example.
gold_id = gold_id_all[:]
gold_label = gold_label_all[:]
pred_id = pred_id_all[:]
pred_label = pred_label_all[:]
# Named so that the builtins `all` and `list` are not shadowed for later cells.
working_lists = [gold_id, gold_label, pred_id, pred_label]
print('len befor:')
for working_list in working_lists:
    print(len(working_list))
# Get the indices whose gold and predicted sequences differ in length
# and delete them from the id lists and from the label lists alike.
indices_to_del = check_len(gold_id, pred_id)
del_missmatches(gold_id , pred_id, indices_to_del)
del_missmatches(gold_label , pred_label, indices_to_del)
print('len after:')
for working_list in working_lists:
    print(len(working_list))
    


In [None]:
############################################################
######################### seqeval ##########################
############################################################

In [None]:
from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score
from seqeval.scheme import IOB2

In [None]:
# Print the seqeval reports: strict IOB2 matching first, then the default mode.
rule = "-" * 60

print("\n")
print(rule + "\n" + "    SeqEval strict\n" + rule)
strict_text = classification_report(gold_label, pred_label, scheme=IOB2, mode='strict')
print("strict: \n" + strict_text)

print("\n")
print(rule + "\n" + "    SeqEval default\n" + rule)
default_text = classification_report(gold_label, pred_label)
print("default: \n" + default_text)

In [None]:
# Save the strict seqeval report as an Excel sheet.
import pandas as pd
# The scheme is passed explicitly so that this report is computed with the same
# arguments as the strict report printed earlier in the notebook.
report = pd.DataFrame(
    classification_report(gold_label, pred_label, scheme=IOB2, mode='strict', digits=2, output_dict=True)
).transpose()
print(report)
# '\\ ' spells out the backslash that the former '\ ' (an invalid escape
# sequence) produced, so the file name on disk is unchanged.
report.to_excel('variome\\ ' + saveName + '.xlsx', index=True)

In [None]:
#######################################################
####################### sklearn #######################
#######################################################

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import pandas as pd
import seaborn as sns
import evaluate

In [None]:
# Flatten the per-example predicted ids into one list (sklearn expects no sublists).
y_pred = [class_id for sequence in pred_id for class_id in sequence]
len(y_pred)

In [None]:
# Flatten the per-example gold ids into one list (sklearn expects no sublists).
y_true = [class_id for sequence in gold_id for class_id in sequence]
len(y_true)

In [None]:
# Show which class ids occur in the predictions and in the gold labels.
for flat_ids in (y_pred, y_true):
    print(unique_labels(flat_ids))

In [None]:
# Token-level confusion matrix over all 27 class ids (rows: gold, columns: predicted).
confusion_matrix(y_true, y_pred, labels=[*range(27)])

In [None]:
def plot(freal, predicted_token_class, label_list):
    """Build the confusion matrix as a DataFrame labelled with class names.

    Args:
        freal: flat sequence of gold class ids.
        predicted_token_class: flat sequence of predicted class ids.
        label_list: class names; the position of a name is taken as its class id.

    Returns:
        A ``pandas.DataFrame`` holding
        ``confusion_matrix(freal, predicted_token_class)`` with one row and
        one column per entry of ``label_list``.
    """
    # Use the arguments rather than the notebook globals, and size the matrix
    # from the label list instead of a hard-coded 27.
    class_ids = [*range(len(label_list))]
    names = [f'{label}' for label in label_list]
    matrix = confusion_matrix(freal, predicted_token_class, labels=class_ids)
    return pd.DataFrame(matrix, columns=names, index=names)

In [None]:
# Store the result under its own name: rebinding `plot` would replace the
# function with a DataFrame and make this cell fail when it is run again.
confusion_table = plot(y_true, y_pred, labels)
print(confusion_table)
df = pd.DataFrame(confusion_table)
# '\\ ' spells out the backslash that the former '\ ' (an invalid escape
# sequence) produced, so the file name on disk is unchanged.
df.to_excel('variome\\ ' + saveName + 'confusion_.xlsx', index=True)

In [None]:
def plot_heat(freal, predicted_token_class, label_list):
    """Draw the confusion matrix as an annotated seaborn heatmap.

    Args:
        freal: flat sequence of gold class ids.
        predicted_token_class: flat sequence of predicted class ids.
        label_list: class names; the position of a name is taken as its class id.

    Returns:
        The object returned by ``sns.heatmap`` for the plotted matrix.
    """
    # Use the arguments rather than the notebook globals, and size the matrix
    # from the label list instead of a hard-coded 27.
    class_ids = [*range(len(label_list))]
    names = [f'{label}' for label in label_list]
    matrix = confusion_matrix(freal, predicted_token_class, labels=class_ids)
    table = pd.DataFrame(matrix, columns=names, index=names)
    sns.set(rc={'figure.figsize': (12, 12)})
    # vmax=150 caps the colour scale at 150; the annotated counts are unaffected.
    return sns.heatmap(table, annot=True, fmt='d', linewidth=.1, vmax=150, cmap='YlOrBr', cbar_kws=dict(use_gridspec=False, location="top"))

In [None]:
# Draw the heatmap, name its axes and write the figure to disk.
ax = plot_heat(y_true, y_pred, labels)
ax.set_xlabel("Prediction")
ax.set_ylabel("Actual")
heatmap_figure = ax.get_figure()
heatmap_figure.savefig("output.png")

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-label, token-level sklearn report: printed and exported to Excel.
class_ids = [*range(27)]
report = classification_report(y_true, y_pred, labels=class_ids, target_names=labels, output_dict=True)
print(report)
df = pd.DataFrame(report).transpose()
print("\n")
print("-"*60 + "\n" + "    Label Report\n" + "-"*60)
print(df)
# DataFrame.append was removed in pandas 2.0. Appending `df` to an empty frame
# with ignore_index=True amounted to renumbering its rows, which reset_index
# does directly.
# NOTE(review): as before, this drops the label names from the exported index -
# confirm that is intended, otherwise export `df` itself.
exl = df.reset_index(drop=True)
# '\\ ' spells out the backslash that the former '\ ' (an invalid escape
# sequence) produced, so the file name on disk is unchanged.
exl.to_excel('variome\\ ' + saveName + '_labelReport.xlsx', index=True)

In [None]:
# Print every remaining example token by token with its (predicted, gold)
# label pair; disagreements are followed by a line of '#'.
# pred_label and gold_label no longer contain the examples listed in
# indices_to_del, so the same examples are skipped here to keep the token ids
# aligned with the labels.
removed_examples = set(indices_to_del)
kept_input_ids = [
    ids
    for example_index, ids in enumerate(dataset['test']['input_ids'])
    if example_index not in removed_examples
]
for index, (pred, gold, inIDs) in enumerate(zip(pred_label, gold_label, kept_input_ids)):
    # Drop the first and last id, mirroring how the gold labels were sliced.
    token_ids = inIDs[1:-1]
    print(index, len(pred), len(token_ids), end="\n")
    for position, (label1, label2, inID) in enumerate(zip(pred, gold, token_ids)):
        print(position, tokenizer.decode(inID), ":(" + str(label1), ",", label2 + ')')
        if label1 != label2:
            print("#########################")
    print(" ")

0 6 6
0 * :(O , O)
1 * :(O , O)
2 ignore :(O , O)
3 line :(O , O)
4 * :(O , O)
5 * :(O , O)
 
1 6 6
0 * :(O , O)
1 * :(O , O)
2 ignore :(O , O)
3 line :(O , O)
4 * :(O , O)
5 * :(O , O)
 
2 6 6
0 * :(O , O)
1 * :(O , O)
2 ignore :(O , O)
3 line :(O , O)
4 * :(O , O)
5 * :(O , O)
 
3 1 1
0 discussion :(O , O)
 
4 276 276
0 the :(O , O)
1 apc :(O , O)
2 mutations :(O , O)
3 identified :(O , O)
4 in :(O , O)
5 the :(O , O)
6 swedish :(O , O)
7 patients :(O , O)
8 are :(O , O)
9 scattered :(O , O)
10 along :(O , O)
11 the :(O , O)
12 apc :(O , O)
13 gene :(O , O)
14 ( :(O , O)
15 figure :(O , O)
16 1 :(O , O)
17 ) :(O , O)
18 . :(O , O)
19 the :(O , O)
20 most :(O , O)
21 5 :(O , O)
22 ' :(O , O)
23 situated :(O , O)
24 pathogenic :(O , O)
25 germ :(O , O)
26 line :(O , O)
27 apc :(O , O)
28 mutation :(O , O)
29 identified :(O , O)
30 in :(O , O)
31 this :(O , O)
32 study :(O , O)
33 , :(O , O)
34 in :(O , O)
35 codon :(O , O)
36 24 :(O , O)
37 of :(O , O)
38 exon :(O , O)
39 1 :(O , O)
40