-
Notifications
You must be signed in to change notification settings - Fork 0
/
evaluate_classifiers.py
95 lines (76 loc) · 3.58 KB
/
evaluate_classifiers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import os
import torch
import pickle
import numpy as np
from tqdm.auto import tqdm
from collections import Counter
from transformers import pipeline
from transformers import AutoTokenizer
from sklearn.metrics import classification_report
from transition_matrix import load_processed_data
def get_trial_scores(trial: int, eval_data):
    """Score every paragraph of every document with one trained sentence classifier.

    Loads the checkpoint ``saved_models/sentence_classifier/roberta-base_{trial}``,
    classifies each sentence of each document in ``eval_data`` and averages the
    per-label sentence scores over the sentences of each paragraph.

    Args:
        trial: Number of the training run whose checkpoint is loaded.
        eval_data: Dataset object exposing ``documents`` (iterable of document
            ids), ``data`` (table with ``document``, ``paragraph`` and ``text``
            columns; presumably a pandas DataFrame), ``label_set`` and
            ``label2idx`` (label name -> column index).

    Returns:
        Dict mapping each document id to a float64 array with one row per
        distinct paragraph and one column per label, holding the mean
        classifier score of the paragraph's sentences.
    """
    tokenizer = AutoTokenizer.from_pretrained("roberta-base", model_max_length=512)
    sentence_classifier = pipeline(
        task="text-classification",
        model=f"saved_models/sentence_classifier/roberta-base_{trial}",
        tokenizer=tokenizer,
        return_all_scores=True,
    )
    num_labels = len(eval_data.label_set)
    label2idx = eval_data.label2idx

    trial_pred_scores = {}
    for doc_idx in tqdm(sorted(eval_data.documents)):
        document = eval_data.data[eval_data.data["document"] == doc_idx]
        paragraph_mapping = document["paragraph"].tolist()
        texts = document["text"].tolist()
        num_sentences_per_paragraph = Counter(paragraph_mapping)

        # One row per distinct paragraph of the document.
        # NOTE(review): paragraph ids are used directly as row indices, which
        # assumes they are 0-based and contiguous within a document - confirm.
        sentence_score_matrix = np.zeros(
            (len(num_sentences_per_paragraph), num_labels), dtype=np.float64
        )
        if not texts:
            # Nothing to classify; keep the empty matrix and skip the model call.
            trial_pred_scores[doc_idx] = sentence_score_matrix
            continue

        # Get label prediction scores for individual sentences.
        # Truncation has to be requested at call time: it is not a tokenizer
        # constructor option, and inputs longer than model_max_length would
        # otherwise be fed to the model untruncated.
        sentence_scores = sentence_classifier(texts, batch_size=16, truncation=True)
        for paragraph_idx, label_scores in zip(paragraph_mapping, sentence_scores):
            sentence_count = num_sentences_per_paragraph[paragraph_idx]
            for label_score in label_scores:
                # Dividing by the sentence count turns the running sum into
                # the paragraph-level mean score.
                score = label_score["score"] / sentence_count
                sentence_score_matrix[paragraph_idx, label2idx[label_score["label"]]] += score
        trial_pred_scores[doc_idx] = sentence_score_matrix

    # Drop the model reference and release cached GPU memory before the next
    # trial's model is loaded by the caller.
    del sentence_classifier
    torch.cuda.empty_cache()
    return trial_pred_scores
if __name__ == '__main__':
    # Evaluate each of the five trained sentence classifiers on the dev set,
    # then evaluate the ensemble obtained by summing their paragraph scores.

    # Cache of per-trial paragraph scores so the models only have to run once.
    predictions_path = "saved_models/dev_ensemble_predictions.pickle"

    dev_data = load_processed_data("data/dev_processed.csv")
    sorted_documents = sorted(dev_data.documents)

    if os.path.exists(predictions_path):
        # NOTE: unpickling executes arbitrary code; only load cache files that
        # were written by this script.
        with open(predictions_path, "rb") as psf:
            prediction_scores = pickle.load(psf)
    else:
        prediction_scores = {k: get_trial_scores(trial=k, eval_data=dev_data) for k in range(1, 6)}
        with open(predictions_path, "wb") as psf:
            pickle.dump(prediction_scores, psf)

    # Gold labels, flattened in the same document order used for predictions.
    y_true = []
    for document_idx in sorted_documents:
        y_true.extend(dev_data.paragraph_labels[document_idx])

    # Per-trial score matrices over all paragraphs, kept for the ensemble.
    trial_score_matrices = []
    for model, trial_prediction_scores in prediction_scores.items():
        print(model)
        # Sanity-check print: scores of the first two paragraphs of the first
        # document (looked up by its actual id rather than a hard-coded 0).
        print(trial_prediction_scores[sorted_documents[0]][:2].round(2))

        trial_matrix = np.concatenate(
            [trial_prediction_scores[document_idx] for document_idx in sorted_documents], axis=0
        )
        trial_score_matrices.append(trial_matrix)

        y_pred = np.argmax(trial_matrix, axis=1).tolist()
        y_pred = [dev_data.idx2label[label] for label in y_pred]
        print(classification_report(y_true=y_true, y_pred=y_pred, zero_division=0.0))
        print("\n\n")

    # Ensemble: sum the scores of all trials and pick the best label per paragraph.
    ensemble_scores = np.stack(trial_score_matrices).sum(axis=0)
    y_pred_ensemble = np.argmax(ensemble_scores, axis=1).tolist()
    y_pred_ensemble = [dev_data.idx2label[label] for label in y_pred_ensemble]
    print("Ensemble:")
    print(classification_report(y_true=y_true, y_pred=y_pred_ensemble, zero_division=0.0))
    print("\n\n")