# Initialization (auxiliary setup)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Switch between running on Google Colab (project files on Google Drive)
# and running from a local checkout.
use_drive = True
if use_drive:
  # Project root on the mounted Drive; read_predictions and read_valid_labels
  # prefix their file paths with PATH.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  # google.colab is imported only on this branch, so the local branch does
  # not need the package.
  from google.colab import drive
  drive.mount('/content/drive')
  # IPython magics: change the working directory to the project root and
  # list its contents.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  # Local run: paths are resolved relative to the current working directory.
  PATH = "./"

Mounted at /content/drive
/content/drive/MyDrive/CIL 2022
'code submission README.txt'   Models	      'README document.gdoc'
 data			       OLD	      'Saved Model States'
'Final Results.gsheet'	       Preprocessing  'Word embeddings'


# Initialization: auxiliary ensemble methods

In [2]:
def read_predictions(prediction_of_model):
  """Load the saved validation predictions of each model.

  Args:
    prediction_of_model: iterable of model names. Each name is appended to
      PATH and suffixed with "_val.csv" to form the file to read.

  Returns:
    A list holding one integer numpy array per model, in the order of the
    given names.
  """
  return [
      np.loadtxt(PATH + model_name + "_val.csv", delimiter=",").astype(int)
      for model_name in prediction_of_model
  ]

def ensemble_majority_vote(number_of_predictions, list_of_predictions):
  """Combine the binary predictions of several models by majority vote.

  Args:
    number_of_predictions: number of samples to vote on. Only the first
      `number_of_predictions` entries of each model's predictions are used.
    list_of_predictions: sequence with one array-like of 0/1 labels per model
      (0 = negative, 1 = positive).

  Returns:
    A tuple of three numpy arrays of length `number_of_predictions`:
    the negative-vote counts, the positive-vote counts and the final 0/1
    labels. The two count arrays are returned for debugging purposes.
    Ties are resolved in favour of the positive label (1).

  Raises:
    IndexError: if a model supplies fewer than `number_of_predictions` labels.
    ValueError: if any used label is neither 0 nor 1. Labels were previously
      used directly as list indices, so e.g. -1 was silently counted as a
      positive vote; such input is now rejected instead of miscounted.
  """
  neg_votes = np.zeros(number_of_predictions, dtype=int)
  pos_votes = np.zeros(number_of_predictions, dtype=int)

  for model_index, prediction_list in enumerate(list_of_predictions):
    votes = np.asarray(prediction_list)[:number_of_predictions]
    if len(votes) < number_of_predictions:
      raise IndexError(
          f"prediction list {model_index} has {len(votes)} entries, "
          f"expected at least {number_of_predictions}")
    if not np.isin(votes, (0, 1)).all():
      raise ValueError(
          f"prediction list {model_index} contains labels other than 0 and 1")
    # Boolean masks add as 0/1, so each model contributes exactly one vote
    # per sample to one of the two counters.
    neg_votes += votes == 0
    pos_votes += votes == 1

  # A sample is negative only when negatives strictly outnumber positives.
  final_prediction = (pos_votes >= neg_votes).astype(int)

  return neg_votes, pos_votes, final_prediction


def read_valid_labels(PREPROCESSING_CHOICE):
  """Read the validation labels of one preprocessing variant.

  Args:
    PREPROCESSING_CHOICE: name of the sub-directory of PATH + "data/" that
      contains "val_labels.txt".

  Returns:
    An integer numpy array with one entry per line of the labels file.
  """
  labels_file = PATH + "data/" + PREPROCESSING_CHOICE + "/" + "val_labels.txt"
  with open(labels_file) as handle:
    # Strip surrounding whitespace (including the newline) before the
    # string -> int conversion below.
    stripped_rows = [row.strip() for row in handle]
  return np.array(stripped_rows).astype(int)

  
def print_stats(y_val, y_val_pred):
  """Print accuracy, recall, precision, F1 and ROC AUC, one per line.

  Args:
    y_val: ground-truth labels.
    y_val_pred: predicted labels to score against `y_val`.
  """
  metrics = (
      ("Acc", accuracy_score),
      ("Recall", recall_score),
      ("Precision", precision_score),
      ("F1", f1_score),
      ("ROC_AUC", roc_auc_score),
  )
  for metric_name, metric_fn in metrics:
    print(f'{metric_name}: {metric_fn(y_val, y_val_pred)}')

# Generate CSV file with predictions

In [3]:
# Models taking part in the vote; read_predictions turns each entry into
# PATH + entry + "_val.csv".
models = [
    "data/test data/finetuned_roberta_model",
    "data/test data/Grubert v.A.2.;epochnr=2",
    # Currently excluded from the ensemble:
    # "data/test data/XLNET v.A.3;epochnr=3",
    "data/test data/XLNET v.A.2 hyperparametertuning;epochnr=1",
]

# Ground-truth validation labels used to score each model and the ensemble.
PREPROCESSING_CHOICE = "raw"
labels = read_valid_labels(PREPROCESSING_CHOICE)

list_of_predictions = read_predictions(models)

# Metrics of every individual model.
for model_name, prediction in zip(models, list_of_predictions):
  print(f"stats for model: {model_name}")
  print_stats(labels, prediction)

# Show the number of predictions per model so mismatching lengths are visible.
print([len(prediction) for prediction in list_of_predictions])
number_of_predictions = len(list_of_predictions[0])
neg_pred_label, pos_pred_label, test_label = ensemble_majority_vote(
    number_of_predictions, list_of_predictions)

# Metrics of the majority-vote ensemble.
print("\nstats for ensemble")
print_stats(labels, test_label)

stats for model: data/test data/finetuned_roberta_model
Acc: 0.8970486546956824
Recall: 0.8901762329648866
Precision: 0.901096432102094
F1: 0.8956030459346598
ROC_AUC: 0.8969952257522879
stats for model: data/test data/Grubert v.A.2.;epochnr=2
Acc: 0.8962294482688759
Recall: 0.9005371332179163
Precision: 0.8913964320239037
F1: 0.8959434691398919
ROC_AUC: 0.8962629379271534
stats for model: data/test data/XLNET v.A.2 hyperparametertuning;epochnr=1
Acc: 0.8958638884117526
Recall: 0.9138278510232166
Precision: 0.8807362215167669
F1: 0.8969769326628961
ROC_AUC: 0.8960035474110263
[227049, 227049, 227049]

stats for ensemble
Acc: 0.9027566736695604
Recall: 0.9090957517645492
Precision: 0.8963549143877587
F1: 0.9026803778358626
ROC_AUC: 0.9028059561871324
