# Initialization: imports and Google Drive setup

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Toggle between running on Google Colab (project files on Google Drive)
# and running with files relative to the current working directory.
use_drive = True
if use_drive:
  # Project root on the mounted Drive. The helper functions in this notebook
  # build file paths by string concatenation onto PATH, so the trailing "/"
  # is required.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  # Change into the project folder and list its contents as a sanity check.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  # Local run: resolve all paths relative to the current directory.
  PATH = "./"

Mounted at /content/drive
/content/drive/MyDrive/CIL 2022
'code submission README.txt'   Models	      'README document.gdoc'
 data			       OLD	      'Saved Model States'
'Final Results.gsheet'	       Preprocessing  'Word embeddings'


# Initialization: auxiliary functions for the ensemble methods

In [2]:
#read in the predictions of each model
def read_predictions(prediction_of_model):
  """Load the predicted labels of several models from their submission CSVs.

  For every entry ``name`` of ``prediction_of_model`` the file
  ``PATH + name + "_submission.csv"`` is read. The first line is treated as a
  header and skipped; the label is taken from the second comma-separated
  column of every remaining non-blank line. Labels equal to -1 are remapped
  to 0, so every returned array uses 0 for the negative class.

  Args:
    prediction_of_model: iterable of model path prefixes (strings), relative
      to the global ``PATH``.

  Returns:
    A list with one integer numpy array of labels per model, in input order.

  Raises:
    IndexError: if a non-blank data line has no second column.
    ValueError: if a label cannot be converted to an integer.
  """
  prediction_list = []
  for model_name in prediction_of_model:
    path = PATH + model_name + "_submission.csv"
    with open(path) as file:
      next(file, None)  # skip the header line (no-op for an empty file)
      # Blank lines (e.g. a trailing newline at the end of the file) carry no
      # label and would otherwise fail on the column lookup.
      labels = [line.split(",")[1].strip() for line in file if line.strip()]
    nparray = np.asarray(labels).astype(int)

    # Map the negative label -1 to 0.
    nparray[nparray == -1] = 0

    # Show the first few labels as a quick visual sanity check.
    print(nparray[:10])
    prediction_list.append(nparray)
  return prediction_list

def ensemble_majority_vote(number_of_predictions, list_of_predictions):
  """Combine the binary predictions of several models by majority vote.

  For each of the first ``number_of_predictions`` samples the negative and
  positive votes over all models are counted. A sample is labelled -1 when
  the negative votes strictly outnumber the positive ones and 1 otherwise,
  so ties (and samples without any vote) resolve to the positive label.

  Both 0 and -1 are accepted as the negative label. Using a -1 label directly
  as a list index would wrap around and count it as a positive vote, so the
  votes are counted by comparison instead of by indexing.

  Args:
    number_of_predictions: number of samples to vote on.
    list_of_predictions: iterable with one sequence of labels per model; each
      sequence must contain at least ``number_of_predictions`` labels taken
      from {-1, 0, 1}.

  Returns:
    A tuple ``(neg_votes, pos_votes, final_prediction)`` of integer numpy
    arrays of length ``number_of_predictions``. The two vote-count arrays are
    returned for debugging purposes; ``final_prediction`` holds -1 / 1 labels.

  Raises:
    ValueError: if a model provides fewer than ``number_of_predictions``
      labels or a label outside {-1, 0, 1}.
  """
  neg_label_list = np.zeros(number_of_predictions, dtype=int)
  pos_label_list = np.zeros(number_of_predictions, dtype=int)

  for prediction_list in list_of_predictions:
    votes = np.asarray(prediction_list)[:number_of_predictions]
    if votes.shape != (number_of_predictions,):
      raise ValueError(
          "every model must provide at least number_of_predictions labels")

    is_positive = votes == 1
    is_negative = (votes == 0) | (votes == -1)
    if not np.all(is_positive | is_negative):
      raise ValueError("labels must be -1, 0 or 1")

    pos_label_list += is_positive
    neg_label_list += is_negative

  # Strict comparison: ties go to the positive label.
  final_prediction = np.where(neg_label_list > pos_label_list, -1, 1)

  #return 3 arrays for debugging purposes
  return neg_label_list, pos_label_list, final_prediction

def generate_submission(final, submission_file=None):
  """Write the final predictions to a submission CSV file.

  The file has the columns ``Id`` (1-based running index) and ``Prediction``
  (the entries of ``final``) and is written without the DataFrame index. The
  DataFrame is also printed for inspection.

  Args:
    final: sequence of predicted labels, one per sample.
    submission_file: path of the CSV file to write. Defaults to
      ``PATH + "data/test data/majority_voting_ensemble_submission.csv"``.
  """
  if submission_file is None:
    submission_file = PATH + "data/test data/majority_voting_ensemble_submission.csv"

  # Create the pandas dataframe; ids start at 1.
  ids = np.arange(1, len(final) + 1)
  df = pd.DataFrame({"Id": ids, "Prediction": final}, columns=["Id", "Prediction"])
  print(df)
  #save submission file
  df.to_csv(submission_file, index=False)


def read_valid_labels(PREPROCESSING_CHOICE):
  """Read the validation labels belonging to one preprocessing variant.

  The labels are read from
  ``PATH + "data/" + PREPROCESSING_CHOICE + "/val_labels.txt"``, one integer
  per line. Blank lines (e.g. a trailing newline at the end of the file) are
  skipped instead of failing the integer conversion.

  Args:
    PREPROCESSING_CHOICE: name of the preprocessing sub-directory of
      ``PATH + "data/"``.

  Returns:
    An integer numpy array with one label per non-blank line.

  Raises:
    ValueError: if a non-blank line is not an integer.
  """
  dataset_path = PATH + "data/" + PREPROCESSING_CHOICE + "/"
  with open(dataset_path + "val_labels.txt") as file:
    stripped = (line.strip() for line in file)
    lines = [label for label in stripped if label]
  return np.array(lines).astype(int)

  
def print_stats(y_val, y_val_pred):
  """Print the classification metrics of predictions against true labels.

  Prints accuracy, recall, precision, F1 and ROC-AUC, one per line and in
  that order, each computed by calling the corresponding metric function
  with ``(y_val, y_val_pred)``.

  Args:
    y_val: true labels.
    y_val_pred: predicted labels.
  """
  metrics = (
      ('Acc', accuracy_score),
      ('Recall', recall_score),
      ('Precision', precision_score),
      ('F1', f1_score),
      ('ROC_AUC', roc_auc_score),
  )
  for label, metric in metrics:
    print(f'{label}: {metric(y_val, y_val_pred)}')

# Generate the CSV file with the ensemble predictions

In [3]:
# Path prefixes (relative to PATH) of the models whose test predictions are
# combined; read_predictions appends "_submission.csv" to each prefix.
models = [
          "data/test data/finetuned_roberta_model",
          "data/test data/Grubert v.A.2.;epochnr=2",
          # "data/test data/XLNET v.A.3;epochnr=3",
          "data/test data/XLNET v.A.2 hyperparametertuning;epochnr=1",
          ]

list_of_predictions = read_predictions(models)
# Sanity check: print how many predictions were read for each model.
print(list(map(len, list_of_predictions)))
# The first model's prediction count is used as the number of samples to vote on.
number_of_predictions = len(list_of_predictions[0])
# ensemble_majority_vote resolves ties to the positive label, so an odd number
# of models avoids ties between binary votes.
neg_pred_label, pos_pred_label, test_label = ensemble_majority_vote(number_of_predictions, list_of_predictions)

# Write the ensembled labels to the submission CSV.
generate_submission(test_label)

[0 0 1 1 0 0 0 1 1 1]
[0 0 1 1 0 0 0 1 1 1]
[0 0 1 1 0 0 0 1 1 0]
[10000, 10000, 10000]
         Id  Prediction
0         1          -1
1         2          -1
2         3           1
3         4           1
4         5          -1
...     ...         ...
9995   9996           1
9996   9997          -1
9997   9998          -1
9998   9999           1
9999  10000          -1

[10000 rows x 2 columns]
