# Initialization: imports and auxiliary setup

In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score

# Toggle between running on Google Colab (project stored on Google Drive) and running locally.
use_drive = True
if use_drive:
  # Root folder that the relative data/model paths used by the helper functions are appended to.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  # Switch the working directory to the project folder and list its contents.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  # Local run: paths are resolved relative to the current working directory.
  PATH = "./"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/CIL 2022
'code submission README.txt'   OLD		      'Saved Model States'
 data			       Preprocessing	      'Word embeddings'
 Models			      'README document.gdoc'


# Initialization: auxiliary ensemble helper functions

In [26]:
#read in the predictions of each model
def read_predictions(prediction_of_model):
  """Load the validation-set predictions of several models.

  prediction_of_model: list of model names. For every name the file
  PATH + name + "_val.csv" is parsed with np.loadtxt (delimiter ",")
  and the result is cast to int.

  Returns a list holding one integer numpy array per model, in the
  same order as the given names.
  """
  return [
      np.loadtxt(PATH + name + "_val.csv", delimiter=",").astype(int)
      for name in prediction_of_model
  ]

def read_training_predictions(prediction_of_model):
  """Load the training-set predictions of several models.

  prediction_of_model: list of model names. For every name the file
  PATH + name + "_train.csv" is parsed with np.loadtxt (delimiter ",")
  and the result is cast to int.

  Returns a list holding one integer numpy array per model, in the
  same order as the given names.
  """
  return [
      np.loadtxt(PATH + name + "_train.csv", delimiter=",").astype(int)
      for name in prediction_of_model
  ]

def read_test_predictions(prediction_of_model):
  """Load test-set predictions from the models' submission files.

  prediction_of_model: list of model names. For every name the file
  PATH + name + "_submission.csv" is read: the first line (header) is
  skipped and the second comma-separated field of each remaining line is
  taken as the predicted label.

  Labels equal to -1 are converted to 0 so that the result uses the 0/1
  encoding. Blank lines (e.g. a trailing empty line) are ignored instead of
  raising an IndexError.

  Returns a list holding one integer numpy array per model, in the same
  order as the given names. The first ten labels of every model are printed.
  """
  prediction_list = []
  for model_name in prediction_of_model:
    path = PATH + model_name + "_submission.csv"
    with open(path) as file:
      next(file, None)  # skip the header row (no-op on an empty file)
      values = [line.split(",")[1].strip() for line in file if line.strip()]
    nparray = np.asarray(values).astype(int)

    # map label -1 to 0
    nparray[nparray == -1] = 0

    print(nparray[:10])
    prediction_list.append(nparray)
  return prediction_list

def ensemble_majority_vote(number_of_predictions, list_of_predictions):
  """Combine several models' label predictions by majority vote.

  number_of_predictions: number of samples to vote on.
  list_of_predictions: one indexable sequence of labels per model; each
  label is used as an index selecting the tally to increment
  (0: negative label, 1: positive label).

  A sample is assigned label 0 only when the negative votes strictly
  outnumber the positive ones; ties therefore yield label 1. The labels are
  returned in the 0/1 encoding (no remapping to -1 happens here).

  Returns three numpy arrays (the first two mainly for debugging):
  negative vote counts, positive vote counts, final labels.
  """
  votes_negative = [0] * number_of_predictions
  votes_positive = [0] * number_of_predictions
  tallies = (votes_negative, votes_positive)

  for model_votes in list_of_predictions:
    for position in range(number_of_predictions):
      tallies[model_votes[position]][position] += 1

  winners = [
      0 if against > in_favour else 1
      for against, in_favour in zip(votes_negative, votes_positive)
  ]

  return np.array(votes_negative), np.array(votes_positive), np.array(winners)

def generate_submission(final, submission_file=None):
  """Write the final predictions to a submission CSV with columns Id, Prediction.

  final: sequence/array of predicted labels; every label equal to 0 is
    written as -1, all other values are written unchanged. The input is NOT
    modified: the conversion happens on a copy, so the caller's array keeps
    its original encoding.
  submission_file: optional output path. When omitted, the file
    PATH + "data/test data/least_squares_voting_ensemble_submission.csv"
    is written.

  Ids are numbered consecutively starting at 1. The resulting dataframe is
  printed before it is saved. Returns None.
  """
  if submission_file is None:
    submission_file = PATH + "data/test data/least_squares_voting_ensemble_submission.csv"

  #map label 0 to -1 (np.where builds a new array, leaving `final` untouched)
  predictions = np.asarray(final)
  predictions = np.where(predictions == 0, -1, predictions)

  # Create the pandas dataframe
  ids = np.arange(1, len(predictions) + 1)
  data = {"Id": ids, "Prediction": predictions}
  df = pd.DataFrame(data, columns=["Id", "Prediction"])
  print(df)
  #save submission file
  df.to_csv(submission_file, index=False)


def read_valid_labels(PREPROCESSING_CHOICE):
  """Read the validation labels of one preprocessing variant.

  PREPROCESSING_CHOICE: name of the sub-folder of PATH + "data/" that
  contains "val_labels.txt" (one label per line).

  Returns the labels as an integer numpy array, in file order.
  """
  labels_file = PATH + "data/" + PREPROCESSING_CHOICE + "/" + "val_labels.txt"
  with open(labels_file) as handle:
    stripped = [row.strip() for row in handle]
  return np.array(stripped).astype(int)

def read_training_labels(PREPROCESSING_CHOICE):
  """Read the training labels of one preprocessing variant.

  PREPROCESSING_CHOICE: name of the sub-folder of PATH + "data/" that
  contains "train_labels.txt" (one label per line).

  Returns the labels as an integer numpy array, in file order.
  """
  labels_file = PATH + "data/" + PREPROCESSING_CHOICE + "/" + "train_labels.txt"
  with open(labels_file) as handle:
    stripped = [row.strip() for row in handle]
  return np.array(stripped).astype(int)

  
def print_stats(y_val, y_val_pred):
  """Print classification metrics of predictions against reference labels.

  y_val: reference labels.
  y_val_pred: predicted labels.

  Prints, one per line and in this order: accuracy, recall, precision,
  F1 and ROC AUC, each computed by the correspondingly named scorer
  called as scorer(y_val, y_val_pred).
  """
  report = (
      ('Acc: ', accuracy_score),
      ('Recall: ', recall_score),
      ('Precision: ', precision_score),
      ('F1: ', f1_score),
      ('ROC_AUC: ', roc_auc_score),
  )
  # Each metric is computed right before it is printed, one line per metric.
  for prefix, scorer in report:
    print(f'{prefix}{scorer(y_val, y_val_pred)}')

# Fit the ridge ensemble and generate the CSV file with predictions

In [27]:
# Base models whose saved predictions feed the ensemble
# (paths relative to PATH, without the "_train.csv" / "_val.csv" / "_submission.csv" suffix).
models = [
          "data/test data/finetuned_roberta_model",
          "data/test data/Grubert v.A.2.;epochnr=2",
          # "data/test data/XLNET v.A.3;epochnr=3",
          "data/test data/XLNET v.A.2 hyperparametertuning;epochnr=1",
          ]

# Training-set predictions, one integer array per model.
list_of_predictions = read_training_predictions(models)
print(list(map(len, list_of_predictions)))
number_of_predictions = len(list_of_predictions[0])

# Load the training labels and keep exactly as many as there are predictions,
# instead of hard-coding the count.
# NOTE(review): presumably the predictions cover the first
# `number_of_predictions` training samples in label-file order — confirm.
PREPROCESSING_CHOICE = "raw"
labels = read_training_labels(PREPROCESSING_CHOICE)[:number_of_predictions]

# prepare data: stack the per-model predictions as rows, then transpose so that
# each row is one sample and each column one model.
row_matrix = np.asarray(list_of_predictions)
print(row_matrix.shape)
column_matrix = row_matrix.transpose()
print(column_matrix.shape)

# Fit a ridge classifier that learns one weight per base model.
from sklearn.linear_model import RidgeClassifier
clf = RidgeClassifier().fit(column_matrix, labels)

print(clf.coef_)

# Softmax over the learned coefficients, printed as a normalised view of the
# per-model weights (inspection only; not used by the cells shown here).
import torch
vector = torch.tensor(clf.coef_[0])
probabilities = torch.nn.Softmax(dim=-1)(vector)
print(probabilities)

[500000, 500000, 500000]
(3, 500000)
(500000, 3)
[[0.10498783 1.16611039 0.53975231]]
tensor([0.1840, 0.5317, 0.2842], dtype=torch.float64)


In [28]:
# Validation-set predictions of the same base models, one integer array per model.
list_of_val_predictions = read_predictions(models)
print(list(map(len, list_of_val_predictions)))
number_of_val_predictions = len(list_of_val_predictions[0])

# Load the validation labels; they are the reference for the metrics printed below.
PREPROCESSING_CHOICE = "raw"
val_labels = read_valid_labels(PREPROCESSING_CHOICE)

# prepare data: stack the per-model predictions as rows, then transpose so that
# each row is one sample and each column one model (the layout passed to clf.predict).
val_row_matrix = np.asarray(list_of_val_predictions)
print(val_row_matrix.shape)
val_column_matrix = val_row_matrix.transpose()
print(val_column_matrix.shape)


# NOTE: despite its name, test_labels holds the fitted classifier's predictions
# on the validation set here.
test_labels = clf.predict(val_column_matrix)
print(test_labels[:10])

print_stats(val_labels, test_labels)

# No submission is written here: these are validation-set predictions.

[227049, 227049, 227049]
(3, 227049)
(227049, 3)
[1 0 1 0 1 0 0 0 1 0]
Acc: 0.8962294482688759
Recall: 0.9005371332179163
Precision: 0.8913964320239037
F1: 0.8959434691398919
ROC_AUC: 0.8962629379271534


In [29]:
# Test-set predictions of the base models, read from their submission files
# (the reader converts -1 labels to 0).
# NOTE: list_of_predictions, row_matrix and column_matrix reuse the names of the
# training-set variables from the fitting cell and overwrite them.
list_of_predictions = read_test_predictions(models)
row_matrix = np.asarray(list_of_predictions)
print(row_matrix.shape)
# Transpose so that each row is one sample and each column one model.
column_matrix = row_matrix.transpose()
print(column_matrix.shape)

# Ensemble prediction for the test set with the classifier fitted earlier.
test_labels = clf.predict(column_matrix)
print(test_labels[:10])

# Write the ensemble's test predictions to the submission CSV (labels 0 are written as -1).
generate_submission(test_labels)

[0 0 1 1 0 0 0 1 1 1]
[0 0 1 1 0 0 0 1 1 1]
[0 0 1 1 0 0 0 1 1 0]
(3, 10000)
(10000, 3)
[0 0 1 1 0 0 0 1 1 1]
         Id  Prediction
0         1          -1
1         2          -1
2         3           1
3         4           1
4         5          -1
...     ...         ...
9995   9996           1
9996   9997          -1
9997   9998          -1
9998   9999           1
9999  10000          -1

[10000 rows x 2 columns]
