#Init auxiliary functions

In [1]:
# Name of the trained model; generate_results_on_val_file uses it as the stem of the CSV file name.
model_name = "Grubert v.A.2.;epochnr=2"
# Passed as `max_length` to tokenizer.encode in read_testfile, i.e. the length every
# encoded tweet is padded/truncated to; the Grubert class also stores this value.
maximal_number_of_words_in_tweet = 60

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as functional
#comment out when using euler cluster
!pip install transformers
from transformers import BertModel

#from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pickle

# Switch between the Google Colab setup (True) and a plain local working directory (False).
use_drive = True
#When using google colab, use this.
#when using euler, comment this if else structure out and set PATH to "./"
if use_drive:
  # Project folder inside the mounted Google Drive; PATH is the prefix for all data/model paths.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  # Change into the project folder and list its contents (IPython magics, notebook only).
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  # Relative to the current working directory.
  PATH = "./"

# Names of the available preprocessing variants of the data set (indices 0 - 8).
# The chosen name is later used as a sub-folder of "data/" when building file paths.
PREPROCESSING_OPTIONS = [
    "raw",
    "no-stemming_no-lemmatize_no-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_with-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_with-spellcorrect",
    "with-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "with-stemming_with-lemmatize_no-stopwords_with-spellcorrect",
    "with-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
]

# Exactly one entry of PREPROCESSING_OPTIONS.
# NOTE(review): the original author's remark here was "Should be 7/8" - presumably the
# intended index; option 0 ("raw") is what is actually selected. Confirm.
PREPROCESSING_CHOICE = PREPROCESSING_OPTIONS[0]

# Report the selection on a single line.
print("Choosing data: ", PREPROCESSING_CHOICE, sep="")

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

from transformers import BertTokenizer
# (torch.utils.data, Dataset, DataLoader and TensorDataset are already imported at the top of this cell.)
# Tokenizer of the 'bert-base-uncased' checkpoint - the same checkpoint name the Grubert
# class loads its BertModel from. read_testfile uses it to encode every tweet.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

print("Init methods.")

def read_testfile(filename, max_lines=500000):
  """Encode a text file of tweets (one tweet per line) into a tensor of token ids.

  Every line is stripped and encoded with the module-level ``tokenizer``; the
  encoding is padded or truncated to ``maximal_number_of_words_in_tweet`` ids.

  Args:
    filename: Path of the text file to read.
    max_lines: Maximum number of lines to read. Defaults to 500000, the limit
      that used to be hard coded; ``None`` reads the whole file.

  Returns:
    A ``torch.LongTensor`` holding one row of token ids per line that was read.
  """
  from itertools import islice

  # Explicit encoding so the result does not depend on the platform's locale.
  with open(filename, encoding="utf-8") as file:
    lines = [
        tokenizer.encode(
            line.strip(),
            max_length=maximal_number_of_words_in_tweet,
            # `padding="max_length"` is the replacement for the deprecated
            # `pad_to_max_length=True`.
            padding="max_length",
            truncation=True,
        )
        for line in islice(file, max_lines)
    ]
  # LongTensor already builds the tensor; re-wrapping it in torch.tensor() only
  # made an extra copy and triggered PyTorch's copy-construct warning.
  return torch.LongTensor(lines)

def load_model(path_to_model):
  """Load a saved training state and return the classifier stored in it.

  The file is expected to contain a mapping with a "classifier" entry (the
  saved state is sketched as {"epoch", "classifier", "optimizer"}).

  Args:
    path_to_model: Path of the file written with ``torch.save``.

  Returns:
    The object stored under the "classifier" key.
  """
  # NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
  # With CUDA the storages stay on the device they were saved from (map_location=None);
  # without CUDA everything is remapped to the CPU.
  target_location = None if torch.cuda.is_available() else 'cpu'
  saved_state = torch.load(path_to_model, map_location=target_location)
  return saved_state["classifier"]

#define a slightly different method when evaluating on test set
def prepare_embeddings_for_eval_testset(tweet, embedder, device):
  """Build the three concatenated embedding tensors the classifier consumes.

  Args:
    tweet: One batch from the data loader; only ``tweet[0]`` (the token ids) is used.
    embedder: Callable accepting ``input_ids``; element 2 of its result must be an
      indexable sequence of tensors (presumably BERT's per-layer hidden states,
      since the embedder used in this file is created with
      ``output_hidden_states=True`` - confirm for other embedders).
    device: Device the token ids are moved to before embedding.

  Returns:
    A list of 3 tensors. Entry ``i`` is the concatenation along dimension 2 of
    the sequence elements ``4*i+1`` up to and including ``4*i+4``.
  """
  layer_outputs = embedder(input_ids=tweet[0].to(device))[2]
  # 3 groups of 4 consecutive entries; these counts mirror number_of_gru and
  # number_of_combined_bert_hidden_layers_per_gru in the Grubert class.
  grouped = []
  for group in range(3):
    first = 4 * group + 1
    grouped.append(torch.cat(layer_outputs[first : first + 4], 2))
  return grouped

from tqdm.auto import tqdm

def evaluate_on_testset(data_loader, classifier, device, embedder):
  """Predict a class index for every tweet served by ``data_loader``.

  Args:
    data_loader: Iterable of batches; every batch is passed unchanged to
      ``prepare_embeddings_for_eval_testset``.
    classifier: Model that is switched to eval mode and whose call returns a
      mapping with a "logits" entry (class scores along dimension 1).
    device: Device the token ids are moved to before embedding.
    embedder: Embedding model handed to ``prepare_embeddings_for_eval_testset``.

  Returns:
    A flat list with one predicted class index (numpy integer scalar) per
    tweet, in the order the data loader yields them.
  """
  classifier.eval()
  final = []

  # Pure inference: without no_grad autograd would record a graph through the
  # embedder and the classifier for every batch, costing memory and time.
  with torch.no_grad():
    # tqdm wraps the loader directly, so the bar advances and closes on its own.
    for tweet in tqdm(data_loader):
      embeddings = prepare_embeddings_for_eval_testset(tweet, embedder, device)
      prediction = classifier(embeddings)

      # Index of the highest logit per tweet.
      _, extracted_prediction = prediction["logits"].max(dim=1)
      # Move to the CPU right away so no per-batch tensors pile up on the device.
      final.extend(extracted_prediction.cpu().numpy())

  # The former "save space" cleanup was dropped: it called clear_embeddings() on
  # the batch object itself and its failure was hidden by a bare `except: pass`.
  return final


def generate_results_on_val_file(final, model_name, output_dir="/content/drive/MyDrive/CIL 2022/data/test data/"):
  """Write the predictions to ``<output_dir>/<model_name>_train.csv``.

  The CSV has one prediction per row and neither a header nor an index column.
  Labels are written exactly as given; mapping label 0 to -1 is deliberately
  left to the ensemble step.

  Args:
    final: Sequence of predictions, one per tweet.
    model_name: Name of the model; used as the stem of the file name.
    output_dir: Directory the file is written to. Defaults to the Colab drive
      folder that used to be hard coded, so existing calls behave as before.

  Returns:
    None. Prints a confirmation line once the file has been written.
  """
  import os

  res_val_file = os.path.join(output_dir, model_name + "_train.csv")

  # header=False is the documented way to suppress the header row.
  pd.DataFrame(final).to_csv(res_val_file, index=False, header=False)
  print("Submission file generated.")



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 30.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 38.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Init methods.


#Init. Grubert class (model)

In [2]:
#hyperparameters hard coded
class Grubert(nn.Module):
  """BERT + GRU binary tweet classifier with hard-coded hyperparameters.

  Three bidirectional GRUs each read one pre-concatenated group of BERT
  hidden-state tensors; their outputs are concatenated and fed through a
  further bidirectional GRU, and a small feed-forward head turns the result
  into two class logits per tweet.

  Args:
    device: Device the BERT embedder is moved to and on which ``forward``
      places its inputs and the per-group GRUs.
  """

  def __init__(self, device):

    super().__init__()

    #Init device
    self.device = device

    #Binary classification
    self.number_of_classes = 2

    #Init hyperparameters
    self.number_of_hidden_units_per_gru = 100
    self.number_of_layers_in_gru = 1
    self.number_of_gru = 3
    self.number_of_combined_bert_hidden_layers_per_gru = 4

    self.max_number_of_words_per_tweet = maximal_number_of_words_in_tweet

    #For linear classifier layer
    self.number_of_hidden_units_for_linear = 100
    self.dropout = 0.5

    #Layers

    #embeddings: BERT returns all hidden states and stays trainable (fine-tuning)
    self.embedder = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
    for parameter in self.embedder.parameters():
      parameter.requires_grad = True
    self.embedder = self.embedder.to(self.device)

    #GRU
    # nn.ModuleList (not a plain Python list) registers the GRUs as submodules, so
    # they show up in parameters()/state_dict() and follow .to()/.eval()/.train().
    # With a plain list an optimizer built from model.parameters() never trained them.
    # Objects pickled with the old definition keep their plain list; forward() only
    # indexes self.grus, which works for both.
    self.grus = nn.ModuleList([
        nn.GRU(
            # 768 per combined BERT hidden layer (bert-base hidden width)
            self.number_of_combined_bert_hidden_layers_per_gru * 768,
            self.number_of_hidden_units_per_gru,
            num_layers=self.number_of_layers_in_gru,
            bidirectional=True,
        )
        for _ in range(self.number_of_gru)
    ])
    # Input width: forward/backward outputs (factor 2) of all per-group GRUs, concatenated.
    self.gru = nn.GRU(2 * self.number_of_gru * self.number_of_hidden_units_per_gru, self.number_of_hidden_units_per_gru, num_layers=self.number_of_layers_in_gru, bidirectional=True)

    self.linear_classifier = nn.Sequential(
        nn.Linear(2*self.number_of_hidden_units_per_gru, self.number_of_hidden_units_for_linear),
        nn.ReLU(),
        nn.Dropout(p=self.dropout),
        nn.Linear(self.number_of_hidden_units_for_linear, self.number_of_classes)
    )

    #init all layers in the linear classifier part
    for layer in self.linear_classifier:
      if (isinstance(layer, nn.Linear)):
        torch.nn.init.xavier_normal_(layer.weight)

  def forward(self, embedding):
    """Compute class logits from the grouped embeddings.

    Args:
      embedding: Sequence with (at least) ``number_of_gru`` 3-D tensors, one per
        GRU. Dimensions 0 and 1 are swapped before the GRUs (which are built
        with the default ``batch_first=False``), so the tensors are presumably
        laid out as (batch, sequence, features) - confirm against the caller.

    Returns:
      ``{"logits": tensor}`` where the tensor is the classifier output summed
      over dimension 1 (after the dimensions have been swapped back).
    """
    intermediate_result = [embedding[i].to(self.device).permute(1, 0, 2) for i in range(self.number_of_gru)]
    # .to(self.device) is kept so instances whose GRUs are not on the device yet
    # (e.g. older pickled objects with a plain list) still work; otherwise it is a no-op.
    output = [self.grus[i].to(self.device)(intermediate_result[i])[0] for i in range(self.number_of_gru)]

    # Concatenate the per-group GRU outputs along the feature dimension.
    x, _ = self.gru(torch.cat(output, 2).to(self.device))

    # Classifier
    res = self.linear_classifier(functional.relu(x.permute(1, 0, 2))).sum(dim=1)

    return {"logits": res}

#Generate predictions on the training set

In [3]:
# PATH is set once in the first cell (Colab drive folder or "./"). It is deliberately
# NOT redefined here: the former hard-coded Colab path silently overrode the
# use_drive / local setting.
# Location of the (pre-processed) training sentences for the chosen preprocessing variant.
train_set_data_path = PATH + "data/" + PREPROCESSING_CHOICE + "/train_sentences.txt"

# Read and encode the training tweets.
train_sentences = read_testfile(train_set_data_path)
print(len(train_sentences))

# Data loader over the encoded tweets; no shuffling, so predictions keep the file order.
batch_size = 64
train_loader = DataLoader(TensorDataset(train_sentences), batch_size = batch_size)

# Load the model of choice.
path_to_model = PATH + "Saved Model States/Grubert v.A.2.;epochnr=2 time_duraction=12943.558448791504s"
classifier = load_model(path_to_model)
embedder = classifier.embedder

# Adapt to the current device (the saved object still carries the device it was trained with).
classifier.device = device

# Predict a label for every training tweet.
final_prediction = evaluate_on_testset(train_loader, classifier, device, embedder)

# Write the predictions to the CSV file.
generate_results_on_val_file(final_prediction, model_name)



500000


  0%|          | 0/7813 [00:00<?, ?it/s]

Submission file generated.


In [4]:
# Sanity check: the number of predictions should match the number of encoded tweets printed in the previous cell.
print(len(final_prediction))

500000
