#Init. preliminaries

In [None]:
#Reference: https://aclanthology.org/2020.aacl-srw.19.pdf
#model_name is used as the prefix of the checkpoint and stats file names.
model_name = "Grubert v.A.2."

#Derivation of the per-tweet length cap:
#A tweet has a character limit of 280 characters.
#Average length of an English word is 5.1 characters (https://www.wolframalpha.com/input?i=average+english+word+length)
#We rounded it to 5 characters.
#Assuming one space between consecutive words (n words, n-1 spaces: 6n - 1 = 280), we get 46.8333 words in a tweet.
#Round it up to 47.
#We set the maximum number of words in a tweet to 60, to allow for additional "slack" in our analysis.
#NOTE(review): this value is passed as max_length to the BERT tokenizer when the data is read,
#so it presumably caps tokens rather than whitespace-separated words - confirm this is intended.
maximal_number_of_words_in_tweet = 60

#NOTE(review): not referenced by any of the visible cells - confirm whether it is still needed.
EVALUATE_ONLY = False

#import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as functional
#comment out when using euler cluster
!pip install transformers
from transformers import BertModel

import random

import torch.optim as optim
import torch.backends.cudnn as cudnn

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pickle


#Selects where data, checkpoints and stats live:
#  use_drive = True  -> Google Drive folder mounted inside Google Colab
#  use_drive = False -> current working directory
use_drive = True
#When using google colab, use this.
#When using euler, comment this if/else structure out and set PATH to "./"
#(the google.colab import and the %cd / !ls magics below only work inside Colab/IPython).
if use_drive:
  #PATH is the prefix used for the dataset folder and for saved checkpoints.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  #Change the working directory to the project folder; the stats files are opened
  #with a relative file name, so they end up in this directory.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  PATH = "./"

#Names of the available preprocessing variants (valid indices: 0 - 8).
#The chosen name selects the sub-folder of PATH + "data/" that the data is read from.
PREPROCESSING_OPTIONS = [
    "raw",
    "no-stemming_no-lemmatize_no-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_with-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_with-spellcorrect",
    "with-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "with-stemming_with-lemmatize_no-stopwords_with-spellcorrect",
    "with-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
]

#Original author's note: "Should be 7/8" (presumably the intended indices - confirm).
PREPROCESSING_CHOICE = PREPROCESSING_OPTIONS[0] # one from PREPROCESSING_OPTIONS
print("Choosing data: " + PREPROCESSING_CHOICE)

#Reproducibility: seed every random number generator used in this notebook with 42.
#The seeding recipe (including the two torch.backends.cudnn flags) was taken over
#from the following GitHub repository/file:
#https://github.com/ZuowenWang0000/GRUBERT-A-GRU-Based-Method-to-Fuse-BERT-Hidden-Layers-for-Twitter-sentiment-analysis/blob/master/train.py
seed = 42
print("Using seed: %d" % seed)

#Python, NumPy and PyTorch each keep their own generator state.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

#Ask cuDNN for deterministic kernels and switch off its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 36.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 5.7 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 24.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uni

#Grubert class (model)

In [None]:
#hyperparameters hard coded
class Grubert(nn.Module):
  """BERT embedder followed by bidirectional GRUs and a small feed-forward classifier.

  Structure (all sizes are hard-coded hyperparameters set in __init__):
    * embedder: 'bert-base-uncased' with output_hidden_states=True, all weights trainable.
    * grus: number_of_gru (3) bidirectional GRUs, each taking
      number_of_combined_bert_hidden_layers_per_gru (4) concatenated BERT hidden
      layers, i.e. 4 * 768 input features.
    * gru: one bidirectional GRU that consumes the concatenated outputs of `grus`.
    * linear_classifier: Linear -> ReLU -> Dropout -> Linear producing
      number_of_classes (2) logits.

  Note that forward() does NOT call the embedder; it expects the grouped hidden
  states to be computed by the caller.
  """

  def __init__(self, device):
    """Create all sub-modules.

    Args:
      device: torch.device the BERT embedder is moved to and that forward()
        moves its inputs to. The remaining layers are moved by calling
        `.to(device)` on the whole module.
    """
    super().__init__()

    #Init device
    self.device = device

    #Binary classification
    self.number_of_classes = 2

    #Init hyperparameters
    self.number_of_hidden_units_per_gru = 100
    self.number_of_layers_in_gru = 1
    self.number_of_gru = 3
    self.number_of_combined_bert_hidden_layers_per_gru = 4

    self.max_number_of_words_per_tweet = maximal_number_of_words_in_tweet

    #For linear classifier layer
    self.number_of_hidden_units_for_linear = 100
    self.dropout = 0.5

    #Layers

    #embeddings (BERT is fine-tuned: every parameter keeps requires_grad=True)
    self.embedder = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
    for parameter in self.embedder.parameters():
      parameter.requires_grad = True
    self.embedder = self.embedder.to(self.device)

    #GRU
    #The per-group GRUs must live in an nn.ModuleList: a plain Python list is
    #invisible to nn.Module, so its GRUs would be missing from parameters()
    #(never updated by the optimizer), from state_dict() and from .to(device).
    #768 is the width of one BERT hidden layer that is assumed here.
    gru_input_size = self.number_of_combined_bert_hidden_layers_per_gru * 768
    self.grus = nn.ModuleList([
        nn.GRU(gru_input_size, self.number_of_hidden_units_per_gru, num_layers=self.number_of_layers_in_gru, bidirectional=True)
        for _ in range(self.number_of_gru)
    ])
    #Each bidirectional GRU emits 2 * hidden units per step; the fusing GRU sees all of them.
    self.gru = nn.GRU(2 * self.number_of_gru * self.number_of_hidden_units_per_gru, self.number_of_hidden_units_per_gru, num_layers=self.number_of_layers_in_gru, bidirectional=True)

    self.linear_classifier = nn.Sequential(
        nn.Linear(2*self.number_of_hidden_units_per_gru, self.number_of_hidden_units_for_linear),
        nn.ReLU(),
        nn.Dropout(p=self.dropout),
        nn.Linear(self.number_of_hidden_units_for_linear, self.number_of_classes)
    )

    #init all layers in the linear classifier part
    for layer in self.linear_classifier:
      if (isinstance(layer, nn.Linear)):
        torch.nn.init.xavier_normal_(layer.weight)

  def forward(self, embedding):
    """Classify a batch from pre-computed groups of BERT hidden states.

    Args:
      embedding: indexable with at least number_of_gru tensors, one per GRU.
        Each is assumed to be (batch, sequence, 4 * 768) - dimensions 0 and 1
        are swapped before the GRUs, which are not batch_first.

    Returns:
      dict with key "logits": the classifier output summed over dimension 1
      (the sequence dimension after permuting back), one row per batch element.
    """
    #Swap dims 0 and 1 so that each input is laid out the way nn.GRU expects by default.
    intermediate_result = [embedding[i].to(self.device).permute(1, 0, 2) for i in range(self.number_of_gru)]
    #[0] keeps the per-step outputs and drops the final hidden state.
    output = [self.grus[i](intermediate_result[i])[0] for i in range(self.number_of_gru)]

    #Concatenate the per-group outputs along the feature dimension and fuse them.
    x, _ = self.gru(torch.cat(output, 2).to(self.device))

    # Classifier: applied per position, then summed over positions
    res = self.linear_classifier(functional.relu(x.permute(1, 0, 2))).sum(dim=1)

    return {"logits": res}

#Init auxiliary functions

In [None]:
def prepare_embeddings(tweet_and_label, embedder, device, layers_per_group=4, number_of_groups=3):
  """Run the embedder on a batch and group its hidden layers for the GRUs.

  Args:
    tweet_and_label: batch as an indexable pair; element 0 holds the token ids,
      element 1 the labels (both tensors).
    embedder: callable accepting `input_ids=...` whose result, indexed with [2],
      yields the sequence of per-layer hidden states (for a BertModel created with
      output_hidden_states=True this is presumably the embedding output followed
      by one tensor per encoder layer - confirm).
    device: torch.device the token ids and labels are moved to.
    layers_per_group: number of consecutive hidden layers concatenated per group
      (default 4, the value previously hard-coded).
    number_of_groups: number of groups to build (default 3, previously hard-coded).

  Returns:
    (groups, labels): `groups` is a list of `number_of_groups` tensors, group g
    being hidden states g*layers_per_group+1 .. (g+1)*layers_per_group
    concatenated along dimension 2 (hidden state 0 is skipped); `labels` is the
    label tensor on `device`.
  """
  tweet_list = tweet_and_label[0]
  label_list = tweet_and_label[1]
  #NOTE(review): no attention_mask is passed although the inputs are padded to a
  #fixed length when the data is read; left unchanged so that results stay
  #comparable with already trained checkpoints.
  embeddings = embedder(input_ids=tweet_list.to(device))
  label_list = label_list.to(device)

  hidden_states = embeddings[2]
  grouped_hidden_states = [
      torch.cat(hidden_states[group * layers_per_group + 1 : (group + 1) * layers_per_group + 1], 2)
      for group in range(number_of_groups)
  ]
  return grouped_hidden_states, label_list

def save_model(model_name, epoch_nr, classifier, optimizer, time_duration):
  """Write a checkpoint for one epoch below PATH.

  The file name is built from the model name, the epoch number and the duration
  (in seconds). The checkpoint is a dict with the keys "epoch_nr", "classifier"
  and "optimizer"; the classifier and optimizer are stored as whole objects,
  not as state_dicts.

  NOTE(review): the usual PyTorch recipe is to save `model.state_dict()` and to
  restore it with `load_state_dict`; the whole-object format is kept here
  because code that loads these checkpoints reads state["classifier"].
  """
  file_path = "".join([PATH, model_name, ";epochnr=", str(epoch_nr), " time_duraction=", str(time_duration), "s"])

  print("Saving model ... ", end="")
  torch.save(
      {
          "epoch_nr": epoch_nr,
          "classifier": classifier,
          "optimizer": optimizer,
      },
      file_path,
  )
  print("- Model saved.")

def init_save_stats(model_name):
  """Create (or overwrite) the stats file of a model and write its header.

  The file is named "<model_name>;stats.txt" and is opened relative to the
  current working directory. The header records the model name and the
  module-level settings use_drive and PREPROCESSING_CHOICE.

  Args:
    model_name: name of the model; used as prefix of the stats file name.
  """
  name_of_file_stats = model_name + ";stats.txt"
  #important, use (over)write "w"
  #The context manager closes the file even if one of the writes raises.
  with open(name_of_file_stats, "w", encoding="utf8") as file_obj:
    file_obj.write(f'Model name: {model_name}\n')
    file_obj.write(f'use_drive: {use_drive}\n')
    file_obj.write(f'PREPROCESSING_CHOICE: {PREPROCESSING_CHOICE}\n')

def final_stats_saving(model_name, training_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val):
  """Print the final training summary and append it to the model's stats file.

  Args:
    model_name: name of the model; the stats file is "<model_name>;stats.txt",
      opened relative to the current working directory.
    training_duration: total training time, printed as given.
    avg_loss_train, avg_loss_val: average losses, shown with 3 decimals.
    avg_acc_train, avg_acc_val: accuracies as fractions in [0, 1]; shown as
      percentages with 2 decimals.
  """
  name_of_file_stats = model_name + ";stats.txt"
  #Format once so that the console and the file always show the same text.
  summary_lines = [
      f'Training Time: {training_duration}',
      f'\tTrain Loss: {avg_loss_train:.3f} | Train Acc: {avg_acc_train*100:.2f}%',
      f'\t Val. Loss: {avg_loss_val:.3f} |  Val. Acc: {avg_acc_val*100:.2f}%',
  ]
  #important, use append "a"
  #The context manager closes the file even if printing or writing raises.
  with open(name_of_file_stats, "a", encoding="utf8") as file_obj:
    for line in summary_lines:
      print(line)
    file_obj.writelines(line + "\n" for line in summary_lines)

def save_stats(model_name, epoch_nr, epoch_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val):
  """Print the statistics of one epoch and append them to the model's stats file.

  Args:
    model_name: name of the model; the stats file is "<model_name>;stats.txt",
      opened relative to the current working directory.
    epoch_nr: number of the epoch being reported.
    epoch_duration: duration of the epoch, printed as given.
    avg_loss_train, avg_loss_val: average losses, shown with 3 decimals.
    avg_acc_train, avg_acc_val: accuracies as fractions in [0, 1]; shown as
      percentages with 2 decimals.
  """
  #Format once so that the console and the file always show the same text.
  epoch_lines = [
      f'Epoch: {epoch_nr} | Epoch Time: {epoch_duration}',
      f'\tTrain Loss: {avg_loss_train:.3f} | Train Acc: {avg_acc_train*100:.2f}%',
      f'\t Val. Loss: {avg_loss_val:.3f} |  Val. Acc: {avg_acc_val*100:.2f}%',
  ]
  for line in epoch_lines:
    print(line)

  name_of_file_stats = model_name + ";stats.txt"
  #important, use append "a"
  #The context manager closes the file even if a write raises.
  with open(name_of_file_stats, "a", encoding="utf8") as file_obj:
    file_obj.writelines(line + "\n" for line in epoch_lines)

#Read in data

In [None]:
from transformers import BertTokenizer
# import torch.utils.data
# from torch.utils.data import Dataset, DataLoader, TensorDataset
#Tokenizer loaded from the 'bert-base-uncased' checkpoint; read_file_and_strip
#uses it to encode every tweet.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

#Debug-only cap on the number of lines to read per file (for quick tests).
#The active code in this cell does not apply it, so a real run reads every line.
limit = 10 * 250000

print("Loading data.")
def read_file_and_strip(mode, filename, max_lines=None):
  """Read a line-oriented data file into a tensor.

  Args:
    mode: "sentences" to tokenize every line with the module-level BERT
      `tokenizer` (padded/truncated to maximal_number_of_words_in_tweet ids),
      or "label" to parse every line as an integer.
    filename: path of the UTF-8 text file to read.
    max_lines: optional cap on the number of lines read (for testing/debugging);
      None (default) reads the whole file.

  Returns:
    "sentences": LongTensor with one row of token ids per line.
    "label": integer tensor with one entry per line.

  Raises:
    ValueError: if `mode` is neither "sentences" nor "label". (An error string
      used to be returned instead, which only failed later when the result was
      used as a tensor.)
  """
  if mode not in ("sentences", "label"):
    raise ValueError("Unknown mode, invalid user input.")

  lines = []
  with open(filename, encoding="utf8") as file:
    for line_nr, line in enumerate(file):
      if max_lines is not None and line_nr >= max_lines:
        break
      stripped_line = line.strip()
      if mode == "sentences":
        #Use BERT embedding
        #padding="max_length" replaces the deprecated pad_to_max_length=True.
        lines.append(tokenizer.encode(stripped_line, max_length=maximal_number_of_words_in_tweet, padding="max_length", truncation=True))
      else: #no need for tags
        lines.append(stripped_line)

  if mode == "sentences":
    #Build the LongTensor directly instead of copying an intermediate LongTensor.
    return torch.tensor(lines, dtype=torch.long)
  return torch.tensor(np.array(lines).astype(int))

def read_data():
  """Load the training and validation splits of the chosen preprocessing variant.

  Files are read from PATH + "data/" + PREPROCESSING_CHOICE + "/".

  Returns:
    (train_sentences, train_labels, val_sentences, val_labels), each as returned
    by read_file_and_strip for the corresponding mode.
  """
  dataset_path = PATH + "data/" + PREPROCESSING_CHOICE + "/"

  #(mode, file name) pairs, listed in the order in which the results are returned.
  files_to_read = (
      ("sentences", "train_sentences.txt"),
      ("label", "train_labels.txt"),
      ("sentences", "val_sentences.txt"),
      ("label", "val_labels.txt"),
  )
  train_sentences, train_labels, val_sentences, val_labels = [
      read_file_and_strip(mode, dataset_path + file_name)
      for mode, file_name in files_to_read
  ]

  return train_sentences, train_labels, val_sentences, val_labels

#Read (and tokenize) the training and validation data once, up front.
train_sentences, train_labels, val_sentences, val_labels = read_data()

#Number of tweets per mini-batch, shared by both loaders.
batch_size = 64

#Dataloader
#Each loader wraps a TensorDataset pairing sentences with their labels.
#Only the training data is shuffled; validation keeps the file order.
train_loader = DataLoader(
    TensorDataset(train_sentences, train_labels),
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    TensorDataset(val_sentences, val_labels),
    batch_size=batch_size,
    shuffle=False,
)


Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Loading data.




#Init model and training

In [None]:
import time
from tqdm.auto import tqdm

#Optimisation hyperparameters.
learning_rate = 1e-5
#learning_rate_decay and momentum are both bound to 0.9;
#momentum is not passed to the Adam optimizer created below.
learning_rate_decay = 0.9
momentum = learning_rate_decay
max_epochs = 30

#Model; its BERT embedder is exposed separately because the training and
#evaluation functions take it as an explicit argument.
classifier = Grubert(device)
embedder = classifier.embedder

#Adam over every parameter of the classifier that requires gradients.
optimizer = optim.Adam(
    params=(parameter for parameter in classifier.parameters() if parameter.requires_grad),
    lr=learning_rate,
    weight_decay=0.0,
)
loss_metric = nn.CrossEntropyLoss()

#Move the model and the loss to the selected device.
classifier = classifier.to(device)
loss_metric = loss_metric.to(device)

def accuracy_eval(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Args:
        preds: 1-D tensor of predictions. Integer (or bool) tensors are taken as
            predicted class indices and compared to `y` directly; floating-point
            tensors are taken as binary logits and mapped to 0/1 with
            round(sigmoid(preds)).
        y: tensor of true labels with the same length as `preds`.

    Returns:
        0-dim float tensor: the fraction of entries where prediction and label agree.
    """
    if preds.is_floating_point():
        #round predictions to the closest integer
        predicted_classes = torch.round(torch.sigmoid(preds))
    else:
        #Class indices (e.g. the argmax of the logits) are already final. Pushing
        #them through sigmoid + round only worked for 0/1 by accident:
        #sigmoid(0) = 0.5 rounds to 0 solely because of round-half-to-even.
        predicted_classes = preds
    correct = (predicted_classes == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

def training(data_loader, classifier, loss_metric, optimizer, device, embedder):
  """Train the classifier for one pass over `data_loader`.

  Args:
    data_loader: sized iterable of batches; every batch is handed to
      prepare_embeddings together with `embedder` and `device`.
    classifier: model called on the prepared embeddings; must return a dict
      with the key "logits".
    loss_metric: callable (logits, labels) -> scalar loss tensor.
    optimizer: optimizer that is stepped once per batch.
    device: torch.device used for the embeddings, labels and logits.
    embedder: embedding model forwarded to prepare_embeddings.

  Returns:
    (average loss, average accuracy), both averaged over the number of batches.
  """
  classifier.train()

  current_loss = 0.0
  current_acc = 0.0
  #Show progress bar to estimate time/epoch; iterating through tqdm advances the
  #bar by one per batch and closes it once the loader is exhausted.
  for tweet_and_label in tqdm(data_loader):
    embeddings, labels = prepare_embeddings(tweet_and_label, embedder, device)
    prediction = classifier(embeddings)["logits"]

    loss = loss_metric(prediction.to(device), labels)

    #for stats: index of the largest logit = predicted class
    _, extracted_prediction = prediction.max(dim=1)
    acc = accuracy_eval(extracted_prediction, labels)

    optimizer.zero_grad()
    loss.backward()

    optimizer.step()

    #for stats
    current_loss += loss.item()
    current_acc += acc.item()

    #save space
    #Best-effort hook for batch types that offer clear_embeddings(). Plain
    #list/tuple batches do not, and the former bare `except: pass` silently
    #swallowed the resulting AttributeError (and any KeyboardInterrupt) per batch.
    clear_embeddings = getattr(tweet_and_label, "clear_embeddings", None)
    if callable(clear_embeddings):
      clear_embeddings()

  return current_loss / len(data_loader), current_acc / len(data_loader)

def evaluate(data_loader, classifier, loss_metric, device, embedder):
  """Evaluate the classifier on every batch of `data_loader` without training.

  Args:
    data_loader: sized iterable of batches; every batch is handed to
      prepare_embeddings together with `embedder` and `device`.
    classifier: model called on the prepared embeddings; must return a dict
      with the key "logits". It is switched to eval mode.
    loss_metric: callable (logits, labels) -> scalar loss tensor.
    device: torch.device used for the embeddings, labels and logits.
    embedder: embedding model forwarded to prepare_embeddings.

  Returns:
    (average loss, average accuracy), both averaged over the number of batches.
  """
  classifier.eval()

  current_loss = 0.0
  current_acc = 0.0

  #No parameter is updated here, so gradient tracking is switched off; otherwise
  #every forward pass through the embedder and classifier builds an autograd
  #graph that is never used.
  with torch.no_grad():
    #Iterating through tqdm advances the bar per batch and closes it at the end.
    for tweet_and_label in tqdm(data_loader):
      embeddings, labels = prepare_embeddings(tweet_and_label, embedder, device)
      prediction = classifier(embeddings)["logits"]

      loss = loss_metric(prediction.to(device), labels)

      #index of the largest logit = predicted class
      _, extracted_prediction = prediction.max(dim=1)
      acc = accuracy_eval(extracted_prediction, labels)

      current_loss += loss.item()
      current_acc += acc.item()

      #save space
      #Best-effort hook for batch types that offer clear_embeddings(). Plain
      #list/tuple batches do not, and the former bare `except: pass` silently
      #swallowed the resulting AttributeError (and any KeyboardInterrupt) per batch.
      clear_embeddings = getattr(tweet_and_label, "clear_embeddings", None)
      if callable(clear_embeddings):
        clear_embeddings()

  return current_loss / len(data_loader), current_acc / len(data_loader)

Downloading pytorch_model.bin:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#Train Model

In [None]:
# Training driver: (re)create the stats file, train for up to max_epochs epochs
# (training, learning-rate decay, validation, checkpoint and stats logging per
# epoch), then run one final validation pass and log the overall summary.

# init_save_stats opens the stats file in "w" mode, i.e. it overwrites an existing
# stats file of the same model_name. Comment this call out when resuming training
# (note from Gabriel: it was temporarily disabled for exactly that purpose).
init_save_stats(model_name)

# Index of the first epoch to run; 0 = train from scratch.
start_epoch_nr = 0

# NEW from Gabriel: resume training
# Uncomment the block below to continue from a checkpoint written by save_model.
# NOTE(review): the optimizer is not restored here (that line is commented out twice),
# so the optimizer defined earlier would presumably still reference the parameters of
# the previously constructed classifier - verify before using this path.
# model_file_path = PATH + "Grubert stats and model/raw/Grubert v.A.1.;epochnr=1 time_duraction=23720.3347864151s"
# # state = {
# #       "epoch_nr": epoch_nr,
# #       "classifier": classifier,
# #       "optimizer": optimizer
# #           }
# state = torch.load(model_file_path)
# classifier = state["classifier"]
# start_epoch_nr = state["epoch_nr"] + 1
# # optimizer = state["optimizer"]
# embedder = classifier.embedder
# classifier.eval()

# Lowest validation loss seen so far. It is only tracked: nothing below reads it
# for anything other than this comparison, and a checkpoint is written every epoch.
best_val_loss_so_far = float('inf')

time_training_start = time.time()
# best_model = None

for epoch_nr in range(start_epoch_nr, max_epochs):
  current_epoch_time_start = time.time()
  
  #train model
  avg_loss_train, avg_acc_train = training(train_loader, classifier, loss_metric, optimizer, device, embedder)  
  
  #learning-rate decay: multiply the learning rate of every parameter group by
  #learning_rate_decay after each epoch
  for param_group in optimizer.param_groups:
      param_group['lr'] = param_group['lr'] * learning_rate_decay
  
  #validate model
  avg_loss_val, avg_acc_val = evaluate(val_loader, classifier, loss_metric, device, embedder)
    
  current_epoch_time_end = time.time()

  #covers training, the learning-rate update and validation of this epoch
  elapsed_time_in_seconds = current_epoch_time_end - current_epoch_time_start

  #check for best loss so far
  if avg_loss_val < best_val_loss_so_far:
    #save best model so far
    # best_model = classifier
    #save best val loss so far
    best_val_loss_so_far = avg_loss_val
    #save_model(model_name, epoch_nr, classifier, optimizer, elapsed_time_in_seconds)

  #Save model for each epoch, regardless of avg_loss on validation set
  # NEW from Gabriel: always save the model, just to be sure it is saved
  #save model
  save_model(model_name, epoch_nr, classifier, optimizer, elapsed_time_in_seconds)

  #log stats
  save_stats(model_name, epoch_nr, elapsed_time_in_seconds, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val)

time_training_end = time.time()

#total wall-clock time of all epochs, including validation, checkpointing and logging
training_duration = time_training_end - time_training_start

#final validation
#validate model
#NOTE(review): avg_loss_train / avg_acc_train are only assigned inside the loop; if the
#loop body never runs (start_epoch_nr >= max_epochs) the final_stats_saving call below
#raises a NameError.
avg_loss_val, avg_acc_val = evaluate(val_loader, classifier, loss_metric, device, embedder)
final_stats_saving(model_name, training_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val)
#save_model(model_name, epoch_nr, classifier, optimizer, elapsed_time_in_seconds)
#final save
#final save


  0%|          | 0/31929 [00:00<?, ?it/s]

  0%|          | 0/3548 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 0 | Epoch Time: 12904.852769613266
	Train Loss: 0.314 | Train Acc: 86.40%
	 Val. Loss: 0.258 |  Val. Acc: 89.02%


  0%|          | 0/31929 [00:00<?, ?it/s]

  0%|          | 0/3548 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 1 | Epoch Time: 12932.334761857986
	Train Loss: 0.247 | Train Acc: 89.53%
	 Val. Loss: 0.253 |  Val. Acc: 89.46%


  0%|          | 0/31929 [00:00<?, ?it/s]

  0%|          | 0/3548 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 2 | Epoch Time: 12943.558448791504
	Train Loss: 0.216 | Train Acc: 91.03%
	 Val. Loss: 0.249 |  Val. Acc: 89.62%


  0%|          | 0/31929 [00:00<?, ?it/s]

  0%|          | 0/3548 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 3 | Epoch Time: 12948.334243297577
	Train Loss: 0.187 | Train Acc: 92.39%
	 Val. Loss: 0.258 |  Val. Acc: 89.57%


  0%|          | 0/31929 [00:00<?, ?it/s]

  0%|          | 0/3548 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 4 | Epoch Time: 12922.809358358383
	Train Loss: 0.160 | Train Acc: 93.54%
	 Val. Loss: 0.289 |  Val. Acc: 89.45%


  0%|          | 0/31929 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

#For prediction on the test set, please see the corresponding notebook/Python file