#Init. preliminaries

In [None]:
#XLNet paper: https://arxiv.org/pdf/1906.08237.pdf

#Run identifier; used as the prefix of the checkpoint and stats file names written by this notebook
model_name = "XLNET v.A.3"

#!pip install pytorch-transformers #don't use this for XLNetForSequenceClassification; will result in error when trying to load classifier

#remove this install when dealing with euler cluster
!pip install transformers
!pip install sentencepiece
from transformers import XLNetForSequenceClassification
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW

import random
import numpy as np
import torch.nn as nn
import torch
# from pytorch_transformers import XLNetTokenizer, XLNetForSequenceClassification
# from pytorch_transformers import AdamW
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pickle
import time

#Toggle: True when the project folder lives on Google Drive and the notebook runs on Colab
use_drive = True
#for euler, remove this entire if else branch and set PATH to "./"
if use_drive:
  #Root folder that the data folder and the saved checkpoints are resolved against
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  #Make the project folder the working directory (keep in sync with PATH);
  #the stats files are opened with relative names and therefore end up here
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  PATH = "./"

#Available preprocessing variants (index 0 - 8).
#The chosen name is used as the sub-folder of the data directory to read from.
PREPROCESSING_OPTIONS = [
    "raw",
    "no-stemming_no-lemmatize_no-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_with-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_with-spellcorrect",
    "with-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "with-stemming_with-lemmatize_no-stopwords_with-spellcorrect",
    "with-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
]
#Pick exactly one entry of PREPROCESSING_OPTIONS for this run
PREPROCESSING_CHOICE = PREPROCESSING_OPTIONS[0]
print("Choosing data: ", PREPROCESSING_CHOICE, sep="")

#Run on the GPU whenever CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Init device: ", device, sep="")

#Fix seed to 42 so that runs are repeatable.
#The seeding recipe (including the two torch.backends.cudnn flags) was taken over
#from the following GitHub repository/file:
#https://github.com/ZuowenWang0000/GRUBERT-A-GRU-Based-Method-to-Fuse-BERT-Hidden-Layers-for-Twitter-sentiment-analysis/blob/master/train.py
seed = 42
print("Using seed: %d" % seed)
#Seed every random number source used in this notebook: Python, PyTorch and NumPy
for _seed_fn in (random.seed, torch.manual_seed, np.random.seed):
  _seed_fn(seed)
#Disable cudnn auto-tuning and request deterministic cudnn kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 6.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.8 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 86.8 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Un

#Load data

In [None]:
#For debugging/testing purposes only: pass limit=<n> to read_file_and_strip to load
#just the first n lines of a file, e.g. limit = 6*250000.
#For actual use keep the default (limit=None), which loads the whole file.

print("Loading data.")
def read_file_and_strip(mode, filename, limit=None):
  """
  Read a text file line by line into a numpy array of stripped strings.

  mode:     "sentences" appends " <sep> <cls>" to every line, because XLNet needs
            each tweet to end in these 2 special tokens
            (https://huggingface.co/docs/transformers/model_doc/xlnet);
            "label" returns the stripped lines unchanged (no tags needed).
  filename: path of the text file to read (decoded as UTF-8).
  limit:    optional maximum number of lines to read (debugging aid);
            None reads the whole file.

  Returns a numpy array holding one string per line that was read.
  Raises ValueError if mode is neither "sentences" nor "label".
  """
  if mode == "sentences":
    suffix = " <sep> <cls>"
  elif mode == "label":
    suffix = ""
  else:
    #fail loudly: callers expect an array and would otherwise break later on the returned value
    raise ValueError("Unknown mode, invalid user input.")

  lines = []
  #explicit encoding so the result does not depend on the locale of the machine
  with open(filename, encoding="utf8") as file:
    for line_nr, line in enumerate(file):
      if limit is not None and line_nr >= limit:
        break
      lines.append(line.strip() + suffix)
  return np.array(lines)

def read_data():
  """
  Load the training and validation split of the chosen preprocessing variant.

  Files are read from PATH + "data/" + PREPROCESSING_CHOICE + "/".
  Returns (train_sentences, train_labels, val_sentences, val_labels); the
  sentences are string arrays, the labels are cast to int.
  """
  dataset_path = PATH + "data/" + PREPROCESSING_CHOICE + "/"

  def load_split(sentences_file, labels_file):
    #sentences first, then the labels belonging to them
    sentences = read_file_and_strip("sentences", dataset_path + sentences_file)
    labels = read_file_and_strip("label", dataset_path + labels_file).astype(int)
    return sentences, labels

  loaded_train_sentences, loaded_train_labels = load_split("train_sentences.txt", "train_labels.txt")
  loaded_val_sentences, loaded_val_labels = load_split("val_sentences.txt", "val_labels.txt")

  return loaded_train_sentences, loaded_train_labels, loaded_val_sentences, loaded_val_labels

train_sentences, train_labels, val_sentences, val_labels = read_data()

Loading data.


##Tokenize

In [None]:
from transformers import XLNetTokenizer
#for euler, remove this import for TFXLNetForSequenceClassification
from transformers import TFXLNetForSequenceClassification

from transformers import XLNetTokenizerFast

print("Tokenize data.")
#Fast XLNet tokenizer loaded from the pretrained 'xlnet-base-cased' checkpoint
#NOTE(review): do_lower_case=True is combined with a cased checkpoint - confirm this is intended
tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased', do_lower_case=True)
#Split every tweet into tokens; the conversion to ids happens in the padding step
tokenized_train_sentences = list(map(tokenizer.tokenize, train_sentences))
tokenized_val_sentences = list(map(tokenizer.tokenize, val_sentences))

Tokenize data.


Downloading spiece.model:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

##Padding

In [None]:
#Choice of the maximum sequence length:
#A tweet has a character limit of 280 characters.
#Average length of an English word is 5.1 characters (https://www.wolframalpha.com/input?i=average+english+word+length)
#We rounded it to 5 characters.
#Assuming that between each word is a space, then we get 46.8333 words in a tweet.
#Round it up to 47.
#We set the maximum number of words in a tweet to 50, to allow for additional "slack" in our analysis
max_len = 50

print("Pad sequences.")

def _to_padded_ids(xlnet_tokenizer, token_lists):
  #Look up the vocabulary id of every token, then cut / fill each sequence at its END
  #so that all of them have exactly max_len entries.
  #NOTE(review): truncating at the end can drop the trailing <sep> <cls> of long tweets,
  #and pad_sequences fills with its default value - confirm both are intended for XLNet
  id_lists = [xlnet_tokenizer.convert_tokens_to_ids(token_list) for token_list in token_lists]
  return pad_sequences(id_lists, maxlen=max_len, dtype="long", truncating="post", padding="post")

#Each token list is deleted as soon as its padded ids exist, to free up space
list_of_padded_ids_tokenized_train_sentences = _to_padded_ids(tokenizer, tokenized_train_sentences)
del tokenized_train_sentences
list_of_padded_ids_tokenized_val_sentences = _to_padded_ids(tokenizer, tokenized_val_sentences)
del tokenized_val_sentences
del tokenizer

Pad sequences.


##Init Dataloader

In [None]:
print("Init dataloader.")

#from paper
batch_size = 64

def _build_loader(padded_ids, labels, shuffle):
  #Pair every padded id sequence with its label and serve the pairs in batches.
  #The intermediate tensors / dataset only live inside this function, which saves space.
  dataset = TensorDataset(torch.tensor(padded_ids), torch.tensor(labels))
  return DataLoader(dataset, batch_size = batch_size, shuffle=shuffle)

#Only the training data is shuffled; the validation data keeps its order
train_loader = _build_loader(list_of_padded_ids_tokenized_train_sentences, train_labels, shuffle=True)
val_loader = _build_loader(list_of_padded_ids_tokenized_val_sentences, val_labels, shuffle=False)

Init dataloader.


#Init Aux.

In [None]:
def save_model(model_name, epoch_nr, classifier, optimizer, time_duration):
  """
  Write a checkpoint for the given epoch below PATH.

  The checkpoint is a dict with the keys "epoch_nr", "classifier" and "optimizer".
  The classifier and optimizer objects themselves are stored, not their state_dicts.
  The file name is built from model_name, epoch_nr and time_duration; the spelling
  "time_duraction" is part of the existing file naming and is kept unchanged.
  """
  checkpoint = {
      "epoch_nr": epoch_nr,
      "classifier": classifier,
      "optimizer": optimizer,
  }
  file_path = PATH + model_name + f";epochnr={epoch_nr} time_duraction={time_duration}s"

  print("Saving model ... ", end="")
  torch.save(checkpoint, file_path)
  print("- Model saved.")

def init_save_stats(model_name):
  """
  Create the stats file "<model_name>;stats.txt" in the working directory and write
  the run header (model name, use_drive, PREPROCESSING_CHOICE) into it.

  An existing file of the same name is overwritten.
  """
  name_of_file_stats = model_name + ";stats.txt"
  #important, use (over)write "w"; the with-block closes the file even if a write fails
  with open(name_of_file_stats, "w", encoding="utf8") as file_obj:
    file_obj.write(f'Model name: {model_name}\n')
    file_obj.write(f'use_drive: {use_drive}\n')
    file_obj.write(f'PREPROCESSING_CHOICE: {PREPROCESSING_CHOICE}\n')

def final_stats_saving(model_name, training_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val):
  """
  Print the final summary of a run and append it to "<model_name>;stats.txt".

  training_duration is reported as given; losses are shown with 3 decimals and
  accuracies (fractions in [0, 1]) as percentages with 2 decimals.
  """
  #build every line once so that the console and the file can never disagree
  summary_lines = [
      f'Training Time: {training_duration}',
      f'\tTrain Loss: {avg_loss_train:.3f} | Train Acc: {avg_acc_train*100:.2f}%',
      f'\t Val. Loss: {avg_loss_val:.3f} |  Val. Acc: {avg_acc_val*100:.2f}%',
  ]
  for summary_line in summary_lines:
    print(summary_line)

  name_of_file_stats = model_name + ";stats.txt"
  #important, use append "a"; the with-block closes the file even if a write fails
  with open(name_of_file_stats, "a", encoding="utf8") as file_obj:
    file_obj.writelines(summary_line + "\n" for summary_line in summary_lines)

def save_stats(model_name, epoch_nr, epoch_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val):
  """
  Print the statistics of one epoch and append them to "<model_name>;stats.txt".

  epoch_duration is reported as given; losses are shown with 3 decimals and
  accuracies (fractions in [0, 1]) as percentages with 2 decimals.
  """
  #build every line once so that the console and the file can never disagree
  epoch_lines = [
      f'Epoch: {epoch_nr} | Epoch Time: {epoch_duration}',
      f'\tTrain Loss: {avg_loss_train:.3f} | Train Acc: {avg_acc_train*100:.2f}%',
      f'\t Val. Loss: {avg_loss_val:.3f} |  Val. Acc: {avg_acc_val*100:.2f}%',
  ]
  for epoch_line in epoch_lines:
    print(epoch_line)

  name_of_file_stats = model_name + ";stats.txt"
  #important, use append "a"; the with-block closes the file even if a write fails
  with open(name_of_file_stats, "a", encoding="utf8") as file_obj:
    file_obj.writelines(epoch_line + "\n" for epoch_line in epoch_lines)

#Init model and training

##Init training

In [None]:
def accuracy_eval(preds, y):
    """
    Fraction of correct predictions in a batch, e.g. 8 out of 10 right gives 0.8, NOT 8.

    preds is passed through a sigmoid and rounded to the closest integer before it is
    compared element-wise with y. The result is a tensor holding the fraction.
    """
    hard_preds = torch.sigmoid(preds).round()
    #booleans -> floats, so that they can be summed up and divided
    hits = (hard_preds == y).float()
    return hits.sum() / len(hits)

from tqdm.auto import tqdm

def training(data_loader, classifier, loss_metric, optimizer, device):
  """
  Train the classifier for one epoch.

  data_loader: yields (batch of token ids, batch of labels); must support len().
  classifier:  called with the id batch; the first element of its output is used
               as the class logits (one column per class).
  loss_metric: callable taking (logits, labels) and returning the loss tensor.
  optimizer:   optimizer holding the parameters of the classifier.
  device:      device the batches are moved to before the forward pass.

  Returns (average loss per batch, average accuracy per batch).
  """
  classifier.train()

  current_loss = 0.0
  current_acc = 0.0

  #Show progress bar to estimate time/epoch. tqdm wraps the loader, so the bar
  #advances once per batch and is closed when the epoch is over.
  #each element in data_loader is a list of 2 tensors, implicitly unwrapped here
  for list_of_ids_of_tokenized_tweet, label in tqdm(data_loader):
    #move to device
    list_of_ids_of_tokenized_tweet = list_of_ids_of_tokenized_tweet.to(device)
    label = label.to(device)

    optimizer.zero_grad()

    #NOTE(review): only the ids are passed, no attention mask - confirm that
    #attending to the padded positions is acceptable
    predicted_label = classifier(list_of_ids_of_tokenized_tweet)[0]

    loss = loss_metric(predicted_label, label)

    loss.backward()
    optimizer.step()

    #predicted class = index of the largest logit; compared on the device,
    #which also works for more than 2 classes
    batch_acc = (predicted_label.detach().argmax(dim=1) == label).float().mean()

    current_loss += loss.item()
    current_acc += batch_acc.item()

  return current_loss / len(data_loader), current_acc / len(data_loader)

def evaluate(data_loader, classifier, loss_metric, device):
  """
  Evaluate the classifier on every batch of data_loader without updating it.

  data_loader: yields (batch of token ids, batch of labels); must support len().
  classifier:  called with the id batch; the first element of its output is used
               as the class logits (one column per class).
  loss_metric: callable taking (logits, labels) and returning the loss tensor.
  device:      device the batches are moved to before the forward pass.

  Returns (average loss per batch, average accuracy per batch).
  """
  classifier.eval()

  current_loss = 0.0
  current_acc = 0.0

  #no gradients are needed for validation; switching them off saves memory and time
  with torch.no_grad():
    #each element in data_loader is a list of 2 tensors, implicitly unwrapped here
    for list_of_ids_of_tokenized_tweet, label in data_loader:
      #move to device
      list_of_ids_of_tokenized_tweet = list_of_ids_of_tokenized_tweet.to(device)
      label = label.to(device)

      predicted_label = classifier(list_of_ids_of_tokenized_tweet)[0]

      loss = loss_metric(predicted_label, label)

      #predicted class = index of the largest logit; compared on the device,
      #which also works for more than 2 classes
      batch_acc = (predicted_label.argmax(dim=1) == label).float().mean()

      current_loss += loss.item()
      current_acc += batch_acc.item()

  return current_loss / len(data_loader), current_acc / len(data_loader)

#due to time constraints, we agreed to train the model for 4 epochs; can be changed by user if needed
max_epochs = 4


print("Init. classifier model")
#Binary classification
classifier = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)
#Move the model to the device selected at the top of the notebook (works on CPU-only
#machines as well) and do so before the optimizer collects the parameters
classifier = classifier.to(device)
print(classifier)

#Hyper parameter tuning and optimizer from 
#https://colab.research.google.com/drive/16gx06PVffJwS4pRhysCmc5qbPm26vsY8#scrollTo=QxSMw0FrptiL
#(same author: http://mccormickml.com/2019/09/19/XLNet-fine-tuning/)
param_optimizer = list(classifier.named_parameters())
#Parameters whose name contains one of these entries are excluded from weight decay.
#The layer norm modules of this model are called "layer_norm" (see the printed model).
no_decay = ['bias', 'layer_norm.weight']
#The per-group option must be called 'weight_decay' to be picked up by AdamW;
#a 'weight_decay_rate' entry is not read by the optimizer, so no decay would be applied
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)# We pass model parameters

#Two output logits per tweet and integer class labels -> cross entropy
loss_metric = nn.CrossEntropyLoss()
loss_metric = loss_metric.to(device)



Init. classifier model


Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight', 'sequence_summary.summary.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (activation_function): GELUActivation()
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward



##Training

In [None]:
#Start a fresh stats file for this run
init_save_stats(model_name)
best_val_loss_so_far = float('inf')
time_training_start = time.time()

for epoch_nr in range(max_epochs):
  current_epoch_time_start = time.time()

  #one pass over the training data, followed by one pass over the validation data
  avg_loss_train, avg_acc_train = training(train_loader, classifier, loss_metric, optimizer, device)
  avg_loss_val, avg_acc_val = evaluate(val_loader, classifier, loss_metric, device)

  #the epoch time covers training and validation
  current_epoch_time_end = time.time()
  elapsed_time_in_seconds = current_epoch_time_end - current_epoch_time_start

  #keep track of the lowest validation loss seen so far
  best_val_loss_so_far = min(best_val_loss_so_far, avg_loss_val)

  #a checkpoint is written after every epoch, not only after the best one
  save_model(model_name, epoch_nr, classifier, optimizer, elapsed_time_in_seconds)
  #log stats
  save_stats(model_name, epoch_nr, elapsed_time_in_seconds, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val)

time_training_end = time.time()
training_duration = time_training_end - time_training_start

#final validation of the model as it is after the last epoch;
#the training numbers reported alongside are those of the last epoch
avg_loss_val, avg_acc_val = evaluate(val_loader, classifier, loss_metric, device)
final_stats_saving(model_name, training_duration, avg_loss_train, avg_acc_train, avg_loss_val, avg_acc_val)
#final save

  0%|          | 0/31929 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 0 | Epoch Time: 11642.696839094162
	Train Loss: 0.285 | Train Acc: 87.65%
	 Val. Loss: 0.256 |  Val. Acc: 89.18%


  0%|          | 0/31929 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 1 | Epoch Time: 11601.185776472092
	Train Loss: 0.242 | Train Acc: 89.80%
	 Val. Loss: 0.248 |  Val. Acc: 89.51%


  0%|          | 0/31929 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 2 | Epoch Time: 11598.598737716675
	Train Loss: 0.215 | Train Acc: 91.07%
	 Val. Loss: 0.256 |  Val. Acc: 89.66%


  0%|          | 0/31929 [00:00<?, ?it/s]

Saving model ... - Model saved.
Epoch: 3 | Epoch Time: 11624.05092549324
	Train Loss: 0.192 | Train Acc: 92.15%
	 Val. Loss: 0.258 |  Val. Acc: 89.68%
Training Time: 46486.72888445854
	Train Loss: 0.192 | Train Acc: 92.15%
	 Val. Loss: 0.258 |  Val. Acc: 89.68%


#For prediction on the test set, see the corresponding notebook / Python file