# Initialization: imports, configuration, and helper functions

In [1]:
#https://arxiv.org/pdf/1906.08237.pdf
#https://github.com/kcmankar/pytorch-sentiment-analysis-using-XLNet/blob/master/xlnet_sentiment_analysis.ipynb
#https://arrow.tudublin.ie/cgi/viewcontent.cgi?article=1265&context=scschcomdis

# Label of this run. It is passed to generate_submission_file, which uses it as the
# prefix of the submission CSV's file name (<model_name>_submission.csv).
model_name = "XLNET v.A.2 hyperparametertuning;epochnr=1"

#https://github.com/kcmankar/pytorch-sentiment-analysis-using-XLNet/blob/master/xlnet_sentiment_analysis.ipynb
#!pip install pytorch-transformers #don't use this for XLNetForSequenceClassification; will result in error when trying to load classifier

#remove this install when dealing with euler cluster
!pip install transformers
!pip install sentencepiece
from transformers import XLNetForSequenceClassification
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import AdamW
from transformers import XLNetTokenizerFast

import pandas as pd
import random
import numpy as np
import torch.nn as nn
import torch
# from pytorch_transformers import XLNetTokenizer, XLNetForSequenceClassification
# from pytorch_transformers import AdamW
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
import pickle
import time

#Switch between running on Google Colab with a mounted Drive (True) and running
#from the current working directory (False).
use_drive = True
#for euler, remove this entire if else branch and set PATH to "./"
if use_drive:
  #Base directory of the project files on the mounted Drive.
  PATH = "/content/drive/MyDrive/CIL 2022/"
  from google.colab import drive
  drive.mount('/content/drive')
  #IPython magics: change the working directory to the base directory and list its contents.
  %cd /content/drive/MyDrive/CIL 2022/
  !ls
else:
  #No Drive: use the current working directory as the base directory.
  PATH = "./"

#Names of the available preprocessing variants of the data set (index 0 - 8).
PREPROCESSING_OPTIONS = [
    "raw",
    "no-stemming_no-lemmatize_no-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_no-lemmatize_with-stopwords_with-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
    "no-stemming_with-lemmatize_with-stopwords_with-spellcorrect",
    "with-stemming_no-lemmatize_with-stopwords_no-spellcorrect",
    "with-stemming_with-lemmatize_no-stopwords_with-spellcorrect",
    "with-stemming_with-lemmatize_with-stopwords_no-spellcorrect",
]
#The variant in use: change the index to select another entry of PREPROCESSING_OPTIONS.
PREPROCESSING_CHOICE = PREPROCESSING_OPTIONS[0]
print("Choosing data: ", PREPROCESSING_CHOICE, sep="")

#Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Init device: ", device, sep="")

print("Init methods.")

#load data
def read_testfile(filename):
  """Read a text file and return its lines with the XLNet end tags appended.

  Each line is stripped of leading/trailing whitespace and suffixed with
  " <sep> <cls>" (XLNet needs every tweet to end in these 2 tags).

  Args:
    filename: path of the text file to read, one tweet per line.

  Returns:
    1-D numpy array of strings, one entry per line, in file order.
    An empty file yields an empty string array.
  """
  #Decode explicitly as UTF-8 so the result does not depend on the platform's locale.
  with open(filename, encoding="utf-8") as file:
    lines = [line.strip() + " <sep> <cls>" for line in file]
  #dtype=str keeps the result a string array even when there are no lines.
  return np.array(lines, dtype=str)

def generate_testdata(filename, max_len, batch_size):
  """Turn a test file into a DataLoader of fixed-length token-id batches.

  Steps: read the tweets via read_testfile, tokenize them with the pretrained
  'xlnet-base-cased' fast tokenizer, convert tokens to ids, pad/truncate every
  id sequence at its end to max_len, and wrap the result in a DataLoader.

  Args:
    filename: path of the test file handed to read_testfile.
    max_len: number of token ids per sequence after padding/truncation.
    batch_size: batch size of the returned DataLoader.

  Returns:
    DataLoader over a TensorDataset holding a single tensor of padded token ids
    (no sampler or shuffling is requested).
  """
  #tweets as strings, as returned by read_testfile
  tweets = read_testfile(filename)

  xlnet_tokenizer = XLNetTokenizerFast.from_pretrained('xlnet-base-cased', do_lower_case=True)

  #tokenize every tweet and map its tokens to vocabulary ids
  id_sequences = []
  for tweet in tweets:
    tokens = xlnet_tokenizer.tokenize(tweet)
    id_sequences.append(xlnet_tokenizer.convert_tokens_to_ids(tokens))

  #bring all sequences to length max_len; both padding and truncation act on the end
  padded_ids = pad_sequences(id_sequences, maxlen=max_len, dtype="long", truncating="post", padding="post")

  #array -> tensor -> dataset -> loader
  dataset = TensorDataset(torch.tensor(padded_ids))
  return DataLoader(dataset, batch_size = batch_size)


def evaluate_on_testset(data_loader, classifier, device):
  """Predict a class index for every sample served by data_loader.

  The classifier is switched to eval mode and run batch by batch with autograd
  disabled; the predicted class of a sample is the argmax over axis 1 of the
  first element of the classifier's output.

  Args:
    data_loader: iterable of batches; element 0 of each batch is the tensor of
      token ids that is fed to the classifier.
    classifier: callable model; classifier(ids)[0] must be a 2-D tensor of
      per-class scores.
    device: torch device the input batches are moved to.

  Returns:
    1-D integer numpy array with one predicted class index per sample, in
    loader order (empty if the loader yields no batches).
  """
  classifier.eval()
  batch_predictions = []

  #inference only: without no_grad an autograd graph would be built for every batch
  with torch.no_grad():
    #each element in data_loader is a list holding the tensor(s) of one batch
    for batch in data_loader:
      #move to device
      token_ids = batch[0].to(device)

      #make prediction
      scores = classifier(token_ids)[0]
      batch_predictions.append(np.argmax(scores.detach().cpu().numpy(), axis=1))

  #np.concatenate rejects an empty list, so handle the no-batch case explicitly
  if not batch_predictions:
    return np.empty(0, dtype=int)
  #join once at the end instead of re-allocating the result for every batch
  return np.concatenate(batch_predictions)

def generate_submission_file(final_prediction, model_name, output_dir="/content/drive/MyDrive/CIL 2022/data/test data/"):
  """Write the predictions to "<output_dir>/<model_name>_submission.csv".

  Label 0 is mapped to -1; all other labels are written unchanged. The CSV has
  the columns "Id" (1, 2, ..., number of predictions) and "Prediction".

  Args:
    final_prediction: 1-D sequence/array of predicted labels. It is not modified.
    model_name: prefix of the submission file name.
    output_dir: directory the CSV is written to; defaults to the Drive folder
      this notebook has always used.

  Returns:
    None. Prints "Submission file generated." after the file is written.
  """
  import os  #local import: the notebook's import cell does not import os

  #map label 0 to -1 on a new array, so the caller's predictions stay untouched
  labels = np.asarray(final_prediction)
  labels = np.where(labels == 0, -1, labels)

  #path to submission file
  submission_file = os.path.join(output_dir, model_name + "_submission.csv")

  # Create the pandas dataframe; ids are 1-based
  ids = np.arange(1, len(labels) + 1)
  df = pd.DataFrame({"Id": ids, "Prediction": labels}, columns=["Id", "Prediction"])

  #save submission file
  df.to_csv(submission_file, index=False)
  print("Submission file generated.")
  

def load_model(path_to_model):
  """Load a saved training state and return its classifier.

  The file is expected to hold a dict shaped like
    {"epoch_nr": epoch_nr, "classifier": classifier, "optimizer": optimizer}
  of which only the "classifier" entry is returned.

  NOTE(review): torch.load unpickles the file, so only load checkpoints from
  a trusted source.

  Args:
    path_to_model: path of the saved state file.

  Returns:
    The object stored under the "classifier" key.
  """
  #without CUDA, remap stored tensors onto the CPU; otherwise keep torch.load's default
  target = None if torch.cuda.is_available() else 'cpu'
  state = torch.load(path_to_model, map_location=target)
  return state["classifier"]



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 71.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.7 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

# Generate predictions on the test data

In [2]:
#path of the (pre-processed) test file, built from PATH so the cell also works
#when the notebook is not run from Google Drive
testset_data_path = PATH + "data/test data/test_data.txt"

#settings handed to generate_testdata
MAX_SEQ_LEN = 50  #every tweet is padded/truncated to this many token ids
BATCH_SIZE = 64

#dataloader
test_loader = generate_testdata(testset_data_path, MAX_SEQ_LEN, BATCH_SIZE)

#Load model of choice
#specify path to model of choice
path_to_model = PATH + "Saved Model States/XLNET v.A.2 - raw;fixed_token-hyperparametertuning_more_trainingdata;Piotrek;epochnr=1 time_duraction=11604.341420650482s"
classifier = load_model(path_to_model)

#generate prediction on testset
final_prediction = evaluate_on_testset(test_loader, classifier, device)
print(len(final_prediction))
#generate submission file
generate_submission_file(final_prediction, model_name)



Downloading spiece.model:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

10000
Submission file generated.
