<a href="https://colab.research.google.com/github/JonasVerbickas/GYMTABLER/blob/master/Final_NLP_CW2_Task_2_(torch_RNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [460]:
import nltk
import os
import re
import pandas as pd
import collections
import random
import torch
from torchsummary import summary
from torchtext.vocab import build_vocab_from_iterator
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from statistics import mean
from sklearn.model_selection import KFold

# 1. Preprocessing/Cleaning

In [461]:
USE_STEMMING_INSTEAD_OF_LEMMATIZATION = False

In [462]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
if USE_STEMMING_INSTEAD_OF_LEMMATIZATION:
  token_processing_fn = nltk.PorterStemmer().stem 
else:
  token_processing_fn = nltk.stem.WordNetLemmatizer().lemmatize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [463]:
def specialIpodProcessing(path_to_dataset, all_other_reviews_as_string):
  """
  Inserts [t] tags into text from 'ipod.txt'
  At an average review length taken from all other reviews
  """
  # calculate the average review length
  review_lengths = []
  curr_review_length = 0
  for line in all_other_reviews_as_string.split('\n'):
    if line == '[t]':
      if curr_review_length > 0:
        review_lengths.append(curr_review_length)
      curr_review_length = 0
    else:
      curr_review_length += 1
  average_len = int(mean(review_lengths))
  print("average_len used for ipod.txt =", average_len)
  # insert [t] every after each subset of `average_len` sentences
  curr_review_length = 0
  with open(os.path.join(path_to_dataset, 'ipod.txt')) as f:
    for line in f.readlines():
      if curr_review_length > average_len:
        all_other_reviews_as_string += "\n[t]"
        curr_review_length = 0
      all_other_reviews_as_string += line
      curr_review_length += 1
  return all_other_reviews_as_string

In [464]:
def create_concat_text_content_string(path_to_dataset):
  """
  Iterate through all of txt files in a given directory
  Concatenate the contents of each .txt file to a large string
  (Special processing for ipod.txt is applied)
  """
  file_contents = ""
  for filename in os.listdir(path_to_dataset):
    # skip readme
    if filename in ['README.txt', 'ipod.txt']:
      continue
    # append contents of other files to file_contents string
    starting_corpus_size = len(file_contents)
    with open(os.path.join(path_to_dataset, filename)) as f:
      file_contents += f.read()
    print("After appending", filename, "corpus sized increased to", starting_corpus_size, "->", len(file_contents))
  ## special processing for ipod.txt
  starting_corpus_size = len(file_contents)
  file_contents = specialIpodProcessing(path_to_dataset, file_contents)
  print("After appending ipod corpus sized increased to", starting_corpus_size, "->", len(file_contents))
  return file_contents

In [465]:
def removeAnnotationSymbols(sentence):
  """
  Remove all tags that aren't part of the original text
  and should only be used for processing
  """
  static_removals = ['[t]', '[u]','[p]','[s]','[cc]','[cs]']
  for symbol in static_removals:
    sentence = sentence.replace(symbol, '')
  return sentence

In [466]:
def calculateSentement(sentiment_tags):
  """
  Finds all positive/negative numbers that are within []
  Returns their total sum
  """
  semantic_tokens = re.findall('\[([+-]\d)\]', sentiment_tags)
  semantic_sum = 0
  for token in semantic_tokens:
    semantic_sum += int(token)
  return semantic_sum

In [467]:
def process_string_of_reviews_into_df(string_of_reviews):
  """
  1. Apply casefolding
  2. Expand contractions
  3. Tokenize sentances
  4. Ignore stop-words
  5. Ignore tokens that don't contain any alphanumeric characters
  6. Ignore sentiment analysis tokens e.g. `+3`
  """
  list_of_reviews = string_of_reviews.split("[t]\n")
  tokenized_sentence_sentiment_list = []
  for review in list_of_reviews:
    sentiment = 0
    processed_review = []
    review_split_into_lines = review.split('\n')
    for line in review_split_into_lines:
      sentiment_tags, _, sentence = line.partition("##")
      sentiment += calculateSentement(sentiment_tags)
      # I use `x` here instead of proper variable names
      # because it allows me switch the order of statements without having to rename variables
      # (This is very convinient when testing; and in my opinion makes the code more readable)
      x = removeAnnotationSymbols(sentence)
      x = x.casefold()
      x = nltk.tokenize.word_tokenize(x)
      x = [token_processing_fn(token) for token in x]
      processed_review += x
    tokenized_sentence_sentiment_list.append([review, processed_review, sentiment])
  df = pd.DataFrame(tokenized_sentence_sentiment_list, columns=['original review','processed review','sentiment'])
  return df


In [468]:
class ReviewDataset(torch.utils.data.Dataset):
  """
  Creates a dataframe with [review, sentiment_score]
  Builds a vocabulary to map sentences into vectors
  Whenever __getitem__ is called it returns (vectorized_sent, sentiment)
  """
  def __init__(self, path_to_review_folder):
        string_of_reviews = create_concat_text_content_string(path_to_review_folder)
        self.review_df = process_string_of_reviews_into_df(string_of_reviews)
        set_of_all_tokens = set([token for review in self.review_df['processed review'] for token in review])
        self.vocab = build_vocab_from_iterator([set_of_all_tokens], max_tokens=len(set_of_all_tokens), specials=["<unk>"])
        print(f"{len(self.vocab)=}")
        self.vocab.set_default_index(self.vocab["<unk>"])
        
  def __len__(self):
    return len(self.review_df)

  def __getitem__(self, idx):
    review = self.review_df.iloc[idx]
    return self.vocab(list(review["processed review"])), review['sentiment']


In [469]:
PATH_TO_REVIEWS = "/content/drive/MyDrive/Colab Notebooks/product_reviews"
dataset = ReviewDataset(PATH_TO_REVIEWS)
"Example of a data sample", dataset[1]

After appending Nokia_6600.txt corpus sized increased to 0 -> 56093
After appending norton.txt corpus sized increased to 56093 -> 95013
After appending Linksys_Router.txt corpus sized increased to 95013 -> 151947
After appending MicroMP3.txt corpus sized increased to 151947 -> 259727
After appending Diaper_Champ.txt corpus sized increased to 259727 -> 294831
After appending Hitachi_router.txt corpus sized increased to 294831 -> 325078
After appending Canon_S100.txt corpus sized increased to 325078 -> 353887
After appending Canon_PowerShot_SD500.txt corpus sized increased to 353887 -> 378520
average_len used for ipod.txt = 10
After appending ipod corpus sized increased to 378520 -> 436758
len(self.vocab)=6006


('Example of a data sample',
 ([2722,
   31,
   2524,
   5345,
   862,
   2309,
   3541,
   173,
   3450,
   3654,
   619,
   2722,
   5494,
   3207,
   2939,
   68,
   5316,
   3728,
   1766,
   4923,
   2182,
   2520,
   871,
   852,
   3115,
   68,
   3399,
   2426,
   4460,
   427,
   1186,
   2060,
   299,
   2696,
   3755,
   4923,
   619,
   2722,
   1810,
   3511,
   4307,
   5251,
   3718,
   2939,
   5314,
   3487,
   68,
   5856,
   2928,
   5345,
   3950,
   4923,
   2491,
   420,
   4834,
   68,
   2939,
   2520,
   2939,
   576,
   68,
   5345,
   2928,
   5316,
   2250,
   3950,
   2722,
   3027,
   3696,
   5314,
   2520,
   5316,
   3055,
   1011,
   4653,
   48,
   2835,
   48,
   948,
   48,
   5649,
   1399,
   619,
   427,
   4990,
   68,
   2744,
   5981,
   695,
   3190,
   2309,
   617,
   5627,
   48,
   5345,
   2928,
   1642,
   5316,
   3950,
   2309,
   5981,
   68],
  8))

# 2. Implementation of the Classifier

In [483]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, num_class, drop_out=0.2):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, dropout=drop_out,
                          num_layers=num_layers, bidirectional=True)
        # 2x because bidirectional RNN concats its outputs
        self.fc = nn.Linear(2*hidden_dim, num_class) 
        self.dropout = nn.Dropout(drop_out)

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        embedded = self.dropout(embedded)
        y, hidden = self.rnn(embedded)
        y = self.dropout(y)
        y = self.fc(y)
        return y

In [484]:
def collate_batch(batch):
  """
  This function allows for different inputs of different length to used in a single batch
  
  """
  label_list, text_list, offsets = [], [], [0]
  for (_text, _label) in batch:
        label_list.append(_label)
        processed_text = torch.tensor(_text, dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
  label_list = torch.tensor(label_list, dtype=torch.float32).reshape(-1, 1)
  offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
  text_list = torch.cat(text_list)
  return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

# 3. Experiments to Evaluate the Classifier

## Define logic required for training

In [485]:
# DEFINE HYPERPARAMS
EPOCHS = 10 
LR = 1e-2 
K_FOLDS = 5
THRESHOLD = 0 # threshold used to classify a review as positive/negative
BATCH_SIZE = 1
TRAIN_PERCENTAGE = 0.8
EMSIZE = 100 # embedding size used by the model
HIDDEN_SIZE = 32 # size of the hidden layer
NUM_LAYERS = 1
DROP_OUT = 0.4
vocab_size = len(dataset.vocab)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

In [486]:
model = TextClassificationModel(vocab_size, EMSIZE,
                                HIDDEN_SIZE, NUM_LAYERS, num_class=1,
                                drop_out=DROP_OUT).to(DEVICE)
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 609,241 trainable parameters


In [487]:
# Define the training loop
def train(dataloader):
    correct_preds, total_preds, total_loss = 0, 0, 0
    model.train()
    for idx, (label, text, offsets) in tqdm(enumerate(dataloader), total=len(dataloader)):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        # prevent gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_loss += loss.item()
        # Threshold the predictions
        correct_preds += ((predicted_label>THRESHOLD).float() == (label>THRESHOLD).float()).sum().item()
        total_preds += label.size(0)
    accuracy = correct_preds/total_preds
    average_loss = total_loss/total_preds
    return accuracy, average_loss

In [488]:
# Define the validation loop
def evaluate(dataloader):
    model.eval()
    correct_preds, total_preds, total_loss = 0, 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_loss += loss.item()
            correct_preds += ((predicted_label>THRESHOLD).float() == (label>THRESHOLD).float()).sum().item()
            total_preds += label.size(0)
    accuracy = correct_preds/total_preds
    average_loss = total_loss/total_preds
    return accuracy, average_loss

## Start training

In [489]:
# Split dataset
train_val_size = int(TRAIN_PERCENTAGE*(len(dataset)))
test_size = len(dataset) - train_val_size
train_val_set, test_set = torch.utils.data.random_split(dataset, [train_val_size, test_size])
kfold = KFold(n_splits=K_FOLDS)

In [490]:
test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=BATCH_SIZE, collate_fn=collate_batch)

In [492]:
fold_val_accr = []
fold_train_accr = []
fold_test_accr = []
# Start training
for fold, (train_ids, valid_ids) in enumerate(kfold.split(train_val_set)):
  print(f"\n\n!!========== FOLD #{fold} ============!!\n")
  # Create the dataloaders
  train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
  valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
  train_dataloader = torch.utils.data.DataLoader(train_val_set, batch_size=BATCH_SIZE, collate_fn=collate_batch, sampler=train_subsampler)
  val_dataloader = torch.utils.data.DataLoader(train_val_set, batch_size=BATCH_SIZE, collate_fn=collate_batch, sampler=valid_subsampler)
  # Instantiate the model
  model = TextClassificationModel(vocab_size, EMSIZE,
                                  HIDDEN_SIZE, NUM_LAYERS, num_class=1,
                                  drop_out=DROP_OUT).to(DEVICE)
  # Instantiate optimization objects
  criterion = torch.nn.L1Loss()
  optimizer = torch.optim.Adam(model.parameters(), lr=LR)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  # Keep track of losses throughout the epoch
  higest_val_accuracy = 0
  train_losses = []
  train_accrs = []
  val_losses = []
  val_accrs = []
  for epoch in range(1, EPOCHS + 1):
      train_acc, train_loss = train(train_dataloader)
      val_acc, val_loss = evaluate(val_dataloader)
      # if has accuracy is lower than in prev epoch - reduce the learning rate
      if higest_val_accuracy > val_acc:
        scheduler.step()
      highest_val_accuracy = max(higest_val_accuracy, val_acc)
      print('-' * 80)
      print(f'end of {epoch=} | {train_acc=:.4f} | {train_loss=:.6f} | {val_acc=:.4f} ')
      print('-' * 80)
      train_losses.append(train_loss)
      train_accrs.append(train_acc)
      val_accrs.append(val_acc)
      val_losses.append(val_loss)
  # append the last and (hopefully) highest accuracies 
  fold_val_accr.append(val_accrs[-1])
  fold_train_accr.append(train_accrs[-1])
  test_acc, test_loss = evaluate(test_dataloader)
  print("End of Fold | test_acc", test_acc)
  fold_test_accr.append(test_acc)






  0%|          | 0/207 [00:00<?, ?it/s]

TypeError: ignored

## Examine the results

### I look at how the training went throughout the last epoch
This allows me to see whether the model overfits

In [None]:
def plotStatThroughoutEpochs(label, train_stat, validation_stat):
  plt.plot(train_stat, label=f'Training {label}')
  plt.plot(validation_stat, label=f'Validation {label}')
  plt.xticks(range(len(train_stat)))
  plt.title(f'{label} over time')
  plt.xlabel('Epoch')
  plt.ylabel(f'{label}')
  plt.legend()
  plt.show()

In [None]:
plotStatThroughoutEpochs("Loss", train_losses, val_losses)

In [None]:
plotStatThroughoutEpochs("Accuracy", train_accrs, val_accrs)

### Examine results from different folds
Ideally we want to see low variance here

In [None]:
plt.title("Final epoch validation accuracies for each fold")
plt.xlabel("k-Folds")
plt.ylabel("Accuracy in the last epoch")
plt.bar(range(K_FOLDS), fold_val_accr)
plt.show()

### <mark>Test against custom input</mark>