<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/NLP_CW2_Task_2_(torch_RNN).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
import os
import re
import pandas as pd
import collections
import random
import torch
from torchsummary import summary
from torchtext.vocab import build_vocab_from_iterator
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from statistics import mean

# 1. Preprocessing/Cleaning

In [2]:
USE_STEMMING_INSTEAD_OF_LEMMATIZATION = False

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
if USE_STEMMING_INSTEAD_OF_LEMMATIZATION:
  token_processing_fn = nltk.PorterStemmer().stem 
else:
  token_processing_fn = nltk.stem.WordNetLemmatizer().lemmatize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
def create_concat_text_content_string(path_to_dataset):
  file_contents = ""
  for filename in os.listdir(path_to_dataset):
    # skip readme
    if filename in ['README.txt', 'ipod']:
      continue
    # append contents of other files to file_contents string
    starting_corpus_size = len(file_contents)
    with open(os.path.join(path_to_dataset, filename)) as f:
      file_contents += f.read()
    print("After appending", filename, "corpus sized increased to", starting_corpus_size, "->", len(file_contents))
  return file_contents

In [5]:
def removeAnnotationSymbols(sentence):
  static_removals = ['[t]', '[u]','[p]','[s]','[cc]','[cs]']
  for symbol in static_removals:
    sentence = sentence.replace(symbol, '')
  return sentence

In [6]:
def calculateSentement(unprocessed_sentence):
  semantic_tokens = re.findall('\[([+-]\d)\]', unprocessed_sentence)
  semantic_sum = 0
  for token in semantic_tokens:
    semantic_sum += int(token)
  return semantic_sum

In [7]:
def process_string_of_reviews_into_df(string_of_reviews):
  """
  1. Apply casefolding
  2. Expand contractions
  3. Tokenize sentances
  4. Ignore stop-words
  5. Ignore tokens that don't contain any alphanumeric characters
  6. Ignore sentiment analysis tokens e.g. `+3`
  """
  list_of_reviews = string_of_reviews.split("[t]\n")
  tokenized_sentence_sentiment_list = []
  for review in list_of_reviews:
    sentiment = 0
    processed_review = []
    review_split_into_lines = review.split('\n')
    for line in review_split_into_lines:
      sentiment_tags, _, sentence = line.partition("##")
      sentiment += calculateSentement(sentiment_tags)
      x = removeAnnotationSymbols(sentence)
      x = x.casefold()
      x = nltk.tokenize.word_tokenize(x)
      processed_review += x
    tokenized_sentence_sentiment_list.append([review, processed_review, sentiment])
  df = pd.DataFrame(tokenized_sentence_sentiment_list, columns=['original review','processed review','sentiment'])
  return df


In [8]:
class ReviewDataset(torch.utils.data.Dataset):
  def __init__(self, path_to_review_folder):
        string_of_reviews = create_concat_text_content_string(path_to_review_folder)
        self.review_df = process_string_of_reviews_into_df(string_of_reviews)
        set_of_all_tokens = set([token for review in self.review_df['processed review'] for token in review])
        print(f"{len(set_of_all_tokens)=}")
        print(f"{len(set_of_all_tokens)=}")
        self.vocab = build_vocab_from_iterator([set_of_all_tokens], max_tokens=len(set_of_all_tokens), specials=["<unk>"])
        print(f"{len(self.vocab)=}")
        self.vocab.set_default_index(self.vocab["<unk>"])
        
  def __len__(self):
    return len(self.review_df)

  def __getitem__(self, idx):
    review = self.review_df.iloc[idx]
    return self.vocab(list(review["processed review"])), review['sentiment']


In [9]:
PATH_TO_REVIEWS = "/content/drive/MyDrive/Colab Notebooks/product_reviews"
dataset = ReviewDataset(PATH_TO_REVIEWS)
dataset[6]

After appending Nokia_6600.txt corpus sized increased to 0 -> 56093
After appending norton.txt corpus sized increased to 56093 -> 95013
After appending Linksys_Router.txt corpus sized increased to 95013 -> 151947
After appending MicroMP3.txt corpus sized increased to 151947 -> 259727
After appending Diaper_Champ.txt corpus sized increased to 259727 -> 294831
After appending Hitachi_router.txt corpus sized increased to 294831 -> 325078
After appending Canon_S100.txt corpus sized increased to 325078 -> 353887
After appending Canon_PowerShot_SD500.txt corpus sized increased to 353887 -> 378520
After appending ipod.txt corpus sized increased to 378520 -> 436566
len(set_of_all_tokens)=6569
len(set_of_all_tokens)=6569
len(self.vocab)=6569


([1947,
  3830,
  2633,
  3621,
  6512,
  68,
  2956,
  3492,
  2599,
  48,
  5774,
  638,
  6381,
  2092,
  2181,
  6542,
  5869,
  3826,
  6344,
  68,
  2956,
  31,
  2677,
  2562,
  604,
  2217,
  3933,
  4302,
  5292,
  5840,
  6375,
  4059,
  3293,
  638,
  4074,
  3014,
  2211,
  68,
  3014,
  5855,
  1723,
  2956,
  31,
  2677,
  2562,
  4043,
  48,
  3784,
  638,
  4547,
  34,
  496,
  4800,
  35,
  5903,
  5424,
  2192,
  28,
  5730,
  68,
  3980,
  2956,
  23,
  835,
  5903,
  3933,
  5903,
  6025,
  5824,
  364,
  3745,
  68],
 0)

# 2. Implementation of the Classifier

In [10]:
from torch import nn

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.rnn = nn.RNN(embed_dim, 256)
        self.fc = nn.Linear(256, num_class)
        self.dropout = nn.Dropout(0.2)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        embedded = self.dropout(embedded)
        y, hidden = self.rnn(embedded)
        y = self.fc(y)
        return self.sigmoid(y)

In [11]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(_text, dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = (torch.tensor(label_list, dtype=torch.int64)>0).float().reshape(-1, 1)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

# 3. Experiments to Evaluate the Classifier

## Define logic required for training

In [12]:
# DEFINE HYPERPARAMS
EPOCHS = 10
LR = 1e-3 
K_FOLDS = 5
BATCH_SIZE = 1 
TRAIN_PERCENTAGE = 0.8
EMSIZE = 100 # embedding size used by the model
vocab_size = len(dataset.vocab)
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
DEVICE

device(type='cuda')

In [13]:
# Define the training loop
def train(dataloader):
    correct_preds, total_preds, total_loss = 0, 0, 0
    model.train()
    for idx, (label, text, offsets) in tqdm(enumerate(dataloader), total=len(dataloader)):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) # prevent gradient explosion
        optimizer.step()
        total_loss += loss.item()
        correct_preds += ((predicted_label>0.5).float() == label).sum().item()
        total_preds += label.size(0)
    accuracy = correct_preds/total_preds
    average_loss = total_loss/total_preds
    return accuracy, average_loss

In [14]:
# Define the validation loop
def evaluate(dataloader):
    model.eval()
    correct_preds, total_preds, total_loss = 0, 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_loss += loss.item()
            correct_preds += ((predicted_label>0.5).float() == label).sum().item()
            total_preds += label.size(0)
    accuracy = correct_preds/total_preds
    average_loss = total_loss/total_preds
    return accuracy, average_loss

## Start training

In [15]:
# Split dataset
train_val_size = int(TRAIN_PERCENTAGE*(len(dataset)))
test_size = len(dataset) - train_val_size
train_val_set, test_set = torch.utils.data.random_split(dataset, [train_val_size, test_size])
kfold = KFold(n_splits=K_FOLDS)

In [None]:
fold_val_accr_averages = []
fold_train_accr_averages = []
# Start training
for fold, (train_ids, valid_ids) in enumerate(kfold.split(train_val_set)):
  print(f"\n\n!!========== FOLD #{fold} ============!!\n")
  # Create the dataloaders
  train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
  valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
  train_dataloader = torch.utils.data.DataLoader(train_val_set, batch_size=BATCH_SIZE, collate_fn=collate_batch, sampler=train_subsampler)
  val_dataloader = torch.utils.data.DataLoader(train_val_set, batch_size=BATCH_SIZE, collate_fn=collate_batch, sampler=valid_subsampler)
  # Instantiate the model
  model = TextClassificationModel(vocab_size, EMSIZE, 1).to(DEVICE)
  # Instantiate optimization objects
  criterion = torch.nn.BCELoss()
  optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.1)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
  # Keep track of losses throughout the epoch
  higest_val_accuracy = 0
  train_losses = []
  train_accrs = []
  val_losses = []
  val_accrs = []
  for epoch in range(1, EPOCHS + 1):
      train_acc, train_loss = train(train_dataloader)
      val_acc, val_loss = evaluate(val_dataloader)
      if higest_val_accuracy > val_acc:
        scheduler.step()
      highest_val_accuracy = max(higest_val_accuracy, val_acc)
      print('-' * 80)
      print(f'end of {epoch=} | {train_acc=:.4f} | {train_loss=:.6f} | {val_acc=:.4f} ')
      print('-' * 80)
      train_losses.append(train_loss)
      train_accrs.append(train_acc)
      val_accrs.append(val_acc)
      val_losses.append(val_loss)
  fold_val_accr_averages.append(mean(val_accrs))
  fold_train_accr_averages.append(mean(train_accrs))






  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=1 | train_acc=0.6377 | train_loss=1.619691 | val_acc=0.5962 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=2 | train_acc=0.6473 | train_loss=1.381382 | val_acc=0.5962 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=3 | train_acc=0.7536 | train_loss=0.583355 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=4 | train_acc=0.9614 | train_loss=0.080757 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=5 | train_acc=1.0000 | train_loss=0.007445 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=6 | train_acc=1.0000 | train_loss=0.003372 | val_acc=0.6346 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=7 | train_acc=1.0000 | train_loss=0.000514 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=8 | train_acc=1.0000 | train_loss=0.000239 | val_acc=0.6538 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=9 | train_acc=1.0000 | train_loss=0.000149 | val_acc=0.6538 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=10 | train_acc=1.0000 | train_loss=0.000164 | val_acc=0.6346 
--------------------------------------------------------------------------------





  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=1 | train_acc=0.6184 | train_loss=1.186334 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=2 | train_acc=0.6377 | train_loss=1.370053 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=3 | train_acc=0.6908 | train_loss=0.709373 | val_acc=0.7500 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=4 | train_acc=0.9614 | train_loss=0.093535 | val_acc=0.6731 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=5 | train_acc=0.9952 | train_loss=0.025283 | val_acc=0.7308 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=6 | train_acc=1.0000 | train_loss=0.002149 | val_acc=0.7115 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=7 | train_acc=1.0000 | train_loss=0.000609 | val_acc=0.7308 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=8 | train_acc=1.0000 | train_loss=0.000339 | val_acc=0.7115 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=9 | train_acc=1.0000 | train_loss=0.000313 | val_acc=0.6538 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=10 | train_acc=1.0000 | train_loss=0.000227 | val_acc=0.7308 
--------------------------------------------------------------------------------





  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=1 | train_acc=0.6377 | train_loss=1.500141 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=2 | train_acc=0.6377 | train_loss=1.321290 | val_acc=0.6154 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=3 | train_acc=0.7778 | train_loss=0.547656 | val_acc=0.7692 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=4 | train_acc=0.9517 | train_loss=0.110301 | val_acc=0.7885 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=5 | train_acc=0.9952 | train_loss=0.015089 | val_acc=0.7885 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=6 | train_acc=1.0000 | train_loss=0.000873 | val_acc=0.7885 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=7 | train_acc=1.0000 | train_loss=0.000495 | val_acc=0.7692 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=8 | train_acc=1.0000 | train_loss=0.000565 | val_acc=0.7885 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=9 | train_acc=1.0000 | train_loss=0.000127 | val_acc=0.7692 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=10 | train_acc=1.0000 | train_loss=0.000164 | val_acc=0.7692 
--------------------------------------------------------------------------------





  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=1 | train_acc=0.6377 | train_loss=1.555259 | val_acc=0.5769 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=2 | train_acc=0.6570 | train_loss=1.284258 | val_acc=0.5769 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=3 | train_acc=0.7633 | train_loss=0.598342 | val_acc=0.6346 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=4 | train_acc=0.9324 | train_loss=0.201366 | val_acc=0.6346 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=5 | train_acc=0.9807 | train_loss=0.037289 | val_acc=0.6731 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=6 | train_acc=0.9952 | train_loss=0.025371 | val_acc=0.7500 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=7 | train_acc=1.0000 | train_loss=0.001917 | val_acc=0.7500 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=8 | train_acc=1.0000 | train_loss=0.000309 | val_acc=0.7500 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=9 | train_acc=1.0000 | train_loss=0.000380 | val_acc=0.7115 
--------------------------------------------------------------------------------


  0%|          | 0/207 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=10 | train_acc=1.0000 | train_loss=0.000118 | val_acc=0.7308 
--------------------------------------------------------------------------------





  0%|          | 0/208 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=1 | train_acc=0.5962 | train_loss=1.670786 | val_acc=0.7647 
--------------------------------------------------------------------------------


  0%|          | 0/208 [00:00<?, ?it/s]

--------------------------------------------------------------------------------
end of epoch=2 | train_acc=0.6250 | train_loss=1.392307 | val_acc=0.7451 
--------------------------------------------------------------------------------


  0%|          | 0/208 [00:00<?, ?it/s]

## Examine the results

### I look at how the training went throughout the last epoch
This allows me to see whether the model overfits

In [None]:
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xticks(range(EPOCHS))
plt.title('Loss over time')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
plt.plot(train_accrs, label='Training accuracy')
plt.plot(val_accrs, label='Validation accuracy')
plt.xticks(range(EPOCHS))
plt.title('Accuracy over time')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Examine results from different folds

In [None]:
plt.bar(range(K_FOLDS), fold_val_accr_averages)

### Test against 

In [None]:
model.eval()
for label_list, text_list, offsets in val_dataloader:
  pred = model(text_list, offsets)
  print(f'{pred=} | {text_list}')
  break