<a href="https://colab.research.google.com/github/JonasVerbickas/test-jupyter/blob/main/torch_RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [231]:
import nltk
import os
import re
import pandas as pd
import collections
import random
import torch
from torchtext.vocab import build_vocab_from_iterator
from gensim.models import Word2Vec, FastText
from matplotlib import pyplot as plt

In [232]:
USE_STEMMING_INSTEAD_OF_LEMMATIZATION = False

In [233]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('omw-1.4')
if USE_STEMMING_INSTEAD_OF_LEMMATIZATION:
  token_processing_fn = nltk.PorterStemmer().stem 
else:
  token_processing_fn = nltk.stem.WordNetLemmatizer().lemmatize


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [234]:
def create_concat_text_content_string(path_to_dataset):
  file_contents = ""
  for filename in os.listdir(path_to_dataset):
    # skip readme
    if filename in ['README.txt', 'ipod']:
      continue
    # append contents of other files to file_contents string
    starting_corpus_size = len(file_contents)
    with open(os.path.join(path_to_dataset, filename)) as f:
      file_contents += f.read()
    print("After appending", filename, "corpus sized increased to", starting_corpus_size, "->", len(file_contents))
  return file_contents

In [235]:
def removeAnnotationSymbols(sentence):
  static_removals = ['[t]', '[u]','[p]','[s]','[cc]','[cs]']
  for symbol in static_removals:
    sentence = sentence.replace(symbol, '')
  return sentence

In [236]:
def calculateSentement(unprocessed_sentence):
  semantic_tokens = re.findall('\[([+-]\d)\]', unprocessed_sentence)
  semantic_sum = 0
  for token in semantic_tokens:
    semantic_sum += int(token)
  return semantic_sum

In [237]:
def process_string_of_reviews_into_df(string_of_reviews):
  """
  1. Apply casefolding
  2. Expand contractions
  3. Tokenize sentances
  4. Ignore stop-words
  5. Ignore tokens that don't contain any alphanumeric characters
  6. Ignore sentiment analysis tokens e.g. `+3`
  """
  list_of_reviews = string_of_reviews.split("[t]\n")
  tokenized_sentence_sentiment_list = []
  for review in list_of_reviews:
    sentiment = 0
    processed_review = []
    review_split_into_lines = review.split('\n')
    for line in review_split_into_lines:
      sentiment_tags, _, sentence = line.partition("##")
      sentiment += calculateSentement(sentiment_tags)
      x = removeAnnotationSymbols(sentence)
      x = x.casefold()
      # processed_review = f"{processed_review} {x}"
      x = nltk.tokenize.word_tokenize(x)
      processed_review += x
    tokenized_sentence_sentiment_list.append([review, processed_review, sentiment])
  df = pd.DataFrame(tokenized_sentence_sentiment_list, columns=['original review','processed review','sentiment'])
  return df


In [238]:
class ReviewDataset(torch.utils.data.Dataset):
  def __init__(self, path_to_review_folder):
        string_of_reviews = create_concat_text_content_string(path_to_review_folder)
        self.review_df = process_string_of_reviews_into_df(string_of_reviews)
        set_of_all_tokens = set([token for review in self.review_df['processed review'] for token in review])
        print(f"{len(set_of_all_tokens)=}")
        print(f"{len(set_of_all_tokens)=}")
        # bug
        # self.vocab = build_vocab_from_iterator(yield_tokens(set_of_all_tokens))
        self.vocab = build_vocab_from_iterator([set_of_all_tokens], max_tokens=len(set_of_all_tokens), specials=["<unk>"])
        print(f"{len(self.vocab)=}")
        self.vocab.set_default_index(self.vocab["<unk>"])
        
  def __len__(self):
    return len(self.review_df)

  def __getitem__(self, idx):
    review = self.review_df.iloc[idx]
    return self.vocab(list(review["processed review"])), review['sentiment']


In [239]:
PATH_TO_REVIEWS = "/content/drive/MyDrive/Colab Notebooks/product_reviews"
dataset = ReviewDataset(PATH_TO_REVIEWS)
dataset[6]

After appending Nokia_6600.txt corpus sized increased to 0 -> 56093
After appending norton.txt corpus sized increased to 56093 -> 95013
After appending Linksys_Router.txt corpus sized increased to 95013 -> 151947
After appending MicroMP3.txt corpus sized increased to 151947 -> 259727
After appending Diaper_Champ.txt corpus sized increased to 259727 -> 294831
After appending Hitachi_router.txt corpus sized increased to 294831 -> 325078
After appending Canon_S100.txt corpus sized increased to 325078 -> 353887
After appending Canon_PowerShot_SD500.txt corpus sized increased to 353887 -> 378520
After appending ipod.txt corpus sized increased to 378520 -> 436566
len(set_of_all_tokens)=6569
len(set_of_all_tokens)=6569
len(self.vocab)=6569


([1947,
  3830,
  2633,
  3621,
  6512,
  68,
  2956,
  3492,
  2599,
  48,
  5774,
  638,
  6381,
  2092,
  2181,
  6542,
  5869,
  3826,
  6344,
  68,
  2956,
  31,
  2677,
  2562,
  604,
  2217,
  3933,
  4302,
  5292,
  5840,
  6375,
  4059,
  3293,
  638,
  4074,
  3014,
  2211,
  68,
  3014,
  5855,
  1723,
  2956,
  31,
  2677,
  2562,
  4043,
  48,
  3784,
  638,
  4547,
  34,
  496,
  4800,
  35,
  5903,
  5424,
  2192,
  28,
  5730,
  68,
  3980,
  2956,
  23,
  835,
  5903,
  3933,
  5903,
  6025,
  5824,
  364,
  3745,
  68],
 0)

# Defining the model

In [240]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
TRAIN_PERCENTAGE = 0.8

In [241]:
train_size = int(TRAIN_PERCENTAGE*(len(dataset)))
val_size = len(dataset) - train_size
train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])


In [283]:
from torch import nn

class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.sigmoid = nn.Sigmoid()
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        y = self.fc(embedded)
        return self.sigmoid(y)

In [284]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
         label_list.append(_label)
         processed_text = torch.tensor(_text, dtype=torch.int64)
         text_list.append(processed_text)
         offsets.append(processed_text.size(0))
    label_list = (torch.tensor(label_list, dtype=torch.int64)>0.5).float()
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return label_list.to(DEVICE), text_list.to(DEVICE), offsets.to(DEVICE)

In [285]:
EPOCHS = 20 # epoch
LR = 5e-3  # learning rate
BATCH_SIZE = 1 # batch size for training
vocab_size = len(dataset.vocab)
emsize = 50 
model = TextClassificationModel(vocab_size, emsize, 1).to(DEVICE)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_batch)
val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=BATCH_SIZE, collate_fn=collate_batch)

In [286]:
import time

def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 50
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text, offsets)
        # print(f"{predicted_label=}")
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += ((predicted_label>0.5).float() == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f} | loss'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count), loss)
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            total_acc += ((predicted_label>0.5).float() == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [287]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(val_dataloader)
    if total_accu is not None and total_accu > accu_val:
      scheduler.step()
    else:
       total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)

  return F.mse_loss(input, target, reduction=self.reduction)


| epoch   1 |    50/  259 batches | accuracy    0.510 | loss tensor(0.2396, grad_fn=<MseLossBackward0>)
| epoch   1 |   100/  259 batches | accuracy    0.600 | loss tensor(0.2709, grad_fn=<MseLossBackward0>)
| epoch   1 |   150/  259 batches | accuracy    0.680 | loss tensor(0.2353, grad_fn=<MseLossBackward0>)
| epoch   1 |   200/  259 batches | accuracy    0.660 | loss tensor(0.2570, grad_fn=<MseLossBackward0>)
| epoch   1 |   250/  259 batches | accuracy    0.660 | loss tensor(0.2273, grad_fn=<MseLossBackward0>)
-----------------------------------------------------------
| end of epoch   1 | time:  0.56s | valid accuracy    0.723 
-----------------------------------------------------------
| epoch   2 |    50/  259 batches | accuracy    0.510 | loss tensor(0.2314, grad_fn=<MseLossBackward0>)
| epoch   2 |   100/  259 batches | accuracy    0.560 | loss tensor(0.2799, grad_fn=<MseLossBackward0>)
| epoch   2 |   150/  259 batches | accuracy    0.680 | loss tensor(0.2270, grad_fn=<MseLos

In [296]:
model.eval()
label_list, text_list, offsets = collate_batch([(dataset.vocab(['please', 'this']), 4)])
model(text_list, offsets)

tensor([[0.5555]], grad_fn=<SigmoidBackward0>)