<a href="https://colab.research.google.com/github/LxYuan0420/aws-machine-learning-university-accelerated-nlp/blob/master/colab_notebooks/MLA_NLP_Lecture3_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/gdrive')

%cd "/gdrive/MyDrive/Colab Notebooks/git/aws-machine-learning-university-accelerated-nlp/colab_notebooks"

Mounted at /gdrive
/gdrive/MyDrive/Colab Notebooks/git/aws-machine-learning-university-accelerated-nlp/colab_notebooks



**Machine Learning Accelerator - Natural Language Processing - Lecture 3**

**Final Project: Neural Networks and Recurrent Neural Networks (RNNs) for the IMDB Movie Review Dataset**

Dataset: Sentiment (positive or negative) analysis of movie reviews. The dataset is originally hosted here: http://ai.stanford.edu/~amaas/data/sentiment/

We continue to work on our final project dataset. This time, you will try to see how Neural Networks, Recurrent Neural Networks (RNNs), its variants: GRU and LSTM work in predicting the sentiment of review texts. If you are interested in trying Transformers, here is a good place for that too!

Use the notebooks from the class and implement the model, train and test with the corresponding datasets. You can follow these steps:

1. Read training-test data (Given)
2. Train a classifier (Implement)
3. Make predictions on your test dataset (Implement)

**1. Reading the dataset**

We will use the pandas library to read our dataset.

Training data:

In [2]:
import pandas as pd

train_df = pd.read_csv('../data/final_project/imdb_train.csv', header=0)
test_df = pd.read_csv('../data/final_project/imdb_test.csv', header=0)

In [3]:
train_df.head()

Unnamed: 0,text,label
0,This movie makes me want to throw up every tim...,0
1,Listening to the director's commentary confirm...,0
2,One of the best Tarzan films is also one of it...,1
3,Valentine is now one of my favorite slasher fi...,1
4,No mention if Ann Rivers Siddons adapted the m...,0


**2. Train a Classifier**

In [12]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import re

In [5]:
counter = Counter()
tokenizer = get_tokenizer("basic_english")

for train_sample in train_df["text"].tolist():
    counter.update(tokenizer(train_sample))

vocab = Vocab(counter, min_freq=5)

In [10]:
train_text = train_df['text'].values
train_label = train_df['label'].values
test_text = test_df['text'].values
test_label = test_df['label'].values


In [7]:
#text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
from collections import Counter
import nltk, torchtext
from nltk.tokenize import word_tokenize

nltk.download("punkt")


def cleanStr(text):
    if isinstance(text, str) == False:
        text = ""

    text = text.lower().strip()
    text = re.sub("\s+", " ", text)
    text = re.compile("<.*?>").sub("", text)
    return text

def tokenize(text):
    tokens = []
    text = cleanStr(text)
    words = word_tokenize(text)
    for word in words:
        tokens.append(word)
    return tokens

def transformText(text, vocab, max_length):
    token_arr = torch.zeros((max_length))
    tokens = tokenize(text)

    if len(tokens) < max_length:
        tokens += [0]*(max_length - len(tokens))

    tokens = tokens[:max_length]
    for idx, token in enumerate(tokens):
        try:
            token_arr[idx] = vocab.stoi[token]
        except:
            token_arr[idx] = 0 
    return token_arr

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [68]:
vocab.stoi["hello"]

4645

In [53]:
max_length=128
train_text_transformed = torch.stack([transformText(text, vocab, max_length) for text in train_text])
test_text_transformed = torch.stack([transformText(text, vocab, max_length) for text in test_text])

In [62]:
hidden_size = 12
learning_rate = 0.001
epochs = 50
batch_size = 32

num_embed = 50
vocab_size = len(vocab.itos)

In [None]:
from torch.utils.data import TensorDataset, DataLoader

train_label = torch.Tensor(train_label)
test_label = torch.Tensor(test_label)
train_dataset = TensorDataset(train_text_transformed, train_label)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [63]:
# define model
device = torch.device("cpu") # use "cuda:0" if you are using GPU

class Net(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, num_hiddens, num_layers=num_layers)
        #self.linear = nn.Linear(1536, 1)
        self.linear = nn.Linear(max_length*num_hiddens, 1) #because in forward() we reshape to (bs, -1)?
        self.act = nn.Sigmoid()

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        outputs, _ = self.rnn(embeddings)
        outs = self.linear(outputs.reshape(outputs.shape[0], -1))
        return self.act(outs)
    
model = Net(vocab_size, num_embed, hidden_size).to(device)

In [64]:
trainer = torch.optim.SGD(model.parameters(), lr=learning_rate)
cross_ent_loss = nn.BCEWithLogitsLoss(reduction='none')

In [65]:
import time


for epoch in range(epochs):
    start = time.time()
    training_loss=0
    for idx, (data, target) in enumerate(train_loader):
        trainer.zero_grad()

        data = data.long().to(device)
        target = target.to(device)

        output = model(data)
        L = cross_ent_loss(output.squeeze(1), target).sum()
        training_loss += L

        L.backward()
        trainer.step()

    # one epoch finish
    training_loss /= len(train_label)
    end = time.time()
    print("Epoch %s. Train_loss %f Seconds %f" % \
          (epoch, training_loss, end-start))

Epoch 0. Train_loss 0.696218 Seconds 8.009602
Epoch 1. Train_loss 0.692315 Seconds 7.897772
Epoch 2. Train_loss 0.691051 Seconds 8.006441
Epoch 3. Train_loss 0.689180 Seconds 7.946592
Epoch 4. Train_loss 0.686496 Seconds 7.878174
Epoch 5. Train_loss 0.683239 Seconds 7.861934
Epoch 6. Train_loss 0.679714 Seconds 7.829571
Epoch 7. Train_loss 0.676184 Seconds 7.870228
Epoch 8. Train_loss 0.672821 Seconds 8.107740
Epoch 9. Train_loss 0.669692 Seconds 8.202859
Epoch 10. Train_loss 0.666817 Seconds 8.062102
Epoch 11. Train_loss 0.664194 Seconds 8.053298
Epoch 12. Train_loss 0.661803 Seconds 7.959071
Epoch 13. Train_loss 0.659608 Seconds 8.040235
Epoch 14. Train_loss 0.657575 Seconds 8.090099
Epoch 15. Train_loss 0.655677 Seconds 7.996216
Epoch 16. Train_loss 0.653892 Seconds 8.021352
Epoch 17. Train_loss 0.652199 Seconds 7.986277
Epoch 18. Train_loss 0.650590 Seconds 8.012913
Epoch 19. Train_loss 0.649056 Seconds 8.063029
Epoch 20. Train_loss 0.647595 Seconds 8.055485
Epoch 21. Train_loss 0.

In [66]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
# Get validation predictions
test_predictions = model(test_text_transformed.to(device).long())

# Round predictions: 1 if pred>0.5, 0 otherwise
test_predictions = np.round(test_predictions.detach().numpy())

print("Classification Report")
print(classification_report(test_label.numpy(), test_predictions))
print("Accuracy")
print(accuracy_score(test_label.numpy(), test_predictions))

Classification Report
              precision    recall  f1-score   support

         0.0       0.57      0.85      0.68     12500
         1.0       0.70      0.36      0.48     12500

    accuracy                           0.61     25000
   macro avg       0.64      0.61      0.58     25000
weighted avg       0.64      0.61      0.58     25000

Accuracy
0.60544
