Set up nltk for preprocessing

In [2]:
import nltk
# Download the NLTK stop-word corpus into the local nltk_data directory.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Mount Google Drive and change into the folder that holds the data. You must upload the data files to that folder yourself beforehand.

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files stored there are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
cd /content/drive/My Drive/Colab Notebooks

/content/drive/My Drive/Colab Notebooks


List the folder contents to check that the data files are present, then run the script below to train and test the model.

In [5]:
ls

dictionary.json   nn_model.py       reviews_test.csv   Tokenizer.py
DM2583            [0m[01;34m__pycache__[0m/      reviews_train.csv
NeuralNetwork.py  reviews_eval.csv  [01;34msaved_models[0m/


In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
# pytorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
from nn_model import SentimentLSTM, SentimentCNN, SentimentFC
import os
import glob
from Tokenizer import tokenize_reviews
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, ConfusionMatrixDisplay
import json
import nltk

# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): presumably required by Tokenizer.tokenize_reviews -- confirm.
nltk.download('punkt')

# Placeholder; overwritten with the real vocabulary size once dictionary.json is loaded further down.
vocab_size = 0
def dataloader_from_sparse_matrix(sparse, ground_truth, batch_size):
    """
    Convert a SciPy sparse matrix and its labels into a shuffling PyTorch DataLoader.

    Used to make the tf-idf features usable with the NN.

    Args:
        sparse: SciPy sparse matrix (any format exposing ``tocoo()``), one row per sample.
        ground_truth: array-like of labels, one entry per row of ``sparse``.
        batch_size: number of samples per batch.

    Returns:
        A ``DataLoader`` (shuffle enabled) over a ``TensorDataset`` holding the
        features as a float32 sparse COO tensor and the labels as a tensor.
    """
    coo = sparse.tocoo()
    # COO indices form a 2 x nnz matrix: row ids stacked on top of column ids.
    indices = torch.as_tensor(np.vstack((coo.row, coo.col)), dtype=torch.long)
    values = torch.as_tensor(coo.data, dtype=torch.float32)

    # torch.sparse.FloatTensor is a deprecated legacy constructor;
    # torch.sparse_coo_tensor is its supported replacement.
    features = torch.sparse_coo_tensor(indices, values, size=tuple(coo.shape))

    dataset = TensorDataset(features, torch.tensor(ground_truth))
    return DataLoader(dataset, shuffle=True, batch_size=batch_size)

def dataloader_from_csv(in_data, ground_truth, batch_size,
                        dictionary_path='dictionary.json', seq_length=500,
                        shuffle=True):
    """
    Tokenize raw review texts and wrap them, with their labels, in a DataLoader.

    Args:
        in_data: sequence of review texts, forwarded to ``tokenize_reviews``.
        ground_truth: array-like of labels, one per review.
        batch_size: number of samples per batch.
        dictionary_path: path of the vocabulary JSON handed to ``tokenize_reviews``
            (defaults to the previously hard-coded 'dictionary.json').
        seq_length: third argument of ``tokenize_reviews`` (defaults to the
            previously hard-coded 500).
            NOTE(review): presumably the fixed token length per review -- the sample
            batches printed by this notebook have 500 columns; confirm in Tokenizer.py.
        shuffle: whether the loader reshuffles the data every epoch (default True,
            the original behaviour).

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of (token tensor, label tensor).
    """
    token_ids = tokenize_reviews(in_data, dictionary_path, seq_length)
    dataset = TensorDataset(torch.tensor(token_ids), torch.tensor(ground_truth))
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)


# Load the vocabulary file; the number of entries determines the embedding table size.
# NOTE(review): presumably a word -> integer-id mapping, judging by the name -- confirm.
with open("dictionary.json", 'r') as f:
    word_to_int = json.load(f)
    vocab_size = len(word_to_int.keys())

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
# NOTE: you probably want to run this on colab, since it requires quite a lot of RAM
BATCH_SIZE = 64  # we have a lot of data and not a lot of time
NUM_EPOCHS = 1
LEARNING_RATE = 0.0005  # will be adjustable, we're using ADAM
HIDDEN_DIM = 32
# Fourth root of the vocabulary size; rule of thumb from:
# https://developers.googleblog.com/2017/11/introducing-tensorflow-feature-columns.html
EMBEDDING_DIM = int(vocab_size ** 0.25)

# Column dtypes of the review CSV files.
type_dict = {"text": "string", "Sentiment": int}

# Placeholders: `device` is replaced by main() with 'cuda' or 'cpu' before training;
# `model` is only a module-level stub (train() and test() build their own local models).
device = 0
model = 0

print(f"Embedding dimensionality: {EMBEDDING_DIM}")

# Alternative tf-idf feature pipeline, currently disabled:
# vectorizer = TfidfVectorizer(
#     max_features=2500, norm='l2', stop_words=stopwords.words('english'))
def _evaluate(model, data_loader, loss_function):
    """Return (accuracy in percent, mean per-batch loss) of `model` on `data_loader`."""
    was_training = model.training
    model.eval()  # switch off train-only layers (e.g. dropout) while scoring
    n_correct = 0
    loss_sum = 0.0
    n_batches = 0
    with torch.no_grad():  # no gradients needed for evaluation, saves memory
        for reviews, sentiments in data_loader:
            reviews, sentiments = reviews.to(device), sentiments.to(device)
            result = model(reviews)
            loss_sum += loss_function(result, sentiments).item()
            n_batches += 1
            _, predicted = torch.max(result.data, 1)
            n_correct += (predicted == sentiments).sum().item()
    model.train(was_training)  # restore whatever mode the caller was in

    total = len(data_loader.dataset)
    accuracy = 100 * n_correct / total if total else 0.0
    mean_loss = loss_sum / n_batches if n_batches else 0.0
    return accuracy, mean_loss


def _save_checkpoint(model, accuracy, stale_path=None):
    """Save `model`'s weights under saved_models/ and delete the checkpoint it supersedes."""
    save_dir = os.path.join(os.getcwd(), "saved_models")
    os.makedirs(save_dir, exist_ok=True)
    # The returned file name must include the "saved-" prefix, because test()
    # joins it directly onto the saved_models folder.
    file_name = f"saved-LSTM-acc-{accuracy}.pt"
    save_path = os.path.join(save_dir, file_name)
    torch.save(model.state_dict(), save_path)
    # Only remove the checkpoint written earlier in this run, never unrelated files.
    if stale_path and stale_path != save_path and os.path.exists(stale_path):
        os.remove(stale_path)
    return file_name, save_path


def train(validate_every=500):
    """
    Train a SentimentLSTM on reviews_train.csv, validating on reviews_eval.csv.

    Every `validate_every` training iterations the model is scored on the
    validation set; whenever the validation accuracy improves, the weights are
    written to the saved_models folder. If training ends before any validation
    step was reached, the final model is validated and saved once, so the
    returned file name always points at an existing checkpoint.

    Reads the module-level `device`, `vocab_size` and hyperparameter constants.

    Args:
        validate_every: number of training iterations between validation runs
            (default 500, the previously hard-coded interval).

    Returns:
        File name (inside the saved_models folder) of the best checkpoint,
        suitable for passing to test().
    """
    # read training data
    train_df = pd.read_csv("reviews_train.csv", header=None, skiprows=[0],
                           names=["text", "Sentiment"])
    train_df.dropna(inplace=True)
    print("training vectorizer")

    print('loading and fitting eval data')
    eval_df = pd.read_csv("reviews_eval.csv", header=None, skiprows=[0],
                          names=["text", "Sentiment"])
    print(eval_df.head())
    eval_df.dropna(inplace=True)

    # convert to pytorch format
    print("converting data")
    train_loader = dataloader_from_csv(
        train_df["text"].values, train_df["Sentiment"].values, BATCH_SIZE)
    val_loader = dataloader_from_csv(
        eval_df["text"].values, eval_df["Sentiment"].values, BATCH_SIZE)

    # Sanity check that the conversion works. The built-in next() is used because
    # DataLoader iterators no longer expose a .next() method.
    sample_x, sample_y = next(iter(train_loader))
    print(f"sample input size: {sample_x.size()}")
    print("sample input: ", sample_x)
    print()
    print('Sample label size: ', sample_y.size())  # batch_size
    print('Sample label: \n', sample_y)

    model = SentimentLSTM(vocab_size, output_size=3, embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM)
    model.to(device)

    loss_function = nn.CrossEntropyLoss()
    # Adam optimizer, better convergence than standard SGD
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    print("starting training...")

    model_file_name = ""
    best_checkpoint_path = None
    best_val_acc = -1.0  # below any real accuracy, so the first validation always saves
    train_iter = 0
    train_losses = []
    validation_accs = []
    x_coords = []
    for _ in range(NUM_EPOCHS):
        model.train()
        for review, sentiment in train_loader:
            review = review.to(device)
            sentiment = sentiment.to(device)
            optimizer.zero_grad()  # reset gradients accumulated by the previous batch
            outputs = model(review)
            loss = loss_function(outputs, sentiment)
            loss.backward()
            optimizer.step()
            train_iter += 1

            if train_iter % validate_every == 0:
                print(f"iteration: {train_iter}, loss {loss.item()}")
                train_losses.append(loss.item())
                x_coords.append(train_iter)
                print("calculating accuracy on validation data")
                accuracy, val_loss = _evaluate(model, val_loader, loss_function)
                validation_accs.append(accuracy)
                print(f"validation accuracy: {accuracy}")
                print(f"validation done, loss: {val_loss}")
                if accuracy > best_val_acc:
                    print("new best model, saving..")
                    best_val_acc = accuracy
                    model_file_name, best_checkpoint_path = _save_checkpoint(
                        model, accuracy, best_checkpoint_path)

    if not model_file_name:
        # Training was too short to reach a validation step: validate and save the
        # final weights so the caller still receives a loadable checkpoint.
        accuracy, _ = _evaluate(model, val_loader, loss_function)
        print(f"validation accuracy: {accuracy}")
        model_file_name, best_checkpoint_path = _save_checkpoint(model, accuracy)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.plot(x_coords, train_losses, label="Training loss")
    ax1.set_title("training losses")
    ax2.plot(x_coords, validation_accs, label="Validation accuracy")
    ax2.set_title("Validation accuracy")

    plt.show()

    return model_file_name

# test the network here


def test(model_filename):
    """
    Evaluate a saved SentimentLSTM checkpoint on reviews_test.csv.

    Loads the test reviews, restores the weights stored in
    saved_models/<model_filename>, computes the classification accuracy and
    plots a row-normalised confusion matrix for the labels 0, 1 and 2.

    Reads the module-level `device`, `vocab_size` and hyperparameter constants.

    Args:
        model_filename: file name of the checkpoint inside the saved_models folder.

    Returns:
        Accuracy on the test set, in percent.

    Raises:
        FileNotFoundError: if no checkpoint file exists at the resolved path
            (e.g. when an empty file name is passed).
    """
    # load test data
    print('loading and fitting test data')
    test_df = pd.read_csv("reviews_test.csv", header=None, skiprows=[0],
                          names=["text", "Sentiment"])
    print(test_df.head())
    test_df.dropna(inplace=True)
    testloader = dataloader_from_csv(
        test_df["text"].values, test_df["Sentiment"].values, BATCH_SIZE)

    # The built-in next() is used because DataLoader iterators no longer expose .next().
    sample_x, sample_y = next(iter(testloader))
    print(f"sample input size: {sample_x.size()}")
    print("sample input: ", sample_x)
    print()
    print('Sample label size: ', sample_y.size())  # batch_size
    print('Sample label: \n', sample_y)

    # load model
    model_load_path = os.path.join(os.getcwd(), 'saved_models', model_filename)
    if not os.path.isfile(model_load_path):
        # Fail early with a clear message instead of a confusing error from
        # torch.load when the path is missing or is just the folder itself.
        raise FileNotFoundError(f"no saved model file at {model_load_path}")
    # TODO: instantiate correctly.
    model = SentimentLSTM(vocab_size, output_size=3, embedding_dim=EMBEDDING_DIM,
                          hidden_dim=HIDDEN_DIM)
    model.to(device)
    print("laoding model")
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime.
    model.load_state_dict(
        torch.load(model_load_path, map_location=torch.device(device)))
    print("finished loading state dict")
    model.eval()

    iter_step = 0
    correct = 0
    total = len(testloader.dataset)
    predictions = []
    groundTruth = []
    with torch.no_grad():
        for reviews, sentiments in testloader:
            reviews, sentiments = reviews.to(device), sentiments.to(device)
            result = model(reviews)
            _, predicted = torch.max(result.data, 1)
            correct += (predicted == sentiments).sum().item()
            # plain lists are what the confusion-matrix helper consumes
            predictions.extend(predicted.tolist())
            groundTruth.extend(sentiments.tolist())
            iter_step += 1
            if iter_step % 1000 == 0:
                print(f"\r {iter_step}", end="")
    accuracy = 100 * correct / total

    cm = confusion_matrix(groundTruth, predictions, labels=[0, 1, 2], normalize='true')
    ConfusionMatrixDisplay(cm, display_labels=[0, 1, 2]).plot()
    return accuracy


def main():
    """Select the compute device, train a model, then print its test accuracy."""
    global device
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    print(f"training on {device} for {NUM_EPOCHS} epochs")

    # To evaluate an existing checkpoint instead of training a new one, replace the
    # train() call with a file name such as "saved-LSTM-acc-86.91171734830013.pt".
    saved_model_name = train()
    print("done training")

    print("testing")
    test_accuracy = test(saved_model_name)
    print(f"final test accuracy: {test_accuracy}")


main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Embedding dimensionality: 25
training on cuda for 1 epochs
training vectorizer
loading and fitting eval data
                                                text  Sentiment
0  the service here is pretty crap but the food i...          2
1  i chose this restaurant for my extended birthd...          1
2  i hate to do this but about two months ago my ...          0
3  terrific ribs mediocre cole slaw good service ...          2
4  my experience at this toys r us location was g...          2
converting data
sample input size: torch.Size([64, 500])
sample input:  tensor([[   0,    0,    0,  ...,   68,  348,  542],
        [   0,    0,    0,  ..., 2626,   48,  106],
        [   0,    0,    0,  ...,   17,  163, 2131],
        ...,
        [   0,    0,    0,  ...,   27,   17, 1386],
        [   0,    0,    0,  ..., 1336,   32, 2165],
        [   0,    0,    0,  ...,  345,   43,  222]])

Sa