In [1]:
import csv
import glob
import math
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import numba
import numpy as np
import os
import pandas as pd
import re
import string
import time
import torch
from collections import OrderedDict
from torch import nn
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicho\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Vector Space Model

## 1.1 Construct Term-Document Matrix

In [2]:
# Parse the play title and text from each line of the input CSV
shakespeare_csv_path = os.path.join(os.getcwd(), 'part_1_input_data', 'ShakespearePlays_text.csv')

title_text_list = []

# Translation table that deletes every punctuation character; built once, reused per row
punctuation_table = str.maketrans('', '', string.punctuation)

with open(shakespeare_csv_path, 'r') as csv_file:
    csv_data = csv.reader(csv_file)
    for row in csv_data:
        # Re-join the comma-split cells and split on ';' instead
        # (presumably the file is ';'-delimited — verify against the input data)
        row_list = ''.join(row).split(';')
        # Field 1 is the title with its first and last character stripped
        title = row_list[1][1:-1]
        text = row_list[5].lower()
        # Raw string: '\d' inside a normal string literal is an invalid escape sequence
        text = re.sub(r'\n\d+', '', text)
        text = text.translate(punctuation_table)
        title_text_list.append((title, text))

In [3]:
# Read the play titles (one per line), discarding blank lines
document_names_file_path = os.path.join(os.getcwd(), 'part_1_input_data', 'Shakespeare_play_names.txt')

with open(document_names_file_path, 'r') as file:
    play_titles = list(filter(None, file.read().split('\n')))

In [4]:
# Read the vocabulary words (one per line), discarding blank lines
vocab_file_path = os.path.join(os.getcwd(), 'part_1_input_data', 'Shakespeare_vocab.txt')

with open(vocab_file_path, 'r') as file:
    vocab_list = list(filter(None, file.read().split('\n')))

In [5]:
# Build the term-document matrix: every vocabulary word maps to its own
# dictionary of per-play counts, all starting at zero
term_document_matrix = {
    word: dict.fromkeys(play_titles, 0)
    for word in vocab_list
}

In [6]:
# Count, for every play, how often each vocabulary word occurs in its text
for play_title, play_text in title_text_list:
    for token in play_text.split():
        counts_by_play = term_document_matrix.get(token)
        # Tokens outside the vocabulary have no row and are skipped
        if counts_by_play is not None:
            counts_by_play[play_title] += 1

In [7]:
# Create a DataFrame from the term-document matrix dictionary.
# from_dict makes the outer keys (words) columns; the transpose puts words on the
# rows and plays on the columns.
term_document_matrix_df = pd.DataFrame.from_dict(term_document_matrix).T

In [8]:
# Write up to 5 distinct, randomly chosen non-zero frequency words in each play to output file
term_doc_sample_file = os.path.join(os.getcwd(), 'part_1_output', 'term_doc_sample.txt')
sample_size = 5

with open(term_doc_sample_file, 'w') as output_file:
    for column in term_document_matrix_df:
        play_counts = term_document_matrix_df[column]
        non_zero_counts = play_counts[play_counts > 0]
        # Sample directly from the non-zero rows: this always terminates (a play with
        # fewer than 5 non-zero words simply yields fewer lines) and never repeats a word.
        sampled_counts = non_zero_counts.sample(n=min(sample_size, len(non_zero_counts)))
        for word, count in sampled_counts.items():
            line = column + ', ' + word + ', ' + str(count) + '\n'
            output_file.write(line)

## 1.2 Compute Document Similarity

In [9]:
def dotProduct(df, column_1, column_2):
    """
    Return the dot product of the two columns in the specified DataFrame.

    The element-wise product is summed with a vectorised NumPy reduction instead of the
    Python builtin sum(), which iterates the Series one element at a time. Summing the
    underlying array (rather than Series.sum) keeps NaN propagation unchanged.
    """
    return (df[column_1] * df[column_2]).to_numpy().sum()

In [10]:
def vectorLength(df, column):
    """
    Return the vector length (Euclidean norm) of the specified column in the specified DataFrame.

    The squared values are summed with a vectorised NumPy reduction instead of the Python
    builtin sum(), which iterates the Series one element at a time.
    """
    return math.sqrt((df[column] ** 2).to_numpy().sum())

In [11]:
def cosineDistance(df, column_1, column_2):
    """
    Return the cosine similarity of the two columns in the specified DataFrame.

    Despite the function name, the value returned is the cosine *similarity*
    (dot product divided by the product of the vector lengths), so larger values
    mean the columns are more alike. Returns 0.0 when either column has zero length,
    because the similarity is undefined in that case.
    """
    length_product = vectorLength(df, column_1) * vectorLength(df, column_2)
    if length_product == 0:
        return 0.0
    return dotProduct(df, column_1, column_2) / length_product

In [12]:
def computePlayCosineDistances(term_document_matrix_df, play_titles):
    """
    Score every ordered pair of distinct plays with cosineDistance().

    Each play is a column of term_document_matrix_df. The result is a nested dictionary,
    result[title_1][title_2], holding the score of title_1 against title_2; a play is
    never paired with itself.
    """
    return {
        title_1: {
            title_2: cosineDistance(term_document_matrix_df, title_1, title_2)
            for title_2 in play_titles
            if title_2 != title_1
        }
        for title_1 in play_titles
    }

In [13]:
# Compute the cosine score between each pair of plays.
# The result is a nested dict: play_cosine_distances[title_1][title_2].
play_cosine_distances = computePlayCosineDistances(term_document_matrix_df, play_titles)

In [14]:
# For each play, output the play with the highest cosine similarity to the doc_sim.txt output file
cosine_distances_file_path = os.path.join(os.getcwd(), 'part_1_output', 'doc_sim.txt')

with open(cosine_distances_file_path, 'w') as file:
    for play, similarities in play_cosine_distances.items():
        # Pick the other play whose score against this play is largest
        most_similar_play = max(similarities, key=similarities.get)
        fields = (play, most_similar_play, str(similarities[most_similar_play]))
        file.write(', '.join(fields) + '\n')

## 1.3 Measuring Word Similarity using Term-Context Matrix 

In [15]:
# Create the term context matrix and set each cell to 0.
# Both axes are labelled with the vocabulary words.
term_context_matrix_df = pd.DataFrame(0, columns=vocab_list, index=vocab_list)

In [16]:
# Fill in the term context matrix using a window of 4 tokens on each side of the word
# Note: I'm using columns as word vectors and rows as context words.
window_size = 4

# Map each vocabulary word to its row/column position for O(1) lookups
# (assumes vocab_list holds unique words — TODO confirm).
vocab_index = {vocab_word: position for position, vocab_word in enumerate(vocab_list)}

# Accumulate counts in a NumPy array; per-cell DataFrame .loc increments are far slower.
# Starting from a fresh array also makes this cell safe to re-run without double counting.
term_context_counts = np.zeros((len(vocab_list), len(vocab_list)), dtype=np.int64)

for title_text in title_text_list:
    # Translate tokens to vocabulary positions once; -1 marks out-of-vocabulary tokens
    token_positions = [vocab_index.get(token, -1) for token in title_text[1].split()]
    for index, word_position in enumerate(token_positions):
        if word_position < 0:
            continue
        window_start = max(index - window_size, 0)
        # The range end is exclusive, so +1 is required to include the token that is
        # exactly `window_size` positions to the right (and the final token of the text)
        window_end = min(index + window_size + 1, len(token_positions))
        for context_index in range(window_start, window_end):
            context_position = token_positions[context_index]
            if context_index != index and context_position >= 0:
                term_context_counts[context_position, word_position] += 1

term_context_matrix_df = pd.DataFrame(term_context_counts, columns=vocab_list, index=vocab_list)

In [17]:
# Create target words list: the words whose nearest neighbours are reported
# in the similarity output files below
target_words = ['romeo', 'juliet', 'nobleman', 'caesar', 'friend']

In [18]:
# Create the target words cosine similarities df.
# Initialised with 0.0 (float) because float similarity scores are assigned into it;
# an integer-typed frame would have to be upcast on the first assignment.
target_word_cosine_similarities_df = pd.DataFrame(0.0, columns=target_words, index=vocab_list)

In [19]:
# Compute the cosine similarity of each of the target words with all words in the vocabulary
# Note: this may take 10-15 min to run
for target_word in target_words:
    for word in vocab_list:
        if word == target_word:
            # A word is not compared with itself; its cell keeps the initial value
            continue
        similarity = cosineDistance(term_context_matrix_df, target_word, word)
        target_word_cosine_similarities_df.loc[word, target_word] = similarity

In [20]:
# Write the top five most similar words to each of the target words to results file
term_doc_sim_out = os.path.join(os.getcwd(), 'part_1_output', 'term_context_sim.txt')

with open(term_doc_sim_out, 'w') as file:
    for target_word in target_words:
        top_similarities = target_word_cosine_similarities_df[target_word].nlargest(5)
        for similar_word in top_similarities.index.tolist():
            score = target_word_cosine_similarities_df.loc[similar_word, target_word]
            file.write(', '.join((target_word, similar_word, str(score))) + '\n')

## 1.4 TF-IDF in the Term-Context Matrix

In [21]:
# Binarise and transpose the term document matrix: after the transpose each row is a
# play and each column a word, with 1 where the word occurs in the play and 0 otherwise.
# Summing a column therefore gives that word's document frequency.
one_hot_term_doc_df = term_document_matrix_df.T.astype(bool).astype(int)

In [22]:
# Create tf-idf term context matrix
# A float-typed transposed copy of the counts, so the original count matrix is left untouched
tf_idf_term_context_df = term_context_matrix_df.T.astype(float)

In [23]:
@numba.jit(nopython=True)
def computeTfIdf(term_context_vector, idf):
    """
    Return the vector of term context counts transformed to tf-idf values.

    Every count in term_context_vector is multiplied by the single idf weight.
    """
    return term_context_vector * idf

In [24]:
# Compute TF-IDF in the term-context matrix
for context_word in tf_idf_term_context_df.columns:
    # Document frequency: number of plays in which the context word occurs
    document_freq = one_hot_term_doc_df[context_word].sum()
    if document_freq > 0:
        idf = 1 / document_freq
    else:
        idf = 0
    context_column = tf_idf_term_context_df[context_word].to_numpy()
    tf_idf_term_context_df[context_word] = computeTfIdf(context_column, idf)

In [25]:
# Compute the cosine similarity of the tf-idf scores for the target word vectors
# with the vectors of all other words in the vocabulary
# Note: this takes 10-15 minutes to run
tf_idf_df = tf_idf_term_context_df.T
# Initialised with 0.0 (float) because float similarity scores are assigned into it;
# an integer-typed frame would have to be upcast on the first assignment.
target_tf_idf_cosine_df = pd.DataFrame(0.0, columns=target_words, index=vocab_list)

for target_word in target_words:
    for word in vocab_list:
        if target_word != word:
            cosine_similarity = cosineDistance(tf_idf_df, target_word, word)
            target_tf_idf_cosine_df.loc[word, target_word] = cosine_similarity

In [26]:
# Write the five most similar tf-idf word vectors for each of the target words to
# the results file
tf_idf_sim_out = os.path.join(os.getcwd(), 'part_1_output', 'tf_idf_sim.txt')

with open(tf_idf_sim_out, 'w') as file:
    for target_word in target_words:
        top_similarities = target_tf_idf_cosine_df[target_word].nlargest(5)
        for similar_word in top_similarities.index.tolist():
            score = target_tf_idf_cosine_df.loc[similar_word, target_word]
            file.write(', '.join((target_word, similar_word, str(score))) + '\n')

# 2. Neural Sentiment Classification

## 2.1 Train a feed-forward Neural Network

### Load Training Data

In [2]:
# Load training data
# glob returns paths in an unspecified order, so sort them to keep the order of the
# texts (and therefore the cross-validation folds) the same from run to run.
pos_train_files = sorted(glob.glob(os.path.join(os.getcwd(), 'imdb_train_data', 'pos', '*')))
neg_train_files = sorted(glob.glob(os.path.join(os.getcwd(), 'imdb_train_data', 'neg', '*')))
# Read with an explicit encoding instead of the platform default
# (assumes the review files are UTF-8 — confirm against the data set)
pos_train_texts = [Path(file).read_text(encoding='utf-8') for file in pos_train_files]
neg_train_texts = [Path(file).read_text(encoding='utf-8') for file in neg_train_files]

In [3]:
# Utility variables
cached_stopwords = stopwords.words('english')  # stopword list handed to preprocessText()
wordnet_lemmatizer = WordNetLemmatizer()  # lemmatizer handed to preprocessText()
unk_token = 'UNK'  # placeholder substituted for out-of-vocabulary tokens
unk_threshold = 5  # words counted fewer than this many times are dropped from the vocabulary
device = torch.device("cpu")  # device that batches and models are moved to

### Preprocess Training Data

In [4]:
def removeNewlines(text):
    """Return the specified text with every newline character deleted (nothing is inserted in its place)."""
    return text.replace('\n', '')

In [5]:
def tokenizeText(text):
    """
    Tokenize the specified text using nltk and return the result of nltk.word_tokenize.

    NOTE(review): word_tokenize presumably needs NLTK tokenizer data (e.g. 'punkt'), which
    the nltk.download calls at the top of the notebook do not fetch — confirm it is installed.
    """
    return nltk.word_tokenize(text)

In [6]:
def removeStopwords(tokenized_text, stopwords_list):
    """
    Remove stopwords from the specified text tokens.

    Args:
        tokenized_text: iterable of string tokens.
        stopwords_list: collection of stopword strings; matching is exact and case-sensitive.

    Returns:
        A new list holding the tokens that are not stopwords, in their original order.
    """
    # Build the set once: membership tests on a list are O(len(stopwords_list)) per token
    stopword_set = set(stopwords_list)
    return [word for word in tokenized_text if word not in stopword_set]

In [7]:
def removePunctuationTokens(tokenized_text):
    """
    Drop every token that consists solely of punctuation characters.

    A token is kept as soon as it contains one character outside string.punctuation;
    an empty token contains no such character and is therefore dropped as well.
    """
    kept_tokens = []
    for token in tokenized_text:
        if any(char not in string.punctuation for char in token):
            kept_tokens.append(token)
    return kept_tokens

In [8]:
def lemmatizeTokens(tokenized_text, lemmatizer):
    """Return a new list with lemmatizer.lemmatize applied to each token, in order."""
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [9]:
def preprocessText(text, stopwords_list, lemmatizer):
    """
    Clean one text and return it as a single space-separated string.

    Steps, in order: delete newline characters, tokenize, drop stopwords, drop tokens
    made only of punctuation, lemmatize the remaining tokens.
    """
    tokens = tokenizeText(removeNewlines(text))
    tokens = removePunctuationTokens(removeStopwords(tokens, stopwords_list))
    return ' '.join(lemmatizeTokens(tokens, lemmatizer))

In [10]:
# Preprocess all training text blocks, replacing the contents of both lists in place
pos_train_texts[:] = [
    preprocessText(text, cached_stopwords, wordnet_lemmatizer) for text in pos_train_texts
]

neg_train_texts[:] = [
    preprocessText(text, cached_stopwords, wordnet_lemmatizer) for text in neg_train_texts
]

### Create Training Vocabulary

In [11]:
def wordCounts(text):
    """
    Count the whitespace-separated words of the specified text.

    Returns a dictionary mapping each distinct word to its number of occurrences,
    with keys in order of first appearance.
    """
    tally = {}
    for token in text.split():
        tally[token] = tally.get(token, 0) + 1
    return tally

In [12]:
def removeWordsByCount(word_counts, threshold):
    """
    Return a new dictionary that contains all words in the specified word_counts dictionary
    with occurrence counts greater than or equal to the specified threshold value.

    Words counted fewer than threshold times are left out. The input dictionary is not
    modified, and the surviving words keep their original order.
    """
    return {word: count for word, count in word_counts.items() if count >= threshold}

In [13]:
# Compile a word counts dictionary using the training data
# Join both lists in a single call: concatenating two separately joined strings would
# fuse the last positive token and the first negative token into one bogus word.
train_text = ' '.join(pos_train_texts + neg_train_texts)
word_counts = wordCounts(train_text)
word_counts = removeWordsByCount(word_counts, unk_threshold)

In [14]:
# Create a vocabulary list from the surviving words, with the UNK token as the final entry
train_vocabulary = [*word_counts.keys(), unk_token]

In [15]:
def replaceUnk(text, vocabulary, unk_token):
    """
    Replace all tokens in the specified text string that are not in the specified vocabulary
    with the specified unk_token.

    Args:
        text: whitespace-separated string of tokens.
        vocabulary: collection of known words (list, set, dict keys, ...).
        unk_token: replacement for every token that is not in vocabulary.

    Returns:
        The tokens, with unknown ones replaced, joined by single spaces.
    """
    # Use the vocabulary argument (not a module-level global) and make sure membership
    # tests are O(1); sets and dicts are used as-is, other collections are converted once.
    if isinstance(vocabulary, (set, frozenset, dict)):
        known_words = vocabulary
    else:
        known_words = set(vocabulary)

    return ' '.join(word if word in known_words else unk_token for word in text.split())

In [16]:
def countUnk(texts, unk_token):
    """Return how many whitespace-separated tokens across all of the specified texts equal unk_token."""
    return sum(text.split().count(unk_token) for text in texts)

In [17]:
# Replace words in the training set that are not in the vocabulary with 'UNK',
# updating both lists in place
pos_train_texts[:] = [replaceUnk(text, train_vocabulary, unk_token) for text in pos_train_texts]

neg_train_texts[:] = [replaceUnk(text, train_vocabulary, unk_token) for text in neg_train_texts]

In [18]:
# Create the single train_texts list (positives first, then negatives) and the matching
# classes list: 1 for every positive text, 0 for every negative text
train_texts = pos_train_texts + neg_train_texts
classes = [1] * len(pos_train_texts) + [0] * len(neg_train_texts)

In [19]:
# Count UNK instances and add UNK to word_counts dictionary, so that the UNK token
# is part of the counts the torch vocabulary is built from
word_counts[unk_token] = countUnk(train_texts, unk_token)

In [20]:
# Create a torch vocabulary by sorting the word count dictionary (highest count first)
# and converting it to an OrderedDict
sorted_word_counts_tuples = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
ordered_word_counts = OrderedDict(sorted_word_counts_tuples)
torch_vocab = vocab(ordered_word_counts)

### Create Model

In [21]:
class ImdbDataset(Dataset):
    """Index-addressable dataset pairing each entry of a data list with the target at the same position."""
    def __init__(self, data, targets):
        self.data, self.targets = data, targets

    def __len__(self):
        # The dataset is as long as its data list
        return len(self.data)

    def __getitem__(self, index):
        # Return the (data, target) pair stored at the given position
        return self.data[index], self.targets[index]

In [22]:
def text_pipeline(x):
    """Convert a whitespace-separated text string to its list of torch_vocab integer indices."""
    return torch_vocab(x.split())

In [23]:
def collate_batch(batch):
    """
    Collate a batch of (text, label) pairs into the tensors the classifiers consume.

    Returns a tuple (labels, text, offsets), all moved to `device`:
        labels  - float32 tensor with one two-element row per sample, [1, 0] for label 0
                  and [0, 1] for any other label
        text    - int64 tensor holding the vocabulary indices of every sample, concatenated
        offsets - start position of each sample inside `text`

    Sources:
        https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
    """
    label_rows, index_tensors, sample_lengths = [], [], []

    for text, label in batch:
        label_rows.append([0, 1] if label != 0 else [1, 0])
        indices = torch.tensor(text_pipeline(text), dtype=torch.int64)
        index_tensors.append(indices)
        sample_lengths.append(indices.size(0))

    labels = torch.tensor(label_rows, dtype=torch.float32)
    # A sample starts where the previous samples end: cumulative sum of the preceding lengths
    offsets = torch.tensor([0] + sample_lengths[:-1]).cumsum(dim=0)
    flat_indices = torch.cat(index_tensors)

    return labels.to(device), flat_indices.to(device), offsets.to(device)

In [24]:
class TextClassifierSingleHiddenLayer(nn.Module):
    """
    Text classifier model.
    Consists of a word embedding layer, hidden layer, top layer, and activation function.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dims: size of each embedding vector.
        num_hidden_nodes: width of the hidden layer.
        num_classes: number of output scores per sample.

    Sources: 
        https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
        https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/
        http://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html
    """
    def __init__(self, vocab_size, embedding_dims, num_hidden_nodes, num_classes):
        super(TextClassifierSingleHiddenLayer, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dims, sparse=True)
        self.hidden_layer = nn.Linear(embedding_dims, num_hidden_nodes)
        self.top_layer = nn.Linear(num_hidden_nodes, num_classes)
        self.act = nn.Sigmoid()

    def init_weights(self):
        """
        Re-initialise the embedding and hidden-layer weights uniformly in [-0.5, 0.5]
        and zero the hidden-layer bias.

        Not called by __init__; call it explicitly to replace the default initialisation.
        """
        # The bound must be read under the same name it is assigned to
        # (a mismatch here raised NameError on every call).
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.hidden_layer.weight.data.uniform_(-initrange, initrange)
        self.hidden_layer.bias.data.zero_()

    def forward(self, text, offsets):
        """Return sigmoid-activated class scores, one row per bag delimited by offsets."""
        embedded = self.embedding(text, offsets)
        hidden = self.act(self.hidden_layer(embedded))
        scores = self.top_layer(hidden)
        return self.act(scores)

### Train Model

In [25]:
def train(dataloader, model, optimizer, criterion, epoch):
    """
    Train the specified model for one pass over the data in the specified dataloader.

    Args:
        dataloader: iterable yielding (label, text, offsets) batches.
        model: module called as model(text, offsets).
        optimizer: optimizer updating the model's parameters.
        criterion: loss function called as criterion(predicted_label, label).
        epoch: current epoch number; accepted for interface compatibility, not used.

    Returns:
        The fraction of samples classified correctly during this pass (measured while
        the parameters are still being updated), or 0.0 if the dataloader was empty.

    Sources:
        https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
    """
    model.train()
    total_correct, total_count = 0, 0

    for label, text, offsets in dataloader:
        # Clear the gradients for all optimizer parameters
        optimizer.zero_grad()

        # Predict labels for the batch input
        predicted_label = model(text, offsets)

        # Compute the loss
        loss = criterion(predicted_label, label)

        # Compute the gradient for every parameter
        loss.backward()

        # Perform gradient clipping to avoid gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Update parameters for current gradient
        optimizer.step()

        # Accumulate the running accuracy for this epoch
        total_correct += (predicted_label.argmax(1) == label.argmax(1)).sum().item()
        total_count += label.size(0)

    return total_correct / total_count if total_count else 0.0

In [26]:
def evaluate(test_dataloader, model, criterion):
    """
    Evaluate the specified model with data from the test_dataloader.

    Args:
        test_dataloader: iterable yielding (label, text, offsets) batches.
        model: module called as model(text, offsets).
        criterion: accepted for interface compatibility; accuracy does not depend on
            the loss, so it is not evaluated.

    Returns:
        The fraction of samples whose highest-scoring class matches the label, or 0.0
        if the dataloader yields no samples (instead of dividing by zero).

    Sources:
        https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
    """
    model.eval()
    total_correct, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in test_dataloader:
            predicted_label = model(text, offsets)
            total_correct += (predicted_label.argmax(1) == label.argmax(1)).sum().item()
            total_count += label.size(0)

    if total_count == 0:
        return 0.0
    return total_correct / total_count

In [27]:
def createTrainTestSplits(X, y, train_indices, test_indices):
    """
    Select the train and test portions of the data (X) and target (y) lists.

    Returns (X_train, y_train, X_test, y_test), each a new list ordered like the
    corresponding index sequence.
    """
    def pick(values, indices):
        # Gather the entries of `values` found at the given positions
        return [values[position] for position in indices]

    return (
        pick(X, train_indices),
        pick(y, train_indices),
        pick(X, test_indices),
        pick(y, test_indices),
    )

In [28]:
def crossValidation(X, y, model, model_out_path, epochs=4, n_splits=10, batch_size=3, learning_rate=5):
    """
    Train the specified model using stratified k-fold cross validation.

    Args:
        X: list of input texts.
        y: list of class labels, parallel to X.
        model: model to train; it is updated in place.
        model_out_path: file the best-scoring parameters are saved to.
        epochs: number of training epochs per fold.
        n_splits: number of cross validation folds.
        batch_size: batch size of the train and test dataloaders.
        learning_rate: initial SGD learning rate used in every fold.

    Returns:
        The highest test accuracy observed over all folds and epochs, or None if
        no epoch was run.

    NOTE(review): the same model instance keeps training from fold to fold, so the test
    split of a later fold contains texts the model was already trained on in earlier
    folds; the printed test accuracies are therefore optimistic.
    """
    # Make sure the directory for the saved weights exists before the first save
    out_dir = os.path.dirname(model_out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Best accuracy over ALL folds; decides when the weights file is overwritten
    best_accuracy = None

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)
    for fold_index, (train_indices, test_indices) in enumerate(skf.split(X, y)):
        X_train, y_train, X_test, y_test = createTrainTestSplits(X, y, train_indices, test_indices)

        criterion = nn.BCELoss()
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3.0, gamma=0.1)
        # Best accuracy within the current fold; only drives the learning rate schedule
        fold_accuracy = None

        train_dataset = ImdbDataset(X_train, y_train)
        train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

        test_dataset = ImdbDataset(X_test, y_test)
        test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

        print('fold: {}'.format(fold_index))
        for epoch in range(1, epochs + 1):
            epoch_start_time = time.time()
            train(train_dataloader, model, optimizer, criterion, epoch)
            epoch_accuracy = evaluate(test_dataloader, model, criterion)

            if fold_accuracy is not None and fold_accuracy > epoch_accuracy:
                # The epoch did not improve on this fold's best: advance the learning rate schedule
                scheduler.step()
            else:
                fold_accuracy = epoch_accuracy

            # Save only when the epoch is at least as good as the best seen in any fold;
            # resetting this tracker per fold would let the first epoch of every fold
            # overwrite a better model saved earlier.
            if best_accuracy is None or epoch_accuracy >= best_accuracy:
                best_accuracy = epoch_accuracy
                torch.save(model.state_dict(), model_out_path)

            print('\t' + ('-' * 59))
            print('\t| end of epoch {:3d} | time: {:5.2f}s | test accuracy {:8.3f} '.format(epoch, time.time() - epoch_start_time, epoch_accuracy))
            print('\t' + ('-' * 59))

    return best_accuracy
In [29]:
# Create the single hidden layer text classifier model
vocab_size = len(train_vocabulary)  # number of rows in the embedding table
embedding_dims = 5  # size of each embedding vector
num_hidden_nodes = 40  # width of the hidden layer
num_classes = 2  # one output score per class (negative, positive)

model = TextClassifierSingleHiddenLayer(vocab_size, embedding_dims, num_hidden_nodes, num_classes).to(device)

In [30]:
# Create model output path: file that crossValidation saves the model parameters to
single_layer_model_out_path = os.path.join(os.getcwd(), 'part_2_output', 'best_single_hidden_layer_weights.pt')

In [31]:
# Train the model with the IMDB training set, using crossValidation's default
# number of epochs and folds
crossValidation(train_texts, classes, model, single_layer_model_out_path)

fold: 0
	-----------------------------------------------------------
	| end of epoch   1 | time:  0.98s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   2 | time:  1.31s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   3 | time:  1.17s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   4 | time:  1.17s | test accuracy    0.500 
	-----------------------------------------------------------
fold: 1
	-----------------------------------------------------------
	| end of epoch   1 | time:  1.04s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end o

In [32]:
# Print the parameters for the best performing model's embedding, hidden, and output layers
# Note: the embedding layer is truncated
best_model = TextClassifierSingleHiddenLayer(vocab_size, embedding_dims, num_hidden_nodes, num_classes)
saved_single_layer_weights = torch.load(single_layer_model_out_path)
best_model.load_state_dict(saved_single_layer_weights)

# Only parameters that take part in training are printed
trainable_parameters = (
    (name, param) for name, param in best_model.named_parameters() if param.requires_grad
)
for name, param in trainable_parameters:
    print(name, param.data)

embedding.weight tensor([[-0.4321, -0.2709, -0.2175, -0.3858,  1.2870],
        [-0.5192,  0.1369, -0.0348, -0.5189, -0.5206],
        [-0.9257,  0.0254, -1.0665, -1.4876,  0.6701],
        ...,
        [-0.7875, -0.3922, -0.4267,  0.2914,  1.0996],
        [ 0.4884, -1.1890,  1.4996, -2.0429, -0.4221],
        [ 1.9716, -0.1226,  0.3288,  1.2408,  0.0205]])
hidden_layer.weight tensor([[ 3.4457e-02,  3.0551e-01,  4.2761e-01,  2.3980e-01, -9.1255e-01],
        [ 2.3182e+00,  1.4155e+00,  7.2949e-01, -1.2646e+00,  1.7920e+00],
        [-4.5588e-01, -2.2772e-01,  4.6248e-02,  4.0328e-01, -7.7850e-01],
        [-2.2462e-01, -1.5815e-01,  3.1412e-01,  5.2820e-01, -7.6246e-01],
        [-2.2733e+00, -1.3973e+00, -9.2936e-01,  1.2767e+00, -1.4810e+00],
        [-1.1290e+00, -8.2203e-01,  9.5091e-02,  1.0133e+00, -1.0069e+00],
        [-1.2967e+00, -8.4557e-01,  5.6132e-02,  6.7173e-01, -1.6004e+00],
        [-4.8835e-01,  1.0687e-01,  1.9625e-01,  2.6353e-01, -1.2168e+00],
        [ 5.6281e-0

### Parameter Optimization Process

After the dataset class, batch collator, and model class were created, the parameter optimization process began with selecting hyperparameters. I wasn't really thinking about results at this stage; I ran into memory issues running the model on my graphics card, so I switched to using the CPU and lowered the embedding dimensions to 5 and hidden layer nodes to 40. With these hyperparameters, I was able to start training the model, fixing tensor dimension errors as they popped up. For the criterion, or loss function, I chose binary cross entropy because the classification problem is a binary one. I used stochastic gradient descent as the optimizer, following what was done in the pytorch tutorial. I may have achieved better results more quickly using Adam, but this is a simple problem so I didn't worry about it too much.

The training loop involved the following steps:<br>
    1) Create train/test dataloaders from the random 10-fold split.<br>
    2) Start loop for each epoch. For each epoch, send the train dataloader to the train function and iterate through the dataloader by batch size.<br>
    3) Clear the gradients for all optimizer parameters.<br>
    4) Predict the batch labels.<br>
    5) Compute the loss.<br>
    6) Compute the gradient for every parameter.<br>
    7) Clip the gradients to avoid gradient value explosion.<br>
    8) Update the parameters with the current gradient.<br>
    9) Compute the running total accuracy and count values for the epoch.<br>
    10) Compute an epoch accuracy score by evaluating the model on the test dataloader.<br>
    11) Update the learning rate if the current total accuracy is greater than the epoch accuracy (the epoch failed to improve model performance). Otherwise, update the total accuracy value to the epoch accuracy and save the model's parameters.<br>
    12) Repeat steps 1-11 for the number of cross fold validation splits (10).<br>

## 2.2 Test Feed-Forward Neural Network

In [33]:
# Run the model on the entire training dataset
# Note: these are the same texts the model was trained on, so the result is a
# training accuracy rather than a held-out estimate.
batch_size = 5
criterion = nn.BCELoss()
training_dataset = ImdbDataset(train_texts, classes)
training_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

training_set_accuracy1 = evaluate(training_dataloader, best_model, criterion)

In [34]:
# Report the single hidden layer model's accuracy on the entire training dataset
# (formatted to 3 significant digits)
print("Best model's accuracy on the entire training dataset: {:.3}".format(training_set_accuracy1))

Best model's accuracy on the entire training dataset: 1.0


## 2.3 Feed-Forward Neural Network with More Hidden Layers

In [35]:
class TextClassifierTwoHiddenLayers(nn.Module):
    """
    Text classifier model.
    Consists of a word embedding layer, two hidden layers, top layer, and activation function.
    Dropout (p=0.2) is applied to the output of the first hidden layer during training.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dims: size of each embedding vector.
        n_layer1_nodes: width of the first hidden layer.
        n_layer2_nodes: width of the second hidden layer.
        num_classes: number of output scores per sample.

    Sources: 
        https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html
        https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/
        http://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html
    """
    def __init__(self, vocab_size, embedding_dims, n_layer1_nodes, n_layer2_nodes, num_classes):
        super(TextClassifierTwoHiddenLayers, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embedding_dims, sparse=True)
        self.hidden_layer1 = nn.Linear(embedding_dims, n_layer1_nodes)
        self.hidden_layer2 = nn.Linear(n_layer1_nodes, n_layer2_nodes)
        self.top_layer = nn.Linear(n_layer2_nodes, num_classes)
        self.act = nn.Sigmoid()
        self.dropout = nn.Dropout(0.2)

    def init_weights(self):
        """
        Re-initialise the embedding and both hidden-layer weights uniformly in [-0.5, 0.5]
        and zero both hidden-layer biases.

        Not called by __init__; call it explicitly to replace the default initialisation.
        """
        # The bound must be read under the same name it is assigned to
        # (a mismatch here raised NameError on every call).
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.hidden_layer1.weight.data.uniform_(-initrange, initrange)
        self.hidden_layer2.weight.data.uniform_(-initrange, initrange)
        # This model has no attribute named `hidden_layer`; zero the biases of the
        # two hidden layers that actually exist.
        self.hidden_layer1.bias.data.zero_()
        self.hidden_layer2.bias.data.zero_()

    def forward(self, text, offsets):
        """Return sigmoid-activated class scores, one row per bag delimited by offsets."""
        embedded = self.embedding(text, offsets)
        hidden1 = self.act(self.hidden_layer1(embedded))
        # nn.Dropout returns a new tensor; the result has to be kept, otherwise
        # dropout has no effect on the network
        hidden1 = self.dropout(hidden1)
        hidden2 = self.act(self.hidden_layer2(hidden1))
        scores = self.top_layer(hidden2)
        return self.act(scores)

In [36]:
# Create the double hidden layer text classifier model
vocab_size = len(train_vocabulary)  # number of rows in the embedding table
embedding_dims = 5  # size of each embedding vector
n_layer1_nodes = 5  # width of the first hidden layer
n_layer2_nodes = 10  # width of the second hidden layer
num_classes = 2  # one output score per class (negative, positive)

two_hidden_layer_model = TextClassifierTwoHiddenLayers(vocab_size, embedding_dims, n_layer1_nodes, n_layer2_nodes, num_classes).to(device)

In [37]:
# Create model output path: file that crossValidation saves the double hidden layer
# model's parameters to
double_layer_model_out_path = os.path.join(os.getcwd(), 'part_2_output', 'best_double_hidden_layer_weights.pt')

In [None]:
# Train the double hidden layer model with the IMDB training set
# (25 epochs per fold instead of crossValidation's default of 4)
crossValidation(train_texts, classes, two_hidden_layer_model, double_layer_model_out_path, epochs=25)

fold: 0
	-----------------------------------------------------------
	| end of epoch   1 | time:  1.09s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   2 | time:  0.90s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   3 | time:  0.96s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   4 | time:  1.05s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch   5 | time:  0.96s | test accuracy    0.500 
	-----------------------------------------------------------
	-----------------------------------------------------------
	| end of epoch 

In [None]:
# Print the parameters for the best performing model's embedding, hidden, and output layers
# Note: the embedding layer is truncated
best_double_layer_model = TextClassifierTwoHiddenLayers(vocab_size, embedding_dims, n_layer1_nodes, n_layer2_nodes, num_classes)
saved_double_layer_weights = torch.load(double_layer_model_out_path)
best_double_layer_model.load_state_dict(saved_double_layer_weights)

# Only parameters that take part in training are printed
trainable_parameters = (
    (name, param) for name, param in best_double_layer_model.named_parameters() if param.requires_grad
)
for name, param in trainable_parameters:
    print(name, param.data)

In [None]:
# Run the double hidden layer model on the entire training dataset
# Note: these are the same texts the model was trained on, so the result is a
# training accuracy rather than a held-out estimate.
batch_size = 5
criterion = nn.BCELoss()
training_dataset = ImdbDataset(train_texts, classes)
training_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

training_set_accuracy2 = evaluate(training_dataloader, best_double_layer_model, criterion)

In [None]:
# Report the double hidden layer model's accuracy on the entire training dataset
# (formatted to 3 significant digits)
print("Best model's accuracy on the entire training dataset: {:.3}".format(training_set_accuracy2))

## 2.4 Neural Network Classification on the Test Set

In [None]:
# Load the test data
# glob returns paths in an unspecified order, so sort them to keep the index of each
# text stable from run to run.
test_files = sorted(glob.glob(os.path.join(os.getcwd(), 'imdb_test_data', '*')))
# Read with an explicit encoding instead of the platform default
# (assumes the review files are UTF-8 — confirm against the data set)
test_texts = [Path(file).read_text(encoding='utf-8') for file in test_files]

In [None]:
# Preprocess the test texts, then replace every token outside the training vocabulary
# with the UNK token; the list is updated in place
test_texts[:] = [
    replaceUnk(preprocessText(text, cached_stopwords, wordnet_lemmatizer), train_vocabulary, unk_token)
    for text in test_texts
]

In [None]:
# Create dictionary mapping each test_texts index to its file name
# (directory and '.txt' extension removed)
test_file_names_texts = {}

for index, file_path in enumerate(test_files):
    # os.path.basename splits on the running platform's path separator(s); a hard-coded
    # '\\' only works for Windows-style paths and raises IndexError everywhere else
    file_name = os.path.basename(file_path)
    if file_name.endswith('.txt'):
        file_name = file_name[:-len('.txt')]
    test_file_names_texts[index] = file_name

In [None]:
def predict(model, test_texts):
    """
    Run the specified model on the specified test texts and return the predictions.

    Args:
        model: module called as model(text, offsets) that returns one row of class
            scores per text.
        test_texts: list of preprocessed text strings.

    Returns:
        A dictionary mapping the position of each text in test_texts to the index of
        its highest-scoring class.
    """
    model.eval()
    batch_size = 5
    # The dataset needs one target per text; None is a placeholder because the labels
    # produced by the collate function are ignored below
    test_dataset = ImdbDataset(test_texts, [None] * len(test_texts))
    # shuffle=False keeps the batches in input order, so a running counter is the text index
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
    predictions = {}

    with torch.no_grad():
        for _, text, offsets in test_dataloader:
            batch_labels = model(text, offsets).argmax(1).tolist()
            for predicted_class in batch_labels:
                predictions[len(predictions)] = predicted_class

    return predictions

In [None]:
# Predict labels for the test texts with whichever model scored higher on the
# training dataset (a tie selects the double hidden layer model)
if training_set_accuracy1 > training_set_accuracy2:
    test_predictions = predict(best_model, test_texts)
else:
    test_predictions = predict(best_double_layer_model, test_texts)

In [None]:
def outputPredictions(output_file_path, target_label, predictions, test_file_dict):
    """
    Write one file name per line for every prediction equal to target_label.

    predictions maps a test file index to its predicted label, and test_file_dict maps
    the same indices to file names. The output file is overwritten.
    """
    with open(output_file_path, 'w') as file:
        file.writelines(
            test_file_dict[test_file_index] + '\n'
            for test_file_index, predicted_label in predictions.items()
            if predicted_label == target_label
        )

In [None]:
# Group file names according to their predicted label and write file names to output files:
# label 1 (positive) goes to pos.txt, label 0 (negative) goes to neg.txt
pos_predictions_out = os.path.join(os.getcwd(), 'part_2_output', 'pos.txt')
neg_predictions_out = os.path.join(os.getcwd(), 'part_2_output', 'neg.txt')
outputPredictions(pos_predictions_out, 1, test_predictions, test_file_names_texts)
outputPredictions(neg_predictions_out, 0, test_predictions, test_file_names_texts)