1. [20 pts] Ingest, pre-process, and tokenize the reviews with your favorite approach and then generate a Tf-Idf matrix with 1000 top features (i.e., terms).

In [17]:
import re
from nltk.corpus import stopwords
import nltk
def ie_preprocess(document):
    # Sentence Parse
    document = re.sub('<br />', '', document)
    document = re.sub(r'[^\w\s]', '', document)
    sentences = nltk.sent_tokenize(document)
    
    # Word Parse and remove stopwords
    stop_words = set(stopwords.words('english'))
    
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [[word for word in sent if word.lower() not in stop_words and len(word) > 2 and word.isalpha()] for sent in sentences]
    
    return sentences

In [18]:
import csv
import pandas as pd
path='./movie_data.csv'
df=pd.read_csv(path)
df['review']=df['review'].apply(ie_preprocess)

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(df['review'].astype('str'))

In [21]:
y=df['sentiment']

In [5]:
from sklearn.svm import LinearSVC
svc_classifier = LinearSVC()
# Extract the coefficients and feature names
svc_classifier.fit(X,y)

coefficients = svc_classifier.coef_[0]
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_coefficients = list(zip(feature_names, coefficients))

# Sort the feature coefficients
sorted_feature_coefficients = sorted(feature_coefficients, key=lambda x: x[1])

# Print the top and bottom coefficients
def print_top_and_bottom_coefs(sorted_feature_coefficients, num_top=1000):
    print("Keywords for Best:")
    for i, (feature, coefficient) in enumerate(sorted_feature_coefficients[-num_top:][::-1]):
        print(f"#{i+1}, {feature}: {coefficient}")

print_top_and_bottom_coefs(sorted_feature_coefficients)



Keywords for Best:
#1, excellent: 2.5298819401162476
#2, great: 2.2093691132720177
#3, amazing: 2.182372679732128
#4, perfect: 2.133838148669818
#5, superb: 2.070957968843606
#6, wonderful: 1.8569252256014506
#7, perfectly: 1.8308555720637896
#8, hilarious: 1.787231316245075
#9, fantastic: 1.7843864906754971
#10, favorite: 1.776720000500201
#11, brilliant: 1.7351234221576741
#12, solid: 1.6819121754668769
#13, best: 1.6783160660167218
#14, enjoyed: 1.6651351940770112
#15, loved: 1.6465845955064686
#16, today: 1.6020476406570272
#17, enjoyable: 1.5926570709960723
#18, highly: 1.587534819926375
#19, unique: 1.5852193706789572
#20, incredible: 1.4478497078951662
#21, surprised: 1.4125351913587825
#22, masterpiece: 1.3954493505851648
#23, definitely: 1.3641140758184918
#24, powerful: 1.3399162467967922
#25, entertaining: 1.3262425639271371
#26, greatest: 1.2810452346111363
#27, fun: 1.2679582653430144
#28, unlike: 1.1959834365236865
#29, simple: 1.190098968146206
#30, strong: 1.18640081512

Report the size of the matrix.

In [6]:
num_documents, num_features = X.shape

# Print the size
print(f"Number of documents: {num_documents}")
print(f"Number of features (terms): {num_features}")

Number of documents: 50000
Number of features (terms): 1000


Generate the y-vector for sentiment labels for PyTorch, report the size of the vector and the unique values.
(Hint: recall y-vector values must be of type int64 for PyTorch)

In [7]:
import numpy as np
y_tfidf = np.array(df['sentiment'].astype('category').cat.codes, dtype='int64')

In [8]:
y_tfidf.shape

(50000,)

2. [10 pts] Write a 10-fold classification evaluation function with arguments of (classifier,
X, y).
(Hint: we have similar functions in several notebooks)

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


In [22]:
from sklearn.model_selection import train_test_split

# Class labels (integers) are int64 for PyTorch
y_tfidf = np.array(df['sentiment'], dtype='int64')


In [11]:
def kfold_eval_docs(_clf, _Xdocs, _ydocs):
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import accuracy_score

    # Need indexable data structure
    accuracy = []
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    for train_index, test_index in kf.split(_Xdocs, _ydocs):
        _clf.fit(_Xdocs[train_index], _ydocs[train_index])
        ypred = _clf.predict(_Xdocs[test_index])
        accuracy += [accuracy_score(_ydocs[test_index], ypred)]
    return np.array(accuracy)

In [42]:
import torch

# Create a CountVectorizer for text data
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Prepare your features and labels
X_tfidf = tfidf_vectorizer.fit_transform(df['review'].astype('str'))
y_tfidf = df['sentiment']
# Transform the training data using the CountVectorizer

# Initialize and train the Logistic Regression classifier

#SVM was too slow my computer took too long
acc = kfold_eval_docs(LinearSVC(class_weight='balanced'), X_tfidf, y)
print(f"Linear SVM 10-fold CV accuracy= {np.mean(acc):.2f} {chr(177)}{np.std(acc):.3f}")




Linear SVM 10-fold CV accuracy= 0.86 ±0.005


In [34]:
def to_str(review):
    return str(review)

In [36]:
df['review'].head()

0    [[teenager, Martha, Moxley, Maggie, Grace, mov...
1    [[really, like, Kris, Kristofferson, usual, ea...
2    [[SPOILER, read, think, watching, movie, altho...
3    [[people, seen, wonderful, movie, sure, thet, ...
4    [[recently, bought, DVD, forgetting, much, hat...
Name: review, dtype: object

3. [20 pts] Write a PyTorch feed forward neural network with 1 hidden layer.

Instantiate a classifier with 100 hidden layer size, 10 epochs, 0.1 eta and 2000 batch size.

In [89]:
X_torch = torch.tensor(X_tfidf.toarray())
y_torch = torch.tensor(y_tfidf,dtype=torch.float64)

X_train, X_test, y_train, y_test = train_test_split(X_torch, y_torch, test_size=0.3)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1)
y_train = y_train.long()
y_val = y_val.long()


import torch.nn as nn

class MyNetwork(nn.Module):
    """ A PyTorch neural network model """
    def __init__(self):
        super(MyNetwork, self).__init__()
        # Create the input and hidden layers with ReLU activation, 100 hidden layer size
        self.fc1 = nn.Linear(1000, 100)  # A simple input layer, size M x Hsize
        self.fc2 = nn.Linear( 100, 100)  # A simple hidden layer, size Hsize x Hsize
        self.fc3 = nn.Linear( 100,  20)  # A simple output layer, size Hsize x K

    def forward(self, _x, apply_softmax=True):
        _x = _x.float()
        _x = nn.functional.relu(self.fc1(_x))
        _x = nn.functional.relu(self.fc2(_x))
        # the output with Softmax activation.
        _x = self.fc3(_x)
        if apply_softmax:
            _x = nn.functional.softmax(_x, dim=1)
        return _x

# Instantiate the neural network
Net1 = MyNetwork()
print(Net1)

# Set the learning rate - this part is magic of course

import torch.optim as optim
eta = 0.1
# Create a Stochastic Gradient Descent optimizer
optimizer = optim.SGD(Net1.parameters(), lr=eta, momentum=0.9)

# Create the loss function
loss_func = nn.CrossEntropyLoss()

def predict(_x):
    net_out = Net1.forward(_x, apply_softmax=True)
    p_values, indices = net_out.max(dim=1)
    return indices

import sys

epochs=10
minibatch_size=2000

# The main training loop
for i in range(epochs):
    indices = np.arange(X_train.shape[0])
    for start_idx in range(0, indices.shape[0] - minibatch_size + 1, minibatch_size):
        batch_idx = indices[start_idx:start_idx + minibatch_size]
        # step 1.
        optimizer.zero_grad()
        # step 2.
        net_out = Net1.forward(X_train[batch_idx])
        # step 3.
        loss = loss_func(net_out, y_train[batch_idx])
        # step 4.
        loss.backward()
        # step 5.
        optimizer.step()

    y_pred = predict(X_train)
    y_val_pred = predict(X_val)
    
    train_acc = (torch.sum(y_train == y_pred).float() / X_train.shape[0])
    val_acc = (torch.sum(y_val == y_val_pred).float() / X_val.shape[0])
    
    sys.stderr.write(f"\r{i+1}/{epochs} | Cost: {loss:.2f} | Train/Valid Acc.: {train_acc*100:.2f}%/{val_acc*100:.2f}%")
    sys.stderr.flush()

MyNetwork(
  (fc1): Linear(in_features=1000, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=20, bias=True)
)


10/10 | Cost: 2.57 | Train/Valid Acc.: 50.20%/50.97%