In [1]:
import csv
from gensim.models import Word2Vec

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

In [2]:
def preprocess(text):
    """Lowercase `text`, strip punctuation, and drop a small set of stopwords.

    Every character that is neither alphanumeric nor whitespace is replaced
    by a space before the text is split on whitespace.

    Returns the remaining tokens as a list of strings, in their original order.
    """
    ignored = {'is', 'the', 'a', 'and', 'in', 'on', 'to', 'of'}
    cleaned = ''.join(
        ch if (ch.isalnum() or ch.isspace()) else ' '
        for ch in text.lower()
    )
    return [word for word in cleaned.split() if word not in ignored]

In [3]:
def make_data(name, encoding=None):
    """Read a CSV file and return its rows, skipping the first (header) line.

    Parameters:
        name: path of the CSV file to read.
        encoding: text encoding handed to ``open``; ``None`` (the default)
            keeps the platform default.

    Returns:
        A list with one list of field strings per row after the header.
        An empty file yields an empty list.
    """
    # newline='' leaves newline handling to the csv module, so line breaks
    # embedded in quoted fields are kept exactly as written.
    with open(name, mode='r', newline='', encoding=encoding) as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header; tolerate a completely empty file
        return list(reader)

# Load every row after the header, then take two separate slices:
# rows 0-1999 for training and rows 4000-5999 for evaluation.
data = make_data("IMDB_reviews.csv")
train_data, test_data = data[:2000], data[4000:6000]

# Column 0 of each row becomes the model input, column 1 the label.
dataset = [record[0] for record in train_data]
labels = [record[1] for record in train_data]

test_dataset = [record[0] for record in test_data]
test_labels = [record[1] for record in test_data]

In [4]:
# Turn each training text into a list of tokens
tokenized_corpus = [preprocess(review) for review in dataset]

# Fit the Word2Vec embeddings on the tokenized training texts only
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [5]:
import numpy as np

def tokenize_with_word2vec(text, model, vector_size=100):
    """Convert `text` into one embedding vector per token.

    Parameters:
        text: raw string; it is tokenized with ``preprocess``.
        model: object whose ``wv`` attribute supports ``token in model.wv``
            and ``model.wv[token]`` lookups.
        vector_size: length of the zero vector substituted for tokens that
            are missing from ``model.wv``; it should match the length of the
            vectors stored in the model.

    Returns:
        A 2-D array with one row per token. When the text yields no tokens
        the result has shape ``(0, vector_size)``, so callers can still read
        ``shape[1]``.
    """
    tokens = preprocess(text)
    embeddings = []

    for token in tokens:
        if token in model.wv:
            embeddings.append(model.wv[token])
        else:
            embeddings.append(np.zeros(vector_size))  # Handle out-of-vocabulary words

    if not embeddings:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # the embedding dimension is still available to downstream code.
        return np.zeros((0, vector_size))
    return np.array(embeddings)

# average word embeddings for sklearn models
def average_word_embeddings(embeddings, vector_size=100):
    """Collapse a sequence of token vectors into a single mean vector.

    Parameters:
        embeddings: array-like with one row per token.
        vector_size: length of the zero vector returned when `embeddings`
            is empty and carries no second dimension (for example
            ``np.array([])`` or ``[]``).

    Returns:
        The column-wise mean of `embeddings`, or an all-zero vector when
        there are no rows. For an empty 2-D input the zero vector has
        ``embeddings.shape[1]`` entries.
    """
    embeddings = np.asarray(embeddings)
    if len(embeddings) > 0:
        return np.mean(embeddings, axis=0)
    if embeddings.ndim >= 2:
        return np.zeros(embeddings.shape[1])
    # An empty 1-D input has no shape[1] to read; fall back to vector_size.
    return np.zeros(vector_size)

In [6]:
# Embed every training text, then reduce each one to a single mean vector.
tokenized = [tokenize_with_word2vec(review, word2vec_model) for review in dataset]
average_embeddings = [average_word_embeddings(token_vectors) for token_vectors in tokenized]

# fit() hands back the fitted estimator, so each model is created and
# trained in one expression on the same averaged features and labels.
lda_model = LDA().fit(np.array(average_embeddings), labels)

qda_model = QDA().fit(np.array(average_embeddings), labels)

log_model = LogisticRegression(max_iter=1000).fit(np.array(average_embeddings), labels)

nb_model = NB().fit(np.array(average_embeddings), labels)

In [8]:
# Embed the evaluation texts with the Word2Vec model fitted on the training texts
test_tokenized = [tokenize_with_word2vec(review, word2vec_model) for review in test_dataset]

# The classifiers need one fixed-length vector per text, so the token
# vectors have to be averaged; this could affect accuracy.
test_average = [average_word_embeddings(token_vectors) for token_vectors in test_tokenized]

lda_predictions = lda_model.predict(test_average)
qda_predictions = qda_model.predict(test_average)
nb_predictions = nb_model.predict(test_average)
log_predictions = log_model.predict(test_average)


# One heading line followed by the per-class report for each classifier
print("LDA", classification_report(test_labels, lda_predictions), sep="\n")
print("QDA", classification_report(test_labels, qda_predictions), sep="\n")
print("Logistic Regression", classification_report(test_labels, log_predictions), sep="\n")
print("NaiveBayes", classification_report(test_labels, nb_predictions), sep="\n")

LDA
              precision    recall  f1-score   support

    negative       0.76      0.77      0.77      1001
    positive       0.77      0.75      0.76       999

    accuracy                           0.76      2000
   macro avg       0.76      0.76      0.76      2000
weighted avg       0.76      0.76      0.76      2000

QDA
              precision    recall  f1-score   support

    negative       0.69      0.69      0.69      1001
    positive       0.69      0.69      0.69       999

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000

Logistic Regression
              precision    recall  f1-score   support

    negative       0.65      0.66      0.66      1001
    positive       0.66      0.65      0.65       999

    accuracy                           0.65      2000
   macro avg       0.65      0.65      0.65      2000
weighted avg       0.65      0.65      0.65   