In [None]:
import csv
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.naive_bayes import MultinomialNB as NB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report

Setting up data:

In [16]:
def make_data(name, encoding="utf-8"):
    """Load the data rows of a CSV file, dropping its header line.

    Parameters
    ----------
    name : str or path-like
        Path of the CSV file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    list of list of str
        One list of field strings per data row, in file order. An empty
        file or a header-only file yields an empty list.
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded inside quoted fields are kept exactly as written.
    with open(name, mode="r", newline="", encoding=encoding) as file:
        reader = csv.reader(file)
        # Skip the header row; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return list(reader)

In [49]:
# Read every data row once, then take two fixed, non-overlapping windows:
# rows [0, 2000) for training and rows [4000, 6000) for testing.
data = make_data("IMDB Dataset.csv")
train_data, test_data = data[:2000], data[4000:6000]

# Sanity check on the size of the held-out window.
print(len(test_data))

2000


In [50]:
# Column 0 of each row is the text that gets vectorised; column 1 is the
# label the classifiers are trained on.
dataset, labels = [], []
for row in train_data:
    dataset.append(row[0])
    labels.append(row[1])

# Same column split for the held-out rows.
test_dataset, test_labels = [], []
for row in test_data:
    test_dataset.append(row[0])
    test_labels.append(row[1])

Tokenizer method:

In [19]:
# TF (term frequency): fit a CountVectorizer on the training texts and
# transform them in the same call.
tf_vectorizer = CountVectorizer()
tf_train = tf_vectorizer.fit_transform(dataset)
# Dense array copy of the same features; the LDA and QDA cells are fitted on
# this, while the logistic-regression and Naive Bayes cells use tf_train.
tf_dense = tf_train.toarray()

In [20]:
# TF-IDF: same fit-and-transform flow as the count features above, but using
# TfidfVectorizer on the training texts.
idf_vectorizer = TfidfVectorizer()
idf_train = idf_vectorizer.fit_transform(dataset)
# NOTE(review): no later cell visible here reads idf_train or idf_dense;
# confirm the dense copy is needed, since it may be large.
idf_dense = idf_train.toarray()

In [21]:
# TODO: embedding-based features? Might need Word2Vec or GloVe.

LDA:

In [22]:
# Linear Discriminant Analysis with default settings, fitted on the dense
# term-count array and the training labels.
lda_model = LDA()
lda_model.fit(tf_dense, labels)

# Timing note: with the 2000 training rows this fit takes about 1 min.

QDA:

In [23]:
# Quadratic Discriminant Analysis with default settings, fitted on the same
# dense term-count array as the LDA model.
qda_model = QDA()
qda_model.fit(tf_dense, labels)

# Warning seen when fitting: collinear variables. The word-count features
# depend on each other rather than being completely independent.
# NOTE(review): in the single-sentence check further down, QDA is the only
# model that predicts 'negative'; possibly related to this warning. Consider
# QDA's reg_param; confirm before relying on this model.
# Timing note: with the 2000 training rows this fit takes about 6 s.



Logistic Regression:

In [24]:
# Logistic regression fitted on the vectorizer output tf_train directly
# (no .toarray() copy), with the training labels.
# max_iter is raised to 1000; presumably so the solver converges; confirm.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(tf_train, labels)

# Timing note: with the 2000 training rows this fit takes about 0.5 s.

Naive Bayes:

In [25]:
# Multinomial Naive Bayes with default settings.
nb_model = NB()
# Fitted on the vectorizer output tf_train directly; no dense conversion is
# performed for this model.
nb_model.fit(tf_train, labels)

Testing:

In [26]:
# Smoke test: classify one hand-written sentence with all four models.
test_text = ["The soundtrack was beautiful"]
# transform (not fit_transform) so the sentence is encoded with the
# vectorizer already fitted on the training texts.
tf_test = tf_vectorizer.transform(test_text)
tf_test_dense = tf_test.toarray()

# LDA and QDA were fitted on the dense array, so they get the dense features;
# logistic regression and Naive Bayes get the vectorizer output as-is.
print(f"Sentiment LDA: {lda_model.predict(tf_test_dense)}")
print(f"Sentiment QDA: {qda_model.predict(tf_test_dense)}")
print(f"Sentiment Log: {log_model.predict(tf_test)}")
print(f"Sentiment NB: {nb_model.predict(tf_test)}")

Sentiment LDA: ['positive']
Sentiment QDA: ['negative']
Sentiment Log: ['positive']
Sentiment NB: ['positive']


Analysis:

In [54]:
# Encode the held-out texts with the vectorizer fitted on the training texts.
# This overwrites tf_test / tf_test_dense from the single-sentence check.
tf_test = tf_vectorizer.transform(test_dataset)
# NOTE(review): tf_test_dense is not read by any later cell visible here.
tf_test_dense = tf_test.toarray()

In [None]:
# Score the logistic-regression model on the held-out texts and print the
# per-class precision / recall / F1 table.
predictions = log_model.predict(tf_test)
print(
    classification_report(y_true=test_labels, y_pred=predictions)
)
    

              precision    recall  f1-score   support

    negative       0.85      0.82      0.84      1001
    positive       0.83      0.86      0.84       999

    accuracy                           0.84      2000
   macro avg       0.84      0.84      0.84      2000
weighted avg       0.84      0.84      0.84      2000

