In [1]:
import pandas as pd
import numpy as np
import time
import csv
import fasttext

from scipy import stats

import matplotlib
import matplotlib.pyplot as plt

import statsmodels.api as sm

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

%matplotlib inline

In [2]:
tomatoes_path = "C:/Users/maddo/CS770_data/project_data/rotten_tomatoes_data"

df = pd.read_csv(tomatoes_path + "/rotten_tomatoes.csv")

In [3]:
df = df['review_content'].dropna().reset_index(drop=True)

In [4]:
f = open(tomatoes_path + "/tomatoes_reviews_text.txt", 'a', encoding="utf-8")

for review in df.values:
    f.write(review + " ")
    
f.close()

In [7]:
review_vectors = fasttext.train_unsupervised(tomatoes_path + "/tomatoes_reviews_text.txt")
review_vectors.save_model(tomatoes_path + "/tomatoes_vectors.bin")

In [8]:
model = fasttext.load_model(tomatoes_path + "/tomatoes_vectors.bin")

with open(tomatoes_path + "/tomatoes_vectors.txt", "w", encoding="utf-8") as f:
    # Write vocabulary size and embedding dimensions as the first line
    f.write(f"{len(model.get_words())} {model.get_dimension()}\n")
    
    # Write each word and its vector to the file
    for word in model.get_words():
        vector = model.get_word_vector(word)
        vector_str = " ".join(map(str, vector))
        f.write(f"{word} {vector_str}\n")



In [9]:
imdb_path = "C:/Users/maddo/CS770_data/project_data/imdb_data/IMDB Dataset.csv"

df = pd.read_csv(imdb_path)

df['review'] = df['review'].str.replace("<br /><br />", " ").str.replace('\u200b', '').str.strip()

In [10]:
# X contains text contents of reviews, y contains labeled sentiment
X = df['review']
y = df['sentiment']

In [11]:
# split the data into training, testing and validation data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [12]:
# record start time of data training model
start_time = time.perf_counter()

# tfidf vectorizer using character n-grams (with bounds), n-gram range of 3-6 letters, and keep words in vocab that appear at least 5 times
vectorizer = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,6), min_df=5)
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# dimensionality reduction of sparce vectors from vectorization
# convert sparse vectors into dense word embedding vectors
# somewhat of an imitation of fasttext's word embedding hidden layer
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_dense = svd.fit_transform(X_train_vectors)
X_test_dense = svd.transform(X_test_vectors)

def load_embeddings(file):
    embeddings = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

file = tomatoes_path + "/tomatoes_vectors.txt"  # Example path to 100D GloVe embeddings
embeddings = load_embeddings(file)

# Step 4: Function to compute average word embedding for each document
def compute_average_word_embedding(text, embeddings):
    words = text.split()
    word_vectors = [embeddings.get(word, np.zeros(100)) for word in words]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(100)

# Compute average word embeddings for X_train and X_test
X_train_embeddings = np.array([compute_average_word_embedding(text, embeddings) for text in X_train])
X_test_embeddings = np.array([compute_average_word_embedding(text, embeddings) for text in X_test])

# Step 5: Concatenate reduced TF-IDF vectors and average word embeddings
X_train_combined = np.hstack((X_train_dense, X_train_embeddings))
X_test_combined = np.hstack((X_test_dense, X_test_embeddings))

# scale resulting data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_combined)
X_test_scaled = scaler.transform(X_test_combined)

# get total time taken to train and scale
end_time = time.perf_counter()
initial_elapsed_time = end_time - start_time

print("Time Taken to Train and Scale Raw Text Data:")
print(str(initial_elapsed_time) + " Seconds")

Time Taken to Train and Scale Raw Text Data:
1039.459019 Seconds


In [14]:
# initialize time and model accuracy lists
times = []
accuracies = []

In [15]:
param_grid = {'alpha': [.00001, .0001, .001, .01, .1, 1]}

# record start time of training and predicting model
start_time = time.perf_counter()

# svm with stochastic gradient descent training
sgdlog_clf = SGDClassifier(loss='hinge', n_jobs=-1)

grid_search_2 = GridSearchCV(sgdlog_clf, param_grid, scoring='accuracy', n_jobs=-1)
grid_search_2.fit(X_train_scaled, y_train)
best_sgdlog = grid_search_2.best_estimator_

y_pred = best_sgdlog.predict(X_test_scaled)

# get total time taken to train model and predict
end_time = time.perf_counter()
elapsed_time = end_time - start_time
times.append(elapsed_time + initial_elapsed_time)

accuracies.append(accuracy_score(y_test, y_pred))

print("Tuned Alpha Value:")
print(grid_search_2.best_params_)

print("\nTotal Time Taken to Train and Evaluate Model:")
print(str(elapsed_time + initial_elapsed_time) + " Seconds")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Tuned Alpha Value:
{'alpha': 0.01}

Total Time Taken to Train and Evaluate Model:
1060.0550573 Seconds

Confusion Matrix:
[[8777 1212]
 [1055 8956]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.89      0.88      0.89      9989
    positive       0.88      0.89      0.89     10011

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000

