In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.layers import TextVectorization
from keras.utils import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Bidirectional
import os

In [8]:
# Read and prepare the data
pos_train = []
directory = 'C:\\Users\\sayantan.manik\\OneDrive - The University of South Dakota\\Desktop\\CSC447\\aclImdb_v1\\aclImdb\\train\\pos'
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as file:
            pos_train.append(file.read())

pos_labels = np.ones(len(pos_train))            

neg_train = []
directory = 'C:\\Users\\sayantan.manik\\OneDrive - The University of South Dakota\\Desktop\\CSC447\\aclImdb_v1\\aclImdb\\train\\neg'
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as file:
            neg_train.append(file.read())
            
neg_labels = np.zeros(len(neg_train))

x_train = np.concatenate((pos_train, neg_train))
y_train = np.concatenate((pos_labels, neg_labels))

pos_test = []
directory = 'C:\\Users\\sayantan.manik\\OneDrive - The University of South Dakota\\Desktop\\CSC447\\aclImdb_v1\\aclImdb\\test\\pos'
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as file:
            pos_test.append(file.read())

neg_test = []
directory = 'C:\\Users\\sayantan.manik\\OneDrive - The University of South Dakota\\Desktop\\CSC447\\aclImdb_v1\\aclImdb\\test\\pos'
for filename in os.listdir(directory):
    if filename.endswith('.txt'):
        with open(os.path.join(directory, filename), 'r') as file:
            neg_test.append(file.read())    

x_test = np.concatenate((pos_test, neg_test))
y_test = np.concatenate((pos_labels, neg_labels))

# Shuffle the data
np.random.seed(0)
shuffle_indices = np.random.permutation(x_train.shape[0])
x_train = x_train[shuffle_indices]
y_train = y_train[shuffle_indices]

In [9]:
max_len = 1000
max_token = 50000

# Preprocessing

def unigram_preprocessing(text_data,max_len):
    vectorizer = CountVectorizer(stop_words="english")
    unigram = vectorizer.fit_transform(text_data)
    unigram = unigram.toarray()
    unigram = pad_sequences(unigram, maxlen=max_len)
    return unigram

def bigram_preprocessing(text_data,max_len):
    vectorizer = CountVectorizer(ngram_range=(2, 2),stop_words="english")
    bigram = vectorizer.fit_transform(text_data)
    bigram = bigram.astype(np.uint8).toarray() # to save memory
    bigram = pad_sequences(bigram,maxlen=max_len)
    return bigram

def tf_df_preprocessing(text_data,max_len):
    vectorizer = TfidfVectorizer(stop_words="english")
    tf_df = vectorizer.fit_transform(text_data)
    tf_df = tf_df.toarray()
    tf_df = pad_sequences(tf_df,maxlen=max_len)
    return tf_df

def LSTM_preprocessing(text_data,max_len,max_token):
    vectorizer = TextVectorization(max_tokens=max_token, output_sequence_length=max_len)
    vectorizer.adapt(text_data)
    vectorizer = vectorizer(text_data)
    return vectorizer

In [15]:
# Bag of Words
def bagofwords_evaluation(x_train, y_train, x_test, y_test, max_len):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(max_len,)),
        # Dropout(0.5),
        Dense(1, activation='sigmoid')
    ])
    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    # Train the model
    model.fit(x_train, y_train, epochs=10, batch_size=32)
    # Evaluate the model
    loss, accuracy = model.evaluate(x_test, y_test)
    print('Test loss:', loss)
    print('Test accuracy:', accuracy)

# LSTM
def LSTM_evaluation(x_train, y_train, x_test, y_test, max_len,max_token):
    model = Sequential([
        Embedding(input_dim=max_token, output_dim=64, input_length=max_len),
        LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=1, batch_size=32) # epochs=1 because it takes too long to train
    loss, accuracy = model.evaluate(x_test, y_test)
    print('Test loss:', loss)
    print('Test accuracy:', accuracy)

In [11]:
# unigram
bagofwords_evaluation(unigram_preprocessing(x_train,max_len), y_train, unigram_preprocessing(x_test,max_len), y_test, max_len)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 1.8275463581085205
Test accuracy: 0.5


In [12]:
# bigram
bagofwords_evaluation(bigram_preprocessing(x_train,max_len), y_train, bigram_preprocessing(x_test,max_len), y_test, max_len)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.7576073408126831
Test accuracy: 0.5


In [13]:
# tf_df
bagofwords_evaluation(tf_df_preprocessing(x_train,max_len), y_train, tf_df_preprocessing(x_test,max_len), y_test, max_len)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6931456923484802
Test accuracy: 0.5


In [16]:
# LSTM
LSTM_evaluation(LSTM_preprocessing(x_train,max_len,max_token), y_train, LSTM_preprocessing(x_test,max_len,max_token), y_test, max_len, max_token)

Test loss: 0.6931659579277039
Test accuracy: 0.5
