In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [3]:
# Path to the dataset
train_dir= r'C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\train'
test_dir = r'C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\test'

def load_data(data_dir):
    data = []
    labels = []
    for label in ['pos', 'neg']:
        folder = os.path.join(data_dir, label)
        print(f"Loading data from {folder}")  # Debug print
        for filename in os.listdir(folder):
            with open(os.path.join(folder, filename), 'r', encoding='utf-8') as file:
                data.append(file.read())
            labels.append(1 if label == 'pos' else 0)
    return data, labels

# Load train and test data
train_data, train_labels = load_data(train_dir)
test_data, test_labels = load_data(test_dir)


Loading data from C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\train\pos
Loading data from C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\train\neg
Loading data from C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\test\pos
Loading data from C:\Users\Isi\Desktop\datanerds\Project_1\aclImdb_v1\aclImdb\test\neg


In [4]:
# Split a validation set from training data
train_texts, val_texts, train_labels, val_labels = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

### Model 1: Naive Bayes Classifier

In [5]:
# Using TF-IDF Vectorizer
tfidf_vectorizer= TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_texts)
X_val_tfidf = tfidf_vectorizer.transform(val_texts)
X_test_tfidf = tfidf_vectorizer.transform(test_data)

# Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, train_labels)

# Predictions
nb_preds = nb_model.predict(X_val_tfidf)
print("Naive Bayes Validation Accuracy: ", accuracy_score(val_labels, nb_preds))
print(classification_report(val_labels, nb_preds))

Naive Bayes Validation Accuracy:  0.8482
              precision    recall  f1-score   support

           0       0.84      0.85      0.85      2485
           1       0.85      0.84      0.85      2515

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



### Model 2: Logistic Regression

In [6]:
# Logistic Regression model with L2 regularization
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, train_labels)

# Predictions
lr_preds = lr_model.predict(X_val_tfidf)
print("Logistic Regression Validation Accuracy: ", accuracy_score(val_labels, lr_preds))
print(classification_report(val_labels, lr_preds))


Logistic Regression Validation Accuracy:  0.8878
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      2485
           1       0.88      0.90      0.89      2515

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



### Model 3: LSTM (Recurrent Neural Network)

In [7]:
# Tokenize text for LSTM
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_texts)
X_train_seq = tokenizer.texts_to_sequences(train_texts)
X_val_seq = tokenizer.texts_to_sequences(val_texts)
X_test_seq = tokenizer.texts_to_sequences(test_data)

# Padding sequences to have the same length
maxlen = 200
X_train_seq = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_seq = pad_sequences(X_val_seq, maxlen=maxlen)
X_test_seq = pad_sequences(X_test_seq, maxlen=maxlen)

# Build LSTM Model
lstm_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=5000, output_dim=128, input_length=maxlen),
    tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



In [8]:
# Train the LSTM model
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
lstm_model.fit(X_train_seq, np.array(train_labels), epochs=10, batch_size=64, validation_data=(X_val_seq, np.array(val_labels)), callbacks=[early_stopping])

# Predictions
lstm_preds = (lstm_model.predict(X_val_seq) > 0.5).astype("int32")
print("LSTM Validation Accuracy: ", accuracy_score(val_labels, lstm_preds))

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 351ms/step - accuracy: 0.6999 - loss: 0.5650 - val_accuracy: 0.8098 - val_loss: 0.4229
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 348ms/step - accuracy: 0.8511 - loss: 0.3592 - val_accuracy: 0.8208 - val_loss: 0.3921
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 344ms/step - accuracy: 0.8671 - loss: 0.3229 - val_accuracy: 0.7852 - val_loss: 0.4435
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 340ms/step - accuracy: 0.8598 - loss: 0.3322 - val_accuracy: 0.8508 - val_loss: 0.3652
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 348ms/step - accuracy: 0.9018 - loss: 0.2461 - val_accuracy: 0.8404 - val_loss: 0.3831
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 340ms/step - accuracy: 0.9178 - loss: 0.2136 - val_accuracy: 0.8560 - val_loss: 0.3601
Epoc

#### Evaluation

In [9]:
# Evaluate on the test data
nb_test_preds = nb_model.predict(X_test_tfidf)
print("Naive Bayes Test Accuracy: ", accuracy_score(test_labels, nb_test_preds))

lr_test_preds = lr_model.predict(X_test_tfidf)
print("Logistic Regression Test Accuracy: ", accuracy_score(test_labels, lr_test_preds))

lstm_test_preds = (lstm_model.predict(X_test_seq) > 0.5).astype("int32")
print("LSTM Test Accuracy: ", accuracy_score(test_labels, lstm_test_preds))

Naive Bayes Test Accuracy:  0.84048
Logistic Regression Test Accuracy:  0.879
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 59ms/step
LSTM Test Accuracy:  0.84656


In [10]:
import joblib

# Save the model
joblib.dump(lr_model, 'logistic_regression_model.joblib')


['logistic_regression_model.joblib']

In [12]:
# Save the vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.joblib')

['tfidf_vectorizer.joblib']