In [11]:
import pandas as pd
import re
import spacy
from collections import Counter
import pickle
from sklearn.metrics import roc_auc_score

In [12]:
# Download the English model for spaCy
import spacy.cli
spacy.cli.download("en_core_web_sm")

# Load Dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
review       0
sentiment    0
dtype: int64


In [13]:
# Disable componenents to speed up processing and focus on tokenization and lemmatization
nlp = spacy.load("en_core_web_sm",disable=["parser", "ner"])

# Process reviews efficiently using streaming (generator) to avoid memory overflow on large datasets
def preprocess_dataset(texts):
    # Remove HTML tags first
    texts = [re.sub(r'<[^>]+>', '', text) for text in texts]
    preprocessed_text = []
    # nlp.pipe processes texts as a stream and is much faster than applying nlp to each text individually
    for doc in nlp.pipe(texts, batch_size=1000):
        # Extract lemmas for non-stop, non-punct, alphabetic tokens, lowercased
        tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
        preprocessed_text.append(' '.join(tokens))
    return preprocessed_text
            

# Display original and preprocessed text for the first few reviews to verify preprocessing
# Note: Preprocessing will be done after train/test split to prevent data leakage

In [14]:
# Build Vocabulary & Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize vectorizers with n-gram support
bow_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=5, max_features=5000) # ngram_range=(1,2) means unigrams + bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=5000) # min_df=5 ignores terms that appear in less than 5 documents (removes noise)

In [15]:
# Vectorization Phase
from sklearn.model_selection import train_test_split

# Encode labels first (positive = 1, negative = 0)
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data into train/test (80/20) to prevent data leakage
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['review'], 
    df['label'], 
    test_size=0.2, 
    stratify=df['label'], 
    random_state=42
)

# Preprocess training and test sets separately
X_train_text = preprocess_dataset(X_train_text)
X_test_text = preprocess_dataset(X_test_text)

# Fit vectorizers on training data only, then transform both sets
X_train_bow = bow_vectorizer.fit_transform(X_train_text)
X_test_bow = bow_vectorizer.transform(X_test_text)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

print(f"Training set size: {len(X_train_text)}")
print(f"Test set size: {len(X_test_text)}")
print(f"BoW Matrix Shape (train): {X_train_bow.shape}")
print(f"TF-IDF Matrix Shape (train): {X_train_tfidf.shape}")

Training set size: 40000
Test set size: 10000
BoW Matrix Shape (train): (40000, 5000)
TF-IDF Matrix Shape (train): (40000, 5000)


In [16]:
# Training Pipeline with Cross-Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Set up Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize models
model_tfidf = LogisticRegressionCV(
    max_iter=1000, 
    Cs=20, 
    solver='saga', 
    n_jobs=-1, 
    cv=skf,
    random_state=42,
    l1_ratios=(0,),
    use_legacy_attributes=True
)

model_bow = LogisticRegressionCV(
    max_iter=1000, 
    Cs=20, 
    solver='saga', 
    n_jobs=-1, 
    cv=skf,
    random_state=42,
    l1_ratios=(0,),
    use_legacy_attributes=True
)

model_tfidf_alt = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
model_bow_alt = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)

# Train all models on training set
model_tfidf.fit(X_train_tfidf, y_train)
model_tfidf_alt.fit(X_train_tfidf, y_train)
model_bow_alt.fit(X_train_bow, y_train)
model_bow.fit(X_train_bow, y_train)

# Evaluate on test set
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
print("TF-IDF Logistic Regression Test Results:")
print(classification_report(y_test, y_pred_tfidf))
print(confusion_matrix(y_test, y_pred_tfidf))
print(f"ROC AUC: {roc_auc_score(y_test, model_tfidf.predict_proba(X_test_tfidf)[:, 1]):.4f}")

y_pred_tfidf_alt = model_tfidf_alt.predict(X_test_tfidf)
print("TF-IDF Naive Bayes Test Results:")
print(classification_report(y_test, y_pred_tfidf_alt))
print(confusion_matrix(y_test, y_pred_tfidf_alt))
print(f"ROC AUC: {roc_auc_score(y_test, model_tfidf_alt.predict_proba(X_test_tfidf)[:, 1]):.4f}")

y_pred_bow_alt = model_bow_alt.predict(X_test_bow)
print("BoW Naive Bayes Test Results:")
print(classification_report(y_test, y_pred_bow_alt))
print(confusion_matrix(y_test, y_pred_bow_alt))
print(f"ROC AUC: {roc_auc_score(y_test, model_bow_alt.predict_proba(X_test_bow)[:, 1]):.4f}")

y_pred_bow = model_bow.predict(X_test_bow)
print("BoW Logistic Regression Test Results:")
print(classification_report(y_test, y_pred_bow))
print(confusion_matrix(y_test, y_pred_bow))
print(f"ROC AUC: {roc_auc_score(y_test, model_bow.predict_proba(X_test_bow)[:, 1]):.4f}")





TF-IDF Logistic Regression Test Results:
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      5000
           1       0.88      0.90      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4382  618]
 [ 517 4483]]
ROC AUC: 0.9544
TF-IDF Naive Bayes Test Results:
              precision    recall  f1-score   support

           0       0.87      0.83      0.85      5000
           1       0.84      0.87      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

[[4155  845]
 [ 640 4360]]
ROC AUC: 0.9282
BoW Naive Bayes Test Results:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84      5000
           1       0.84      0.85      0.85    

In [17]:
# Save all trained models and vectorizers for deployment
with open('sentiment_models.pkl', 'wb') as f:
    pickle.dump({
        'tfidf_model': model_tfidf,
        'tfidf_model_alt': model_tfidf_alt,
        'bow_model': model_bow,
        'bow_model_alt': model_bow_alt,
        'tfidf_vectorizer': tfidf_vectorizer,
        'bow_vectorizer': bow_vectorizer
    }, f)

print("All models and vectorizers saved successfully!")

All models and vectorizers saved successfully!
