In [13]:
import pandas as pd
import re
import spacy
from collections import Counter
import pickle

In [14]:
# Download the English model for spaCy
import spacy.cli
spacy.cli.download("en_core_web_sm")

# Load Dataset
df = pd.read_csv('IMDB Dataset.csv')
print(df.head())
print(df.isnull().sum())

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
review       0
sentiment    0
dtype: int64


In [15]:
# Disable componenents to speed up processing and focus on tokenization and lemmatization
nlp = spacy.load("en_core_web_sm",disable=["parser", "ner"])

# Process reviews efficiently using streaming (generator) to avoid memory overflow on large datasets
def preprocess_dataset(texts):
    preprocessed_text = []
    # nlp.pipe processes texts as a stream and is much faster than applying nlp to each text individually
    for doc in nlp.pipe(texts, batch_size=1000):
        # Extract lemmas for non-stop, non-punct tokens
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        preprocessed_text.append(' '.join(tokens))
    return preprocessed_text
            
df['preprocessed_text'] = preprocess_dataset(df['review'])

# Display original and preprocessed text for the first few reviews to verify preprocessing
print(df[['review', 'preprocessed_text']].head())

                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                   preprocessed_text  
0  reviewer mention watch 1 Oz episode hook right...  
1  wonderful little production < br /><br />the f...  
2  think wonderful way spend time hot summer week...  
3  basically family little boy Jake think zombie ...  
4  Petter Mattei love Time money visually stunnin...  


In [16]:
# Build Vocabulary & Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Initialize vectorizers with n-gram support
bow_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=5, max_features=1000) # ngram_range=(1,2) means unigrams + bigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=5, max_features=1000) # min_df=5 ignores terms that appear in less than 5 documents (removes noise)

# Save vectorizers to a pickle file
with open('vectorizers.pkl', 'wb') as f:
    pickle.dump({
        'bow_vectorizer': bow_vectorizer,
        'tfidf_vectorizer': tfidf_vectorizer
    }, f)

In [17]:
# Vectorization Phase
from sklearn.model_selection import train_test_split

# Encode labels first (positive = 1, negative = 0)
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data before vectorization to prevent data leakage
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['preprocessed_text'], 
    df['label'], 
    test_size=0.2, 
    stratify=df['label'], 
    random_state=42
)

# Fit vectorizers on training data only, then transform both sets
X_train_bow = bow_vectorizer.fit_transform(X_train_text)
X_test_bow = bow_vectorizer.transform(X_test_text)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

print(f"Training set size: {len(X_train_text)}")
print(f"Test set size: {len(X_test_text)}")
print(f"BoW Matrix Shape (train): {X_train_bow.shape}")
print(f"TF-IDF Matrix Shape (train): {X_train_tfidf.shape}")

Training set size: 40000
Test set size: 10000
BoW Matrix Shape (train): (40000, 1000)
TF-IDF Matrix Shape (train): (40000, 1000)


In [None]:
# Training Pipeline with Cross-Validation
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
import numpy as np

# Set up Stratified K-Fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize model with regularization to prevent overfitting
model_tfidf = LogisticRegressionCV(
    max_iter=1000, 
    Cs=20,  # Range of inverse regularization strengths to try
    solver='saga',  # Fast for large datasets
    n_jobs=-1,  # Use all CPU cores
    cv = skf,
    random_state=42,
    l1_ratios=(0,),  # Set to (0,) for L2 regularization only, avoiding the l1_ratios warning
    use_legacy_attributes=True  # Set to True to silence the legacy attributes warning
)

model_bow = LogisticRegressionCV(
    max_iter=1000, 
    Cs=20, 
    solver='saga', 
    n_jobs=-1, 
    cv = skf,
    random_state=42,
    l1_ratios=(0,),  # Same as above
    use_legacy_attributes=True  # Same as above
)

model_tfidf_alt = MultinomialNB(
    alpha=1.0,  # Laplace smoothing parameter
    fit_prior=True,  # Learn class prior probabilities
    class_prior=None  # Let the model learn class priors from the data
)

model_bow_alt = MultinomialNB(
    alpha=1.0,
    fit_prior=True,
    class_prior=None
)

# Train TF-IDF model on full training set
model_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)
print("TF-IDF Logistic Regression Test Results:")
print(classification_report(y_test, y_pred_tfidf))
print(confusion_matrix(y_test, y_pred_tfidf))

# Train alternative TF-IDF model (Naive Bayes) on full training set
model_tfidf_alt.fit(X_train_tfidf, y_train)
y_pred_tfidf_alt = model_tfidf_alt.predict(X_test_tfidf)
print("TF-IDF Naive Bayes Test Results:")
print(classification_report(y_test, y_pred_tfidf_alt))
print(confusion_matrix(y_test, y_pred_tfidf_alt))

# Train alternative BoW model (Naive Bayes) on full training set
model_bow_alt.fit(X_train_bow, y_train)
y_pred_bow_alt = model_bow_alt.predict(X_test_bow)
print("BoW Naive Bayes Test Results:")
print(classification_report(y_test, y_pred_bow_alt))
print(confusion_matrix(y_test, y_pred_bow_alt))

# Train BoW model on full training set
model_bow.fit(X_train_bow, y_train)
y_pred_bow = model_bow.predict(X_test_bow)
print("BoW Logistic Regression Test Results:")
print(classification_report(y_test, y_pred_bow))
print(confusion_matrix(y_test, y_pred_bow))

TF-IDF Test Results:
              precision    recall  f1-score   support

           0       0.87      0.85      0.86      5000
           1       0.85      0.87      0.86      5000

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

[[4254  746]
 [ 655 4345]]
TF-IDF Naive Bayes Test Results:
              precision    recall  f1-score   support

           0       0.84      0.81      0.83      5000
           1       0.82      0.85      0.83      5000

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000

[[4057  943]
 [ 748 4252]]
BoW Naive Bayes Test Results:
              precision    recall  f1-score   support

           0       0.84      0.81      0.82      5000
           1       0.82      0.84      0.83      5000

    accuracy                           0.83 

In [22]:
# Save trained models for deployment
with open('sentiment_models.pkl', 'wb') as f:
    pickle.dump({
        'tfidf_model': model_tfidf,
        'tfidf_model_alt': model_tfidf_alt,
        'bow_model': model_bow,
        'bow_model_alt': model_bow_alt,
        'tfidf_vectorizer': tfidf_vectorizer,
        'bow_vectorizer': bow_vectorizer
    }, f)

print("Models and vectorizers saved successfully!")

Models and vectorizers saved successfully!
