In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import nltk
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Suppress warnings for better readability
warnings.filterwarnings('ignore')


In [3]:
# Load dataset
df = pd.read_csv('/content/IMDB Dataset.csv')

In [4]:
# Display dataset info
print("Dataset Shape:", df.shape)
print("Class Distribution:\n", df['sentiment'].value_counts())

Dataset Shape: (50000, 2)
Class Distribution:
 sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [5]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'https?://\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters
    text = text.lower().strip()  # Convert to lowercase and strip spaces
    return text


In [6]:
# Apply text cleaning
df['review'] = df['review'].apply(clean_text)

In [7]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [8]:
# Initialize stopwords, tokenizer, and lemmatizer
stop_words = set(stopwords.words('english'))
w_tokenizer = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

In [10]:
# Function for stopword removal and lemmatization
def preprocess_text(text):
    words = [lemmatizer.lemmatize(word) for word in w_tokenizer.tokenize(text) if word not in stop_words]
    return ' '.join(words)

In [11]:
# Apply text preprocessing
df['review'] = df['review'].apply(preprocess_text)

In [12]:
# Encode labels
encoder = LabelEncoder()
df['encoded_sentiment'] = encoder.fit_transform(df['sentiment'])

In [13]:
# Reduce dataset size for efficiency
df_sample = df.sample(n=20000, random_state=42)

In [14]:
# Train-test split
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
    df_sample['review'], df_sample['encoded_sentiment'], test_size=0.2, random_state=42
)

In [15]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, ngram_range=(1, 2), sublinear_tf=True)
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

In [16]:
# Initialize models
logreg = LogisticRegression(max_iter=500, solver='saga', n_jobs=-1)
nb = MultinomialNB()
sgd = SGDClassifier(loss="hinge", penalty="l2", max_iter=1000, tol=1e-3)

In [17]:
# Train models
start_time = time.time()
logreg.fit(X_train, train_sentiments)
nb.fit(X_train, train_sentiments)
sgd.fit(X_train, train_sentiments)

In [18]:
# Predictions
logreg_preds = logreg.predict(X_test)
nb_preds = nb.predict(X_test)
sgd_preds = sgd.predict(X_test)

In [19]:
# Evaluation
def evaluate_model(model_name, y_true, y_pred):
    print(f"\n{model_name} Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(classification_report(y_true, y_pred))

evaluate_model("Logistic Regression", test_sentiments, logreg_preds)
evaluate_model("Naive Bayes", test_sentiments, nb_preds)
evaluate_model("SGD Classifier", test_sentiments, sgd_preds)

print(f"\nTotal Execution Time: {time.time() - start_time:.2f} seconds")



Logistic Regression Accuracy: 0.8850
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      1996
           1       0.87      0.90      0.89      2004

    accuracy                           0.89      4000
   macro avg       0.89      0.88      0.88      4000
weighted avg       0.89      0.89      0.88      4000


Naive Bayes Accuracy: 0.8802
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      1996
           1       0.88      0.88      0.88      2004

    accuracy                           0.88      4000
   macro avg       0.88      0.88      0.88      4000
weighted avg       0.88      0.88      0.88      4000


SGD Classifier Accuracy: 0.8958
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      1996
           1       0.88      0.91      0.90      2004

    accuracy                           0.90      4000
   macro avg       0.90    