In [16]:
# 1.1 Import Libraries
import pandas as pd
import numpy as np
import re 
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib 

print("Libraries imported successfully.")
df = pd.read_csv("IMDB Dataset.csv") 
print("\nDataset Info:")
df.info()

print("\nFirst 9 rows:")
print(df.tail())

df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print("\nSentiment value counts (1: positive, 0: negative):")
print(df['sentiment'].value_counts())

Libraries imported successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

First 9 rows:
                                                  review sentiment
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

Sentiment value counts (1: positive, 0: negative):
sentiment
1    25000
0    25000
Name: count, dtype: int64


In [17]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chenn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
#2. Text Preprocessing Function
stop_words = set(stopwords.words('english'))
# ps = PorterStemmer() 

def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    words = text.split()
    words = [word for word in words if word not in stop_words]
    text = ' '.join(words)
    return text

print("\nPreprocessing text data... (This may take a few minutes)")
df['cleaned_review'] = df['review'].apply(preprocess_text)
print("Text preprocessing complete.")

print("\nExample Preprocessing:")
print("Original:", df['review'][0][:200] + "...") # Show first 200 chars
print("Cleaned:", df['cleaned_review'][0][:200] + "...")


Preprocessing text data... (This may take a few minutes)
Text preprocessing complete.

Example Preprocessing:
Original: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me abo...
Cleaned: one reviewers mentioned watching oz episode youll hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust show faint hearted timid show ...


In [None]:
# 3.1 Split Data into Training and Testing Sets
X = df['cleaned_review']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
 

print(f"\nData Split:")
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

# 3.2 TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000) # You can tune max_features

print("\nFitting TF-IDF Vectorizer and transforming training data...")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

print("Transforming test data...")
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("TF-IDF transformation complete.")
print(f"Shape of TF-IDF matrix (Train): {X_train_tfidf.shape}") # (num_samples, num_features)
print(f"Shape of TF-IDF matrix (Test): {X_test_tfidf.shape}")


Data Split:
Training set size: 37500 samples
Testing set size: 12500 samples

Fitting TF-IDF Vectorizer and transforming training data...
Transforming test data...
TF-IDF transformation complete.
Shape of TF-IDF matrix (Train): (37500, 5000)
Shape of TF-IDF matrix (Test): (12500, 5000)


In [20]:
# 4. Model Training
log_reg = LogisticRegression(C=1.0, max_iter=1000, random_state=42, solver='liblinear') # liblinear is good for binary classification with larger datasets

print("\nTraining Logistic Regression model...")
log_reg.fit(X_train_tfidf, y_train)
print("Model training complete.")

# Optional: Save the model and vectorizer
# joblib.dump(log_reg, 'logistic_regression_model.pkl')
# joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
# print("Model and Vectorizer saved.")


Training Logistic Regression model...
Model training complete.


In [21]:
# 5. Model Evaluation
print("\nEvaluating model on the test set...")
y_pred = log_reg.predict(X_test_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Format:
# [[TN, FP],
#  [FN, TP]]

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))


Evaluating model on the test set...

Accuracy: 0.8881

Confusion Matrix:
[[5496  754]
 [ 645 5605]]

Classification Report:
              precision    recall  f1-score   support

Negative (0)       0.89      0.88      0.89      6250
Positive (1)       0.88      0.90      0.89      6250

    accuracy                           0.89     12500
   macro avg       0.89      0.89      0.89     12500
weighted avg       0.89      0.89      0.89     12500



In [22]:
# 6. Testing on New Examples

# Load saved model and vectorizer if needed
# log_reg = joblib.load('logistic_regression_model.pkl')
# tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example reviews
new_reviews = [
    "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged throughout.",
    "What a waste of time. The plot was predictable and the characters were incredibly boring. I would not recommend this film.",
    "It was an okay movie, not great but not terrible either. Some good moments but overall quite average."
]

print("\n--- Testing on New Reviews ---")
# Preprocess the new reviews
cleaned_new_reviews = [preprocess_text(review) for review in new_reviews]
print("Cleaned Reviews:", cleaned_new_reviews)

new_reviews_tfidf = tfidf_vectorizer.transform(cleaned_new_reviews)
print("Shape of TF-IDF for new reviews:", new_reviews_tfidf.shape)

new_predictions = log_reg.predict(new_reviews_tfidf)
sentiment_labels = {1: 'Positive', 0: 'Negative'}

# Print results
for review, prediction in zip(new_reviews, new_predictions):
    print(f"\nReview: \"{review[:100]}...\"")
    print(f"Predicted Sentiment: {sentiment_labels[prediction]} ({prediction})")

print("\n--- Project Complete ---")


--- Testing on New Reviews ---
Cleaned Reviews: ['movie absolutely fantastic acting superb storyline kept engaged throughout', 'waste time plot predictable characters incredibly boring would recommend film', 'okay movie great terrible either good moments overall quite average']
Shape of TF-IDF for new reviews: (3, 5000)

Review: "This movie was absolutely fantastic! The acting was superb and the storyline kept me engaged through..."
Predicted Sentiment: Positive (1)

Review: "What a waste of time. The plot was predictable and the characters were incredibly boring. I would no..."
Predicted Sentiment: Negative (0)

Review: "It was an okay movie, not great but not terrible either. Some good moments but overall quite average..."
Predicted Sentiment: Negative (0)

--- Project Complete ---
