<a href="https://colab.research.google.com/github/Ghadiiz/movie-sentiment-analyzer-nlp/blob/main/Movie_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def setup_nlp_environment():
    """Check that the NLP stack is importable and fetch the required NLTK data.

    The imports below are local to this function: they only verify that each
    library is installed. Every cell still has to import the names it uses.
    TensorFlow/Keras is treated as optional.

    Returns:
        None. All outcomes (success, missing library, failed download) are
        reported with print() so the notebook keeps running.
    """
    print("Importing necessary libraries...")
    try:
        # Availability checks only -- the bound names are intentionally unused.
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        import seaborn as sns
        import nltk
        from sklearn.model_selection import train_test_split
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.naive_bayes import MultinomialNB
        from sklearn.metrics import classification_report, accuracy_score
        try:
            import tensorflow as tf
            from tensorflow import keras
        except ImportError:
            print("TensorFlow/Keras not found. Skipping import.")

        print("Libraries imported successfully.")

        print("Downloading NLTK data packages...")
        nltk_packages = (
            'stopwords',
            'punkt',
            'wordnet',
            'omw-1.4',    # Open Multilingual WordNet, often needed with wordnet
            'punkt_tab',  # Added to resolve LookupError
        )
        # nltk.download() returns False on failure instead of raising, so the
        # result has to be checked before claiming success.
        failed_packages = [
            package for package in nltk_packages
            if not nltk.download(package, quiet=True)
        ]
        if failed_packages:
            print(f"Warning: failed to download NLTK packages: {', '.join(failed_packages)}")
        else:
            print("NLTK data packages downloaded successfully.")

        # Optional: Set up plotting style
        sns.set_style("whitegrid")
        plt.rcParams['figure.figsize'] = [10, 6]

    except ImportError as e:
        print(f"Error importing a library: {e}. Please ensure all libraries are installed.")
    except Exception as e:
        # Deliberate best-effort: environment setup must not abort the notebook.
        print(f"An unexpected error occurred: {e}")

# Call the function to set up the environment
setup_nlp_environment()

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.datasets import imdb

# --- Load the Keras IMDB dataset and rebuild it as a DataFrame of text reviews ---

# Vocabulary cap handed to the loader: only the 10k most frequent words are kept.
max_features = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Invert the word -> id mapping so integer ids can be turned back into words.
word_index = imdb.get_word_index()
reverse_word_index = {word_id: word for word, word_id in word_index.items()}


def decode_review(text_sequence):
    """Convert one integer-encoded review into a space-separated string.

    Ids are shifted by 3 because 0, 1 and 2 are reserved for padding, start of
    sequence and unknown; any id with no vocabulary entry after the shift is
    rendered as '?'.
    """
    words = (reverse_word_index.get(token_id - 3, '?') for token_id in text_sequence)
    return ' '.join(words)


# Decode both official splits back to plain text.
train_reviews_text = list(map(decode_review, x_train))
test_reviews_text = list(map(decode_review, x_test))

# Merge the two splits into a single frame (train rows first, then test rows).
reviews = train_reviews_text + test_reviews_text
sentiments = np.concatenate((y_train, y_test), axis=0)
imdb_df = pd.DataFrame({'review': reviews, 'sentiment': sentiments})

# Optional: map the numeric labels to 'positive' / 'negative' for readability
# imdb_df['sentiment'] = imdb_df['sentiment'].map({0: 'negative', 1: 'positive'})

# Quick look at the result: head, shape and dtypes.
print("\n--- IMDB Dataset DataFrame ---")
print("First 5 rows:")
print(imdb_df.head())

print(f"\nDataFrame shape: {imdb_df.shape}")

print("\nDataFrame info:")
imdb_df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

print("\n--- Exploratory Data Analysis on IMDB Dataset ---")

# 1) Class balance: number of reviews per sentiment label
print("\n1. Sentiment Distribution:")
plt.figure(figsize=(7, 5))
sns.countplot(data=imdb_df, x='sentiment')
plt.title('Distribution of Sentiments (0: Negative, 1: Positive)')
plt.xlabel('Sentiment')
plt.ylabel('Number of Reviews')
plt.xticks([0, 1], ['Negative (0)', 'Positive (1)'])
plt.show()

# 2) How long the reviews are, measured in whitespace-separated words
print("\n2. Review Length Distribution:")
# Temporary helper column; it is removed again at the end of this cell
imdb_df['review_length'] = imdb_df['review'].apply(
    lambda review: len(str(review).split())
)

plt.figure(figsize=(10, 6))
sns.histplot(imdb_df['review_length'], bins=50, kde=True)
plt.title('Distribution of Review Lengths (in words)')
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.show()

# 3) Null count per column (includes the temporary length column)
print("\n3. Missing Values Check:")
print(imdb_df.isnull().sum())

# 4) Print a handful of randomly drawn reviews with a readable label
print("\n4. Five Random Sample Reviews:")
for _, sampled in imdb_df.sample(5).iterrows():
    sentiment_label = 'Positive' if sampled['sentiment'] == 1 else 'Negative'
    print(f"\nSentiment: {sentiment_label}")
    print(f"Review: {sampled['review']}")

# The length column was only needed for the histogram above
imdb_df = imdb_df.drop('review_length', axis=1)

In [None]:
import re
# from bs4 import BeautifulSoup # Uncomment if you prefer BeautifulSoup for HTML removal and install it

def clean_text(text):
    """
    Normalize a raw review string for downstream tokenization.

    Steps, in order:
    1. Convert to lowercase.
    2. Replace HTML tags with a space.
    3. Remove every character that is not a lowercase letter or whitespace.
    4. Collapse runs of whitespace and trim both ends.

    Args:
        text (str): Raw review text.

    Returns:
        str: The cleaned text, containing only a-z words separated by single spaces.
    """
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove HTML tags (using regex for simplicity as BeautifulSoup might require installation).
    # Tags are replaced with a space rather than '' so that words separated only
    # by markup ("film.<br /><br />The") are not fused into one token ("filmthe").
    text = re.sub(r'<.*?>', ' ', text)
    # If using BeautifulSoup: text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')

    # 3. Remove special characters and punctuation (keeping only letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)

    # 4. Remove extra whitespaces (also absorbs the spaces introduced in step 2)
    return re.sub(r'\s+', ' ', text).strip()

# Derive the 'cleaned_review' column by running clean_text over every raw review
print("Applying text cleaning to 'review' column...")
imdb_df['cleaned_review'] = imdb_df['review'].map(clean_text)
print("Cleaning complete. 'cleaned_review' column created.")

# Spot-check: show three random reviews before and after cleaning
print("\n--- Before and After Cleaning Examples (3 Reviews) ---")
sample_reviews = imdb_df.sample(3)

for _, sampled in sample_reviews.iterrows():
    raw_review = sampled['review']
    cleaned_review = sampled['cleaned_review']
    print(f"\nOriginal Review:\n{raw_review}")
    print(f"Cleaned Review:\n{cleaned_review}")
    print("--------------------------------------------------")

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd # Re-import for explicit use if not already in scope, though it is.

# Shared NLTK resources, created once at module level and reused by process_text()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def process_text(text):
    """
    Tokenize `text`, drop English stopwords, lemmatize what is left,
    and return the surviving tokens joined by single spaces.
    """
    # Tokenize, then filter out stopwords lazily
    kept_tokens = (
        token for token in nltk.word_tokenize(text) if token not in stop_words
    )

    # Lemmatize each surviving token and stitch the result back together
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

print("Applying tokenization, stopword removal, and lemmatization to 'cleaned_review' column...")
imdb_df['processed_review'] = imdb_df['cleaned_review'].map(process_text)
print("Processing complete. 'processed_review' column created.")

# Spot-check: three random reviews before and after processing
print("\n--- Before and After Processing Examples (3 Reviews) ---")
sample_reviews_processed = imdb_df.sample(3)

for _, sampled in sample_reviews_processed.iterrows():
    cleaned_review = sampled['cleaned_review']
    processed_review = sampled['processed_review']
    print(f"\nOriginal Review (Cleaned):\n{cleaned_review}")
    print(f"Processed Review:\n{processed_review}")
    print("--------------------------------------------------")

# Corpus-wide token frequencies after processing
print("\n--- 20 Most Common Words After Processing ---")
all_words = [
    word
    for processed_review in imdb_df['processed_review']
    for word in processed_review.split()
]
word_counts = Counter(all_words)
most_common_words = word_counts.most_common(20)

# Tabular form of the top-20 list, as expected by seaborn
common_words_df = pd.DataFrame(most_common_words, columns=['Word', 'Count'])

plt.figure(figsize=(12, 7))
sns.barplot(data=common_words_df, x='Count', y='Word')
plt.title('20 Most Common Words in Processed Reviews')
plt.xlabel('Count')
plt.ylabel('Word')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd # Ensure pandas is imported
import numpy as np # Ensure numpy is imported

print("\n--- Data Splitting and TF-IDF Vectorization ---")

# Features are the processed review texts; the target is the 0/1 sentiment label
X, y = imdb_df['processed_review'], imdb_df['sentiment']

# 80/20 split, stratified on the label so both parts keep the same class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Dataset split into training and testing sets.")

# TF-IDF restricted to the 5000 highest-frequency terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn vocabulary and IDF weights from the training texts only ...
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# ... and reuse them unchanged to encode the test texts
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("Text data vectorized using TF-IDF.")

# Report the shape of every artifact produced above
print("\nShapes of the datasets:")
for dataset_name, dataset in (
    ("X_train_tfidf", X_train_tfidf),
    ("X_test_tfidf", X_test_tfidf),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{dataset_name} shape: {dataset.shape}")

# Number of terms the vectorizer actually kept
vocabulary_size = len(tfidf_vectorizer.get_feature_names_out())
print(f"\nVocabulary size (number of features): {vocabulary_size}")

# The matrix is sparse, so only a small corner is densified for display
print("\nSample of TF-IDF matrix (first 5 rows and first 10 columns):")
tfidf_corner = X_train_tfidf[:5, :10]
print(tfidf_corner.toarray())

print("\nData preparation complete.")

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

print("\n--- Multinomial Naive Bayes Model Training and Evaluation ---")

# Multinomial Naive Bayes with scikit-learn's default settings
mnb_classifier = MultinomialNB()

# Fit on the TF-IDF training matrix
print("Training Multinomial Naive Bayes classifier...")
mnb_classifier.fit(X_train_tfidf, y_train)
print("Training complete.")

# Predict on both splits so train and test accuracy can be compared
y_train_pred, y_test_pred = (
    mnb_classifier.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Per-class precision / recall / F1 on the held-out split
print("\nClassification Report for Test Set:")
mnb_report = classification_report(y_test, y_test_pred)
print(mnb_report)

print("\nModel evaluation complete.")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

print("\n--- Logistic Regression Model Training and Evaluation ---")

# max_iter is raised above the default so the solver has room to converge
lr_classifier = LogisticRegression(max_iter=1000, random_state=42)

# Fit on the TF-IDF training matrix
print("Training Logistic Regression classifier...")
lr_classifier.fit(X_train_tfidf, y_train)
print("Training complete.")

# Predict on both splits so train and test accuracy can be compared
y_train_pred_lr, y_test_pred_lr = (
    lr_classifier.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

train_accuracy_lr = accuracy_score(y_train, y_train_pred_lr)
test_accuracy_lr = accuracy_score(y_test, y_test_pred_lr)

print(f"\nTraining Accuracy (Logistic Regression): {train_accuracy_lr:.4f}")
print(f"Test Accuracy (Logistic Regression): {test_accuracy_lr:.4f}")

# Per-class precision / recall / F1 on the held-out split
print("\nClassification Report for Test Set (Logistic Regression):")
lr_report = classification_report(y_test, y_test_pred_lr)
print(lr_report)

print("\nLogistic Regression Model evaluation complete.")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

print("\n--- Random Forest Classifier Training and Evaluation ---")

# 100 trees, fixed seed; n_jobs=-1 spreads the work over all available cores
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# Fit on the TF-IDF training matrix
print("Training Random Forest classifier... (This may take a few minutes)")
rf_classifier.fit(X_train_tfidf, y_train)
print("Training complete.")

# Predict on both splits so train and test accuracy can be compared
y_train_pred_rf, y_test_pred_rf = (
    rf_classifier.predict(features) for features in (X_train_tfidf, X_test_tfidf)
)

train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)

print(f"\nTraining Accuracy (Random Forest): {train_accuracy_rf:.4f}")
print(f"Test Accuracy (Random Forest): {test_accuracy_rf:.4f}")

# Per-class precision / recall / F1 on the held-out split
print("\nClassification Report for Test Set (Random Forest):")
rf_report = classification_report(y_test, y_test_pred_rf)
print(rf_report)

print("\nRandom Forest Classifier evaluation complete.")

In [None]:
# Prepare Data for LSTM Training
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("--- Preparing Data for LSTM Model ---")

# Tokenizer / padding configuration
max_features = 5000  # Maximum number of words to keep
maxlen = 200  # Maximum length of sequences

tokenizer = Tokenizer(num_words=max_features)

# Fit the vocabulary on the training texts only, mirroring how the TF-IDF
# vectorizer is fitted, so nothing from the test split leaks into the model.
tokenizer.fit_on_texts(X_train)

# Encode the SAME rows that produced y_train / y_test. Those labels come from
# the shuffled, stratified train_test_split, so slicing imdb_df positionally
# (iloc[:40000] / iloc[40000:]) would pair each review with another review's label.
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad / truncate every sequence to exactly `maxlen` tokens
X_train_lstm = pad_sequences(X_train_sequences, maxlen=maxlen)
X_test_lstm = pad_sequences(X_test_sequences, maxlen=maxlen)

# Labels are those of the split, converted to plain arrays for Keras
y_train_lstm = np.asarray(y_train)
y_test_lstm = np.asarray(y_test)

# Fail loudly if features and labels ever get out of step again
if len(X_train_lstm) != len(y_train_lstm) or len(X_test_lstm) != len(y_test_lstm):
    raise ValueError("LSTM features and labels have different lengths; re-run the train/test split cell.")

print(f"\nX_train_lstm shape: {X_train_lstm.shape}")
print(f"X_test_lstm shape: {X_test_lstm.shape}")
print("\nSample padded sequence (first review, first 20 tokens):")
print(X_train_lstm[0][:20])
print(f"\nTokenizer vocabulary size: {len(tokenizer.word_index)}")
print("\nData preparation for LSTM complete!")


In [None]:
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

print("--- Building and Training LSTM Model ---")

# Hyperparameters local to this cell; max_features and maxlen are taken from
# the data-preparation cell that precedes it (NNGrFHE8rC2Y)
embedding_dim = 128
lstm_units = 128
dropout_rate = 0.2

# Embedding -> single LSTM layer -> sigmoid unit for binary classification
model = Sequential([
    Embedding(input_dim=max_features, output_dim=embedding_dim, input_length=maxlen),
    LSTM(units=lstm_units, dropout=dropout_rate),
    Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

print("\nModel Summary:")
model.summary()

# Fit for 5 epochs; validation_split=0.2 sets aside part of the training data
print("\nTraining LSTM model... (This may take a while)")
history = model.fit(
    X_train_lstm,
    y_train_lstm,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)
print("Training complete.")

# Learning curves: one panel for accuracy, one for loss
print("\nPlotting Training History...")
hist_df = pd.DataFrame(history.history)

plt.figure(figsize=(12, 5))

curve_panels = (
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Loss'),
)
for panel_number, (train_key, val_key, quantity) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.plot(hist_df[train_key], label=f'Training {quantity}')
    plt.plot(hist_df[val_key], label=f'Validation {quantity}')
    plt.title(f'Training and Validation {quantity}')
    plt.xlabel('Epoch')
    plt.ylabel(quantity)
    plt.legend()

plt.tight_layout()
plt.show()

# Final score on the held-out test sequences
print("\nEvaluating LSTM model on the test set...")
loss, accuracy = model.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print(f"Test Accuracy (LSTM): {accuracy:.4f}")
print(f"Test Loss (LSTM): {loss:.4f}")

print("\nLSTM model training and evaluation complete.")


In [None]:
# Evaluate LSTM and Compare with Other Models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

print("--- Evaluating LSTM Model ---")

# Positive-class probabilities, thresholded at 0.5 to obtain hard 0/1 labels
y_pred_lstm_prob = model.predict(X_test_lstm, verbose=0)
y_pred_lstm = (y_pred_lstm_prob > 0.5).astype(int).flatten()

# Headline metrics (precision / recall / F1 are for the positive class)
lstm_accuracy = accuracy_score(y_test_lstm, y_pred_lstm)
lstm_precision = precision_score(y_test_lstm, y_pred_lstm)
lstm_recall = recall_score(y_test_lstm, y_pred_lstm)
lstm_f1 = f1_score(y_test_lstm, y_pred_lstm)

# ROC / AUC are computed up front so the comparison table is built in one pass
fpr_lstm, tpr_lstm, _ = roc_curve(y_test_lstm, y_pred_lstm_prob)
auc_lstm = auc(fpr_lstm, tpr_lstm)

print("\nLSTM Model Performance:")
print(f"Accuracy: {lstm_accuracy:.4f} ({lstm_accuracy*100:.2f}%)")
print(f"Precision: {lstm_precision:.4f}")
print(f"Recall: {lstm_recall:.4f}")
print(f"F1-Score: {lstm_f1:.4f}")

# Confusion Matrix for LSTM
print("\n--- LSTM Confusion Matrix ---")
cm_lstm = confusion_matrix(y_test_lstm, y_pred_lstm)

plt.figure(figsize=(8, 6))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.title('LSTM Confusion Matrix', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Model comparison table. The classical baselines are scored live from the
# classifiers fitted in the earlier cells instead of being pasted in as
# literals, so the table cannot drift out of date when the data or models
# change. All rows use the same positive-class precision / recall / F1
# definition as the LSTM row.
print("\n--- Model Performance Comparison (All 4 Models) ---")

baseline_models = {
    'Multinomial Naive Bayes': mnb_classifier,
    'Logistic Regression': lr_classifier,
    'Random Forest': rf_classifier,
}

comparison_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC']
comparison_rows = []
for baseline_name, baseline_clf in baseline_models.items():
    baseline_pred = baseline_clf.predict(X_test_tfidf)
    # Column 1 of predict_proba is the probability of the positive class (label 1)
    baseline_prob = baseline_clf.predict_proba(X_test_tfidf)[:, 1]
    baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_prob)
    comparison_rows.append({
        'Model': baseline_name,
        'Accuracy': accuracy_score(y_test, baseline_pred),
        'Precision': precision_score(y_test, baseline_pred),
        'Recall': recall_score(y_test, baseline_pred),
        'F1-Score': f1_score(y_test, baseline_pred),
        'AUC': auc(baseline_fpr, baseline_tpr),
    })

comparison_rows.append({
    'Model': 'LSTM',
    'Accuracy': lstm_accuracy,
    'Precision': lstm_precision,
    'Recall': lstm_recall,
    'F1-Score': lstm_f1,
    'AUC': auc_lstm,
})

comparison_df = pd.DataFrame(comparison_rows, columns=comparison_columns)
# Dict-of-lists view kept under its original name
comparison_data = comparison_df.to_dict(orient='list')
print("\n", comparison_df.to_string(index=False))

# Find best model (by accuracy, across all four)
best_model_idx = comparison_df['Accuracy'].idxmax()
best_model = comparison_df.loc[best_model_idx, 'Model']
best_accuracy = comparison_df.loc[best_model_idx, 'Accuracy']
print(f"\n🏆 Best Model: {best_model} with {best_accuracy:.2%} accuracy")

# Best classical baseline by AUC, used to annotate the ROC plot
baseline_df = comparison_df[comparison_df['Model'] != 'LSTM']
best_baseline = baseline_df.loc[baseline_df['AUC'].idxmax()]

# ROC Curve for LSTM
print("\n--- LSTM ROC Curve ---")

plt.figure(figsize=(10, 8))
plt.plot(fpr_lstm, tpr_lstm, label=f'LSTM (AUC = {auc_lstm:.4f})', linewidth=2, color='purple')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing', linewidth=1)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)', fontsize=12)
plt.ylabel('True Positive Rate (TPR)', fontsize=12)
plt.title('LSTM ROC Curve', fontsize=14, fontweight='bold')
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.text(0.6, 0.3,
         f"Previous Best:\n{best_baseline['Model']}\n(AUC = {best_baseline['AUC']:.4f})",
         bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5), fontsize=10)
plt.show()

print("\n✅ LSTM evaluation and model comparison complete!")
print(f"\nNote: LSTM achieved {lstm_accuracy:.2%} accuracy.")
# Only claim the LSTM lost when the measured numbers say so
if best_model != 'LSTM':
    print("The LSTM underperformed compared to classical ML models.")
    print("Possible reasons: Limited training data, needs more epochs, or hyperparameter tuning.")
    print(f"{best_model} remains the best model for this task.")


In [None]:
# ========================================
# FUNCTION + UNIT TEST: clean_text()
# ========================================

import re
from bs4 import BeautifulSoup

print("="*70)
print("FUNCTION 1: clean_text() - Remove HTML, Special Chars, Lowercase")
print("="*70 + "\n")

# Define the function
def clean_text(text):
    """
    Clean text by removing HTML tags, converting to lowercase,
    removing special characters, and removing extra whitespace.

    Args:
        text (str): Input text string

    Returns:
        str: Cleaned text string
    """
    # Remove HTML tags. A space separator keeps words that were divided only by
    # markup apart ("film.<br/>The" must not become "filmthe"); the surplus
    # spaces are collapsed by the whitespace step below.
    text = BeautifulSoup(text, 'html.parser').get_text(separator=' ')
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, keep only letters and spaces
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

print("✓ Function defined\n")
print("-"*70)
print("UNIT TESTS:")
print("-"*70 + "\n")

# Each case: (title, raw input, expected cleaned output)
clean_text_cases = [
    (
        "HTML Tags & Special Characters",
        "<p>This is a <b>great</b> movie!!! Amazing @ #1 film.</p>",
        "this is a great movie amazing film",
    ),
    (
        "Mixed Case Letters",
        "ThIS MoVIE Was ABSOLUTELY fantastic",
        "this movie was absolutely fantastic",
    ),
    (
        "Extra Whitespace",
        "Great    film!!!    \n\n  Loved   it.",
        "great film loved it",
    ),
]

case_results = []
for case_number, (case_title, case_input, case_expected) in enumerate(clean_text_cases, start=1):
    case_actual = clean_text(case_input)
    case_passed = case_actual == case_expected
    case_results.append(case_passed)
    case_status = '✅ PASSED' if case_passed else '❌ FAILED'

    print(f"Test {case_number}: {case_title}")
    print(f"  Input:    '{case_input}'")
    print(f"  Expected: '{case_expected}'")
    print(f"  Actual:   '{case_actual}'")
    print(f"  Status:   {case_status}\n")

# Summary
total_tests = len(clean_text_cases)
passed_tests = sum(case_results)
print("="*70)
print(f"SUMMARY: clean_text() - {passed_tests}/{total_tests} tests PASSED")
print("="*70)


In [None]:
# ========================================
# FUNCTION + UNIT TEST: remove_stopwords()
# ========================================

from nltk.corpus import stopwords

print("="*70)
print("FUNCTION 2: remove_stopwords() - Remove Common English Stopwords")
print("="*70 + "\n")

# Filled lazily on the first call: loading the corpus list and building the
# set once avoids repeating that work for every review.
_STOPWORD_CACHE = {}

# Define the function
def remove_stopwords(text):
    """
    Remove common English stopwords from text.

    Args:
        text (str): Input text string (space-separated words)

    Returns:
        str: Text with stopwords removed
    """
    english_stop_words = _STOPWORD_CACHE.get('english')
    if english_stop_words is None:
        english_stop_words = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = english_stop_words

    return ' '.join(
        word for word in text.split() if word not in english_stop_words
    )

print("✓ Function defined\n")
print("-"*70)
print("UNIT TESTS:")
print("-"*70 + "\n")

# Each case: (title, input text, expected output)
remove_stopwords_cases = [
    (
        "Common Stopwords",
        "this is a great movie with the best actors",
        "great movie best actors",
    ),
    (
        "No Stopwords Present",
        "amazing fantastic incredible",
        "amazing fantastic incredible",
    ),
    (
        "Empty String",
        "",
        "",
    ),
]

case_results = []
for case_number, (case_title, case_input, case_expected) in enumerate(remove_stopwords_cases, start=1):
    case_actual = remove_stopwords(case_input)
    case_passed = case_actual == case_expected
    case_results.append(case_passed)
    case_status = '✅ PASSED' if case_passed else '❌ FAILED'

    print(f"Test {case_number}: {case_title}")
    print(f"  Input:    '{case_input}'")
    print(f"  Expected: '{case_expected}'")
    print(f"  Actual:   '{case_actual}'")
    print(f"  Status:   {case_status}\n")

# Summary
total_tests = len(remove_stopwords_cases)
passed_tests = sum(case_results)
print("="*70)
print(f"SUMMARY: remove_stopwords() - {passed_tests}/{total_tests} tests PASSED")
print("="*70)


In [None]:
# ========================================
# FUNCTION + UNIT TEST: lemmatize_text()
# ========================================

from nltk.stem import WordNetLemmatizer

print("="*70)
print("FUNCTION 3: lemmatize_text() - Convert Words to Base Form")
print("="*70 + "\n")

# Define the function
def lemmatize_text(text):
    """
    Lemmatize every whitespace-separated word of a string.

    Args:
        text (str): Input text string (space-separated words)

    Returns:
        str: The lemmatized words joined by single spaces
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    return ' '.join(
        wordnet_lemmatizer.lemmatize(token) for token in text.split()
    )

print("✓ Function defined\n")
print("-"*70)
print("UNIT TESTS:")
print("-"*70 + "\n")

# Each case: (title, input text, expected output).
# lemmatize_text() calls WordNetLemmatizer.lemmatize() without a POS tag, so
# every word is treated as a noun: plural nouns are reduced, while verb forms
# and comparatives ("running", "played", "better") come back unchanged.
lemmatize_text_cases = [
    ("Verbs and Adjectives", "running played better", "running played better"),
    ("Plural Nouns", "movies watching actors", "movie watching actor"),
    ("Various Word Forms", "loves caring happiness", "love caring happiness"),
]

case_results = []
for case_number, (case_title, case_input, case_expected) in enumerate(lemmatize_text_cases, start=1):
    case_actual = lemmatize_text(case_input)
    # Compare against the expected output instead of reporting success
    # merely because the call did not raise.
    case_passed = case_actual == case_expected
    case_results.append(case_passed)
    case_status = '✅ PASSED' if case_passed else '❌ FAILED'

    print(f"Test {case_number}: {case_title}")
    print(f"  Input:    '{case_input}'")
    print(f"  Expected: '{case_expected}'")
    print(f"  Output:   '{case_actual}'")
    print(f"  Status:   {case_status}\n")

# Summary
total_tests = len(lemmatize_text_cases)
passed_tests = sum(case_results)
print("="*70)
print(f"SUMMARY: lemmatize_text() - {passed_tests}/{total_tests} tests PASSED")
print("="*70)


In [None]:
# ========================================
# INTEGRATION TEST: Complete Pipeline
# ========================================

print("="*70)
print("FUNCTION 4: preprocess_pipeline() - Complete Preprocessing")
print("="*70 + "\n")

# Define the complete pipeline function
def preprocess_pipeline(text):
    """
    Run the full preprocessing chain on one raw string.

    The stages are applied in this order:
    1. clean_text (HTML removal, lowercase, special chars)
    2. remove_stopwords
    3. lemmatize_text

    Args:
        text (str): Raw input text

    Returns:
        str: Fully preprocessed text
    """
    cleaned = clean_text(text)
    without_stopwords = remove_stopwords(cleaned)
    return lemmatize_text(without_stopwords)

print("✓ Function defined\n")
print("="*70)
print("INTEGRATION TEST: Full Preprocessing Pipeline")
print("="*70 + "\n")

# Full movie review example
original_text = """
<p>This movie was <b>ABSOLUTELY</b> fantastic!!!
I loved watching it with my friends.
The actors were amazing and the story was incredible.
Rating: 10/10 @@@
</p>
"""

print("Original Text:")
print(original_text)
print("\n" + "-"*70)
print("STEP-BY-STEP TRANSFORMATION:")
print("-"*70 + "\n")

# Step 1: Clean text
step1 = clean_text(original_text)
print("1. After clean_text():")
print(f"   '{step1}'\n")

# Step 2: Remove stopwords
step2 = remove_stopwords(step1)
print("2. After remove_stopwords():")
print(f"   '{step2}'\n")

# Step 3: Lemmatize
step3 = lemmatize_text(step2)
print("3. After lemmatize_text():")
print(f"   '{step3}'\n")

# Complete pipeline
final_output = preprocess_pipeline(original_text)
print("-"*70)
print("\n✓ Complete Pipeline Output:")
print(f"   '{final_output}'\n")

# Verification: the one-shot pipeline must reproduce the manual three-step result
pipeline_correct = (final_output == step3)
verification_status = '✅ PASSED' if pipeline_correct else '❌ FAILED'
print("="*70)
print(f"Pipeline Verification: {verification_status}")
print("="*70)
print("\n✓ Integration test complete!")
# Only claim success when the verification above actually passed
if pipeline_correct:
    print("✓ All preprocessing functions work correctly together.")
else:
    print("✗ Pipeline output differs from the step-by-step result.")


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

print("\n--- Confusion Matrices for All Models (Test Set) ---")

# One confusion matrix per classical model, all against the same test labels
cm_mnb = confusion_matrix(y_test, y_test_pred)
cm_lr = confusion_matrix(y_test, y_test_pred_lr)
cm_rf = confusion_matrix(y_test, y_test_pred_rf)

# Three heatmaps side by side, one axis per model
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

class_labels = ['Negative', 'Positive']
titled_matrices = (
    ('Multinomial Naive Bayes', cm_mnb),
    ('Logistic Regression', cm_lr),
    ('Random Forest', cm_rf),
)

for axis, (model_title, matrix) in zip(axes, titled_matrices):
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=axis,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    axis.set_title(model_title)
    axis.set_xlabel('Predicted Label')
    axis.set_ylabel('True Label')

plt.tight_layout()  # keep neighbouring panels from overlapping
plt.show()

print("Confusion matrix visualization complete.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

print("\n--- Model Performance Comparison ---")

# --- 1. Test-set metrics, one group of four per classifier ---
# Precision / recall / F1 use the support-weighted average across classes.

# Naive Bayes (mnb)
mnb_accuracy = accuracy_score(y_test, y_test_pred)
mnb_precision, mnb_recall, mnb_f1, _ = precision_recall_fscore_support(
    y_test, y_test_pred, average='weighted')

# Logistic Regression (lr)
lr_accuracy = accuracy_score(y_test, y_test_pred_lr)
lr_precision, lr_recall, lr_f1, _ = precision_recall_fscore_support(
    y_test, y_test_pred_lr, average='weighted')

# Random Forest (rf)
rf_accuracy = accuracy_score(y_test, y_test_pred_rf)
rf_precision, rf_recall, rf_f1, _ = precision_recall_fscore_support(
    y_test, y_test_pred_rf, average='weighted')

# --- 2. Long-format table: one row per (metric, model) pair ---
# Rows are grouped by metric with the models cycling inside each group;
# that order determines the bar and legend order in the chart below.
model_names = ['Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest']
scores_by_metric = [
    ('Accuracy', [mnb_accuracy, lr_accuracy, rf_accuracy]),
    ('Precision', [mnb_precision, lr_precision, rf_precision]),
    ('Recall', [mnb_recall, lr_recall, rf_recall]),
    ('F1-Score', [mnb_f1, lr_f1, rf_f1]),
]
metrics_data = {
    'Model': model_names * len(scores_by_metric),
    'Metric': [metric_name
               for metric_name, metric_scores in scores_by_metric
               for _score in metric_scores],
    'Value': [score
              for _metric_name, metric_scores in scores_by_metric
              for score in metric_scores],
}

df_metrics = pd.DataFrame(metrics_data)

# --- 3. Grouped bar chart of the test metrics ---
plt.figure(figsize=(12, 7))
sns.barplot(data=df_metrics, x='Model', y='Value', hue='Metric', palette='viridis')
plt.title('Comparison of Model Performance on Test Set')
plt.ylabel('Score')
plt.ylim(0, 1)  # all four metrics live in [0, 1]
plt.legend(title='Metric', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

# --- 4. Training vs. test accuracy table ---
# The accuracies come from the cells that trained each model.
accuracy_comparison_data = {
    'Model': model_names * 2,
    'Type': ['Training'] * len(model_names) + ['Test'] * len(model_names),
    'Accuracy': [train_accuracy, train_accuracy_lr, train_accuracy_rf,
                 test_accuracy, test_accuracy_lr, test_accuracy_rf],
}

df_accuracy_comparison = pd.DataFrame(accuracy_comparison_data)

# --- 5. Training vs. test accuracy chart ---
plt.figure(figsize=(10, 6))
sns.barplot(data=df_accuracy_comparison, x='Model', y='Accuracy', hue='Type', palette='plasma')
plt.title('Training vs. Test Accuracy Comparison Across Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1.05)  # a little headroom so bars at 1.0 are not clipped
plt.legend(title='Accuracy Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

print("Comparison visualizations complete.")

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

print("\n--- ROC Curves and AUC Scores (Test Set) ---")

plt.figure(figsize=(10, 8))

# Stage 1: score the test set with each model, keeping column 1 of
# predict_proba (taken to be the positive class, sentiment=1).
y_pred_proba_mnb = mnb_classifier.predict_proba(X_test_tfidf)[:, 1]
y_pred_proba_lr = lr_classifier.predict_proba(X_test_tfidf)[:, 1]
y_pred_proba_rf = rf_classifier.predict_proba(X_test_tfidf)[:, 1]

# Stage 2: ROC points per model (the thresholds are not needed).
fpr_mnb, tpr_mnb, _ = roc_curve(y_test, y_pred_proba_mnb)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

# Stage 3: area under each curve.
auc_mnb = auc(fpr_mnb, tpr_mnb)
auc_lr = auc(fpr_lr, tpr_lr)
auc_rf = auc(fpr_rf, tpr_rf)

# Stage 4: draw the curves; plot order fixes the legend order and line colors.
plt.plot(fpr_mnb, tpr_mnb, label=f'Multinomial Naive Bayes (AUC = {auc_mnb:.4f})')
plt.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.4f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.4f})')

# Diagonal reference line and axis decoration.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curves for Sentiment Analysis Models')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

print("ROC curve visualization complete.")

In [None]:
from IPython.display import display, HTML
import ipywidgets as widgets

# Section banner for the interactive demo cell.
for banner_line in (
    "\n--- Interactive Sentiment Analyzer ---",
    "Using the trained Logistic Regression model.\n",
):
    print(banner_line)

def predict_sentiment(text):
    """
    Classify the sentiment of a review with the trained Logistic Regression model.

    Args:
        text: Raw review text. ``None``, empty, or whitespace-only input is
            treated as "nothing to analyze".

    Returns:
        A ``(label, confidence)`` tuple. ``label`` is ``"Positive"`` when the
        model predicts class ``1`` and ``"Negative"`` otherwise;
        ``confidence`` is the probability of the predicted class as a
        percentage. For blank input the tuple is
        ``("Please enter some text for analysis.", 0.0)``.
    """
    # Guard first: nothing below is meaningful without text, and `None`
    # would otherwise fail on `.strip()`.
    if not text or not text.strip():
        return "Please enter some text for analysis.", 0.0

    # 1. Preprocessing with the notebook's own cleaning helpers
    cleaned = clean_text(text)
    processed = process_text(cleaned)

    # 2. Vectorization — transform only; the vectorizer was fitted earlier
    #    and must not be re-fitted on a single review.
    text_tfidf = tfidf_vectorizer.transform([processed])

    # 3. Prediction using the Logistic Regression model
    prediction = lr_classifier.predict(text_tfidf)[0]
    prediction_proba = lr_classifier.predict_proba(text_tfidf)[0]

    # predict_proba columns follow `classes_`, so look up the column of the
    # predicted label instead of using the label itself as an index (which
    # is only correct when the classes happen to be exactly [0, 1]).
    class_index = list(lr_classifier.classes_).index(prediction)
    confidence = prediction_proba[class_index] * 100

    sentiment_label = "Positive" if prediction == 1 else "Negative"
    return sentiment_label, confidence

# Review text box, pre-filled with a sample review so the demo can be run
# without typing anything.
review_box_layout = widgets.Layout(width='80%', height='100px')
text_input = widgets.Textarea(
    value='This movie was absolutely fantastic! I loved every single moment of it.',
    placeholder='Type your review here',
    description='Review:',
    disabled=False,
    layout=review_box_layout,
)

# Output area that receives the prediction text.
output_label = widgets.Output()

def on_button_click(b):
    """Analyze the text currently in the review box and show the result.

    ``b`` is the argument supplied by the button's click callback; it is not
    used. Output is written into ``output_label`` after clearing whatever
    was shown before.
    """
    with output_label:
        output_label.clear_output()
        sentiment, confidence = predict_sentiment(text_input.value)
        # A 0.0 confidence marks the "please enter some text" reply, in
        # which case the first element is a message rather than a label.
        is_message_only = isinstance(sentiment, str) and confidence == 0.0
        if not is_message_only:
            print(f"Predicted Sentiment: {sentiment}")
            print(f"Confidence: {confidence:.2f}%")
        else:
            print(sentiment)

# Button that triggers the analysis of whatever is in the text box.
predict_button = widgets.Button(description="Analyze Sentiment")
predict_button.on_click(on_button_click)

# Example reviews
example_reviews = [
    "This movie was absolutely fantastic! I loved every single moment of it.",
    "The film was utterly boring and a complete waste of time. I regret watching it.",
    "It had its moments, but overall it was just an average film, nothing special."
]


def _example_option(position, review_text):
    """Return the ``(label, value)`` dropdown entry for one example review.

    Reviews longer than 50 characters get a numbered, truncated label;
    shorter ones are shown as-is. The value is always the full review.
    """
    if len(review_text) > 50:
        return (f'Example {position + 1}: {review_text[:50]}...', review_text)
    return (review_text, review_text)


example_dropdown = widgets.Dropdown(
    options=[_example_option(position, review_text)
             for position, review_text in enumerate(example_reviews)],
    description='Load Example:',
    disabled=False,
    layout=widgets.Layout(width='80%')
)

def on_example_select(change):
    """Copy the newly selected example review into the review text box.

    ``change`` is the change notification delivered to an ``observe``
    callback; only its ``new`` attribute is read.
    """
    chosen_review = change.new
    text_input.value = chosen_review

# Picking an entry in the dropdown loads it into the text box.
example_dropdown.observe(on_example_select, names='value')

# Render the controls top to bottom: examples, text box, button, result area.
display(example_dropdown, text_input, predict_button, output_label)

# Also run the examples once non-interactively so the cell output shows
# predictions even when widgets are not rendered.
print("\n--- Example Reviews to Test --- ")
for example_number, review in enumerate(example_reviews, start=1):
    sentiment, confidence = predict_sentiment(review)
    preview = review[:70]
    print(f"Example {example_number}: '{preview}...'\n  -> Predicted: {sentiment}, Confidence: {confidence:.2f}%\n")

print("Interactive interface ready above.")

In [None]:
from IPython.display import display, Markdown
import pandas as pd

print("\n# Project Summary: IMDB Sentiment Analysis")
print("\nThis project aimed to build and evaluate machine learning models for sentiment analysis on the IMDB movie review dataset. We explored three classic text classification algorithms: Multinomial Naive Bayes, Logistic Regression, and Random Forest Classifier.")

print("\n## 1. Model Performance Comparison (Test Set)")

# Comparison table built from the metrics computed in the evaluation cells
# (those cells must have been run first).
metrics_summary = {
    'Model': ['Multinomial Naive Bayes', 'Logistic Regression', 'Random Forest'],
    'Accuracy': [mnb_accuracy, lr_accuracy, rf_accuracy],
    'Precision': [mnb_precision, lr_precision, rf_precision],
    'Recall': [mnb_recall, lr_recall, rf_recall],
    'F1-Score': [mnb_f1, lr_f1, rf_f1],
    'AUC Score': [auc_mnb, auc_lr, auc_rf]
}
df_summary = pd.DataFrame(metrics_summary).round(4)

# Render the table as Markdown so the notebook displays it formatted.
markdown_table = df_summary.to_markdown(index=False)
display(Markdown(markdown_table))

print("\n## 2. Conclusion")
# The figures quoted below are interpolated from the same variables that feed
# the table above, so the text cannot drift from the table after a re-run.
# NOTE(review): the qualitative ranking ("performed best", "overfitting",
# "100% training accuracy") is still the author's reading of the recorded
# run — re-check it if the models are retrained.
print(
    "Based on the evaluation metrics, particularly test accuracy and AUC score, "
    "the **Logistic Regression model performed best** among the three classifiers tested. "
    f"It achieved the highest accuracy ({lr_accuracy:.4f}) and AUC score ({auc_lr:.4f}) on the unseen test data. "
    "While the Random Forest model showed 100% training accuracy, "
    f"its significantly lower test accuracy ({rf_accuracy:.4f}) and AUC ({auc_rf:.4f}) indicated overfitting. "
    f"Multinomial Naive Bayes provided a solid baseline ({mnb_accuracy:.4f} accuracy, {auc_mnb:.4f} AUC) with good generalization, "
    "but Logistic Regression demonstrated superior predictive power for this binary sentiment classification task."
)

print("\n## 3. Limitations of the Current Approach")
limitations = (
    "1.  **Dataset Specificity**: The model is trained on movie reviews, and its performance might not generalize well to other domains (e.g., product reviews, social media posts) without retraining.",
    "2.  **Binary Classification**: The sentiment is classified only as positive or negative, ignoring neutral sentiment or more nuanced emotional states.",
    "3.  **Static Features**: TF-IDF, while effective, captures word importance but doesn't fully understand semantic meaning or context, limiting the model's ability to handle complex language phenomena like sarcasm.",
    "4.  **No Aspect-Based Sentiment**: The current approach classifies the overall sentiment of a review, not sentiment towards specific aspects mentioned within the review (e.g., 'The plot was great, but the acting was terrible').",
    "5.  **Language Dependency**: The preprocessing steps (stopwords, lemmatization) and the TF-IDF vectorizer are English-specific.",
)
for limitation in limitations:
    print(limitation)

print("\n## 4. Future Improvements")
future_improvements = (
    "1.  **Advanced Embeddings and Deep Learning**: Explore word embeddings (Word2Vec, GloVe, FastText) or contextual embeddings (BERT, RoBERTa, GPT) combined with deep learning architectures (LSTMs, GRUs, Transformers) for better semantic understanding.",
    "2.  **Multi-class or Ordinal Sentiment**: Expand the classification to include 'neutral' or a sentiment scale (e.g., 1-5 stars) to capture more granular opinions.",
    "3.  **Aspect-Based Sentiment Analysis (ABSA)**: Implement techniques to identify and classify sentiment towards specific entities or aspects within a review.",
    "4.  **Ensemble Methods**: Experiment with more sophisticated ensemble techniques or stacking models to combine the strengths of different classifiers.",
    "5.  **Hyperparameter Tuning**: Conduct more extensive hyperparameter tuning for all models to potentially boost their performance further.",
    "6.  **Explainable AI (XAI)**: Incorporate methods like LIME or SHAP to understand why a model makes a particular sentiment prediction, improving trustworthiness and interpretability.",
    "7.  **Real-time Data and Deployment**: Consider building a real-time sentiment analysis API or integrating the model into a web application for practical use.",
)
for improvement in future_improvements:
    print(improvement)