# 🎭 Sentiment Analysis - IMDb Movie Reviews

**Project**: Natural Language Processing - Text Classification  
**Level**: Advanced  
**Dataset**: IMDb Movie Reviews (Synthetic)  

## 📋 Project Overview

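This project performs sentiment analysis on movie reviews using NLP and deep learning techniques. The reviews are generated synthetically from sentence templates rather than taken from the real IMDb corpus. We'll learn: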

- NLP fundamentals and text preprocessing
- Tokenization, stemming, and lemmatization
- Word embeddings (Word2Vec, GloVe)
- LSTM/GRU models for text classification
- Traditional ML vs deep learning for NLP

Let's analyze movie sentiments! 🎬

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Natural Language Processing
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

# Machine Learning - Traditional
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# Utilities
import warnings
import random
from tqdm import tqdm

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings('ignore')

# Download NLTK data
# 'punkt_tab' is fetched alongside 'punkt' because recent NLTK releases load
# the tokenizer tables used by word_tokenize from 'punkt_tab'; older releases
# keep using 'punkt'.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource, quiet=True)

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print(f"🧠 TensorFlow version: {tf.__version__}")
print("🎭 Ready for sentiment analysis!")

## 2. Data Generation and Exploration

In [None]:
# Build a reproducible synthetic corpus of templated movie reviews
np.random.seed(42)
random.seed(42)

print("🎬 Generating synthetic movie reviews dataset...")

# Sentence skeletons; every {placeholder} is filled in through str.format
positive_templates = [
    "This movie is absolutely {adj}! The {aspect} was {pos_word} and I {enjoyed} every minute.",
    "I {loved} this film! {pos_word} {aspect} and {excellent} acting made it {amazing}.",
    "What a {fantastic} movie! The {aspect} was {brilliant} and the story was {engaging}.",
    "{excellent} film with {outstanding} {aspect}. I would {recommend} it to everyone!",
    "This is one of the {best} movies I've seen. {pos_word} {aspect} and {great} direction."
]

negative_templates = [
    "This movie is {terrible}! The {aspect} was {bad} and I {hated} every minute.",
    "I {disliked} this film. {bad} {aspect} and {poor} acting made it {awful}.",
    "What a {disappointing} movie! The {aspect} was {boring} and the story was {confusing}.",
    "{terrible} film with {poor} {aspect}. I would not {recommend} it to anyone.",
    "This is one of the {worst} movies I've seen. {bad} {aspect} and {terrible} direction."
]

# Vocabulary pools the placeholders draw from
positive_adjectives = ['amazing', 'fantastic', 'brilliant', 'excellent', 'outstanding', 'wonderful', 'superb', 'great']
negative_adjectives = ['terrible', 'awful', 'horrible', 'disappointing', 'boring', 'confusing', 'poor', 'bad', 'worst']
aspects = ['plot', 'acting', 'cinematography', 'soundtrack', 'direction', 'script', 'characters', 'dialogue']
positive_verbs = ['loved', 'enjoyed', 'adored', 'appreciated']
negative_verbs = ['hated', 'disliked', 'despised']

# Placeholder -> pool it is sampled from. One word is drawn per entry for
# every review, in insertion order, whether or not the chosen template uses
# that placeholder; rearranging entries changes the seeded output.
positive_slots = {
    'adj': positive_adjectives,
    'aspect': aspects,
    'pos_word': positive_adjectives,
    'enjoyed': positive_verbs,
    'loved': positive_verbs,
    'excellent': positive_adjectives,
    'amazing': positive_adjectives,
    'fantastic': positive_adjectives,
    'brilliant': positive_adjectives,
    'engaging': positive_adjectives,
    'outstanding': positive_adjectives,
    'great': positive_adjectives,
}
negative_slots = {
    'terrible': negative_adjectives,
    'aspect': aspects,
    'bad': negative_adjectives,
    'hated': negative_verbs,
    'disliked': negative_verbs,
    'poor': negative_adjectives,
    'awful': negative_adjectives,
    'disappointing': negative_adjectives,
    'boring': negative_adjectives,
    'confusing': negative_adjectives,
}

# Placeholders that always receive the same literal word (no random draw)
positive_fixed = {'recommend': 'recommend', 'best': 'best'}
negative_fixed = {'recommend': 'recommend', 'worst': 'worst'}


def _compose_review(templates, slot_pools, fixed_words):
    """Pick a template, draw one word per placeholder, and render the text."""
    skeleton = random.choice(templates)
    drawn = {slot: random.choice(pool) for slot, pool in slot_pools.items()}
    return skeleton.format(**drawn, **fixed_words)


# Generate reviews: the first half is positive (label 1), the rest negative (label 0)
n_reviews = 5000
n_positive = n_reviews // 2
n_negative = n_reviews - n_positive

reviews = [
    _compose_review(positive_templates, positive_slots, positive_fixed)
    for _ in range(n_positive)
]
reviews.extend(
    _compose_review(negative_templates, negative_slots, negative_fixed)
    for _ in range(n_negative)
)
labels = [1] * n_positive + [0] * n_negative

# Add some noise and variation
noise_words = ['really', 'very', 'quite', 'somewhat', 'definitely', 'absolutely', 'completely']


def _maybe_intensify(text):
    """With a 30% chance, insert one random noise word after every ' was '."""
    if random.random() < 0.3:
        intensifier = random.choice(noise_words)
        return text.replace(' was ', f' was {intensifier} ')
    return text


reviews = [_maybe_intensify(text) for text in reviews]

# Assemble the DataFrame and shuffle it so the two classes are interleaved
df = (
    pd.DataFrame({'review': reviews, 'sentiment': labels})
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print("\n🎭 Movie reviews dataset created!")
print(f"Dataset shape: {df.shape}")
print(f"Total reviews: {len(df):,}")
for class_name, class_code in (("Positive", 1), ("Negative", 0)):
    print(f"{class_name} reviews: {(df['sentiment'] == class_code).sum():,}")

In [None]:
# Dataset exploration: size, class balance, length statistics and sample texts
total_reviews = len(df)
print("📊 Dataset Information:")
print(f"Total reviews: {total_reviews:,}")
print(f"Features: {list(df.columns)}")
print(f"Missing values: {df.isnull().sum().sum()}")

# Sentiment distribution (counts are looked up by label: 0 = negative, 1 = positive)
sentiment_counts = df['sentiment'].value_counts()
print("\n🎭 Sentiment Distribution:")
for class_name, class_code in (("Negative", 0), ("Positive", 1)):
    class_total = sentiment_counts[class_code]
    print(f"• {class_name} ({class_code}): {class_total:,} ({class_total/total_reviews:.1%})")

# Review length analysis: characters per review and whitespace-separated words
review_text = df['review']
df['review_length'] = review_text.str.len()
df['word_count'] = review_text.str.split().str.len()

char_lengths = df['review_length']
print("\n📝 Review Statistics:")
print(f"• Average review length: {char_lengths.mean():.1f} characters")
print(f"• Average word count: {df['word_count'].mean():.1f} words")
print(f"• Shortest review: {char_lengths.min()} characters")
print(f"• Longest review: {char_lengths.max()} characters")

# Sample reviews: the first three of each class, in the current row order
print("\n🔍 Sample Reviews:")
positive_samples = df.loc[df['sentiment'] == 1, 'review'].head(3)
negative_samples = df.loc[df['sentiment'] == 0, 'review'].head(3)
for heading, samples in (
    ("\nPositive Reviews:", positive_samples),
    ("\nNegative Reviews:", negative_samples),
):
    print(heading)
    for rank, text in enumerate(samples, start=1):
        print(f"{rank}. {text}")

## 3. Exploratory Data Analysis

In [None]:
# Sentiment and text analysis visualization
# Draws a 2x2 figure: class counts, review-length and word-count histograms
# per class, and mean review length per class. Reads the 'sentiment',
# 'review_length' and 'word_count' columns of `df`.


def _label_bars(ax, bars, offset, fmt):
    """Write each bar's height, formatted by `fmt`, `offset` units above the bar."""
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height + offset,
                fmt(height), ha='center', va='bottom', fontweight='bold')


fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('🎭 Sentiment Analysis - Text Exploration', fontsize=16, fontweight='bold')

# Sentiment distribution
# value_counts() orders by frequency, so sort by label to guarantee the
# values line up with the ['Negative', 'Positive'] tick labels (0, then 1).
sentiment_counts = df['sentiment'].value_counts().sort_index()
colors = ['#FF6B6B', '#4ECDC4']
bars1 = axes[0,0].bar(['Negative', 'Positive'], sentiment_counts.values, color=colors)
axes[0,0].set_title('🎭 Sentiment Distribution')
axes[0,0].set_ylabel('Number of Reviews')

# Add value labels
_label_bars(axes[0,0], bars1, 50, lambda h: f'{int(h):,}')

# Review length distribution by sentiment
positive_lengths = df[df['sentiment'] == 1]['review_length']
negative_lengths = df[df['sentiment'] == 0]['review_length']

axes[0,1].hist(positive_lengths, bins=30, alpha=0.7, label='Positive', color='#4ECDC4', density=True)
axes[0,1].hist(negative_lengths, bins=30, alpha=0.7, label='Negative', color='#FF6B6B', density=True)
axes[0,1].set_title('📝 Review Length Distribution')
axes[0,1].set_xlabel('Review Length (characters)')
axes[0,1].set_ylabel('Density')
axes[0,1].legend()

# Word count distribution by sentiment
positive_words = df[df['sentiment'] == 1]['word_count']
negative_words = df[df['sentiment'] == 0]['word_count']

axes[1,0].hist(positive_words, bins=20, alpha=0.7, label='Positive', color='#4ECDC4', density=True)
axes[1,0].hist(negative_words, bins=20, alpha=0.7, label='Negative', color='#FF6B6B', density=True)
axes[1,0].set_title('📊 Word Count Distribution')
axes[1,0].set_xlabel('Word Count')
axes[1,0].set_ylabel('Density')
axes[1,0].legend()

# Average review length by sentiment
# groupby() sorts its keys, so the means already come out as 0 (negative), 1 (positive)
avg_lengths = df.groupby('sentiment')['review_length'].mean()
bars2 = axes[1,1].bar(['Negative', 'Positive'], avg_lengths.values, color=colors)
axes[1,1].set_title('📏 Average Review Length by Sentiment')
axes[1,1].set_ylabel('Average Length (characters)')

# Add value labels
_label_bars(axes[1,1], bars2, 1, lambda h: f'{h:.1f}')

plt.tight_layout()
plt.show()

print("📊 Key Observations:")
print(f"• Dataset is balanced: {sentiment_counts[0]} negative, {sentiment_counts[1]} positive")
print(f"• Average positive review length: {positive_lengths.mean():.1f} characters")
print(f"• Average negative review length: {negative_lengths.mean():.1f} characters")
print(f"• Average positive word count: {positive_words.mean():.1f} words")
print(f"• Average negative word count: {negative_words.mean():.1f} words")