# 🎬 Movie Recommendation System

**Project**: Recommender System - Collaborative Filtering  
**Level**: Intermediate  
**Dataset**: Synthetic MovieLens-style ratings (generated in this notebook)  

## 📋 Project Overview

This project builds a movie recommendation system using collaborative filtering techniques. We'll learn:

- Collaborative filtering fundamentals
- User-based and item-based filtering
- Similarity metrics (cosine, Pearson)
- Matrix factorization techniques
- Recommendation evaluation

Let's build a movie recommendation engine! 🍿

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and recommender systems
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr

# Utilities
import warnings
from collections import defaultdict
import random

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Older releases only know the legacy 'seaborn' name and raise OSError for the
# new one, so fall back instead of aborting the whole setup cell.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
# Applied after the style sheet so the palette's colour cycle is the one in effect.
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print("🎬 Ready for movie recommendation analysis!")

## 2. Data Generation and Exploration

In [None]:
# Generate synthetic MovieLens-style dataset
#
# Produces three objects used by the rest of the notebook:
#   movies_df        - one row per movie (movie_id, title, year, genre)
#   user_preferences - user_id -> array of 1-3 preferred genres
#   ratings_df       - one row per unique (user_id, movie_id) pair with a 1-5 rating
#
# Seed both RNGs so the generated data is reproducible across runs.
np.random.seed(42)
random.seed(42)

# Parameters
n_users = 1000
n_movies = 500
n_ratings = 50000
# NOTE: the generator in this cell does not read sparsity_level; the achieved
# sparsity follows from n_ratings / (n_users * n_movies) after de-duplication.
sparsity_level = 0.9  # 90% sparsity (realistic for recommender systems)

print(f"🎬 Generating synthetic MovieLens dataset...")
print(f"Users: {n_users:,}")
print(f"Movies: {n_movies:,}")
print(f"Target ratings: {n_ratings:,}")

# Create movie metadata
genres = ['Action', 'Comedy', 'Drama', 'Horror', 'Romance', 'Sci-Fi', 'Thriller', 'Animation']
movie_years = np.random.randint(1980, 2024, n_movies)

movies_data = []
for movie_id in range(1, n_movies + 1):
    title = f"Movie_{movie_id}"
    year = movie_years[movie_id - 1]  # movie ids are 1-based, the array is 0-based
    genre = np.random.choice(genres)
    movies_data.append({
        'movie_id': movie_id,
        'title': title,
        'year': year,
        'genre': genre
    })

movies_df = pd.DataFrame(movies_data)

# movie_id -> genre lookup, built once. The rating loop below needs one genre
# per sampled rating; a dict lookup avoids filtering movies_df with a boolean
# mask on every one of the n_ratings iterations.
genre_by_movie = {movie['movie_id']: movie['genre'] for movie in movies_data}

# Generate ratings with realistic patterns
ratings_data = []
user_preferences = {}  # Store user genre preferences

# Create user preferences
for user_id in range(1, n_users + 1):
    # Each user has between 1 and 3 distinct preferred genres
    preferred_genres = np.random.choice(genres, size=np.random.randint(1, 4), replace=False)
    user_preferences[user_id] = preferred_genres

# Generate ratings based on preferences
for _ in range(n_ratings):
    user_id = np.random.randint(1, n_users + 1)
    movie_id = np.random.randint(1, n_movies + 1)

    # Get movie genre
    movie_genre = genre_by_movie[movie_id]

    # Rating based on user preference
    if movie_genre in user_preferences[user_id]:
        # Higher rating for preferred genres
        rating = np.random.choice([3, 4, 5], p=[0.2, 0.4, 0.4])
    else:
        # Lower rating for non-preferred genres
        rating = np.random.choice([1, 2, 3, 4, 5], p=[0.2, 0.3, 0.3, 0.15, 0.05])

    ratings_data.append({
        'user_id': user_id,
        'movie_id': movie_id,
        'rating': rating
    })

# Remove duplicates (same user rating same movie multiple times).
# Users and movies are sampled independently, so a pair can repeat; only the
# first rating drawn for each pair is kept.
ratings_df = pd.DataFrame(ratings_data).drop_duplicates(['user_id', 'movie_id'])

print(f"\n📊 Dataset created successfully!")
print(f"Actual ratings: {len(ratings_df):,}")
print(f"Movies with ratings: {ratings_df['movie_id'].nunique():,}")
print(f"Active users: {ratings_df['user_id'].nunique():,}")
print(f"Sparsity: {1 - len(ratings_df)/(n_users * n_movies):.1%}")

In [None]:
# Dataset exploration: headline numbers, then per-genre and per-rating
# breakdowns, then a small sample of ratings joined with movie metadata.
print("📊 Dataset Information:")
total_ratings = len(ratings_df)
rating_column = ratings_df['rating']
print(f"Total ratings: {total_ratings:,}")
print(f"Unique users: {ratings_df['user_id'].nunique():,}")
print(f"Unique movies: {ratings_df['movie_id'].nunique():,}")
print(f"Rating scale: {rating_column.min()} - {rating_column.max()}")
print(f"Average rating: {rating_column.mean():.2f}")

# Number of movies in each genre (value_counts lists the largest first)
print("\n🎭 Genre Distribution:")
genre_counts = movies_df['genre'].value_counts()
for genre_name, movies_in_genre in genre_counts.items():
    print(f"• {genre_name}: {movies_in_genre} movies")

# Number of ratings per star value, ordered by star value, with its share of all ratings
print("\n⭐ Rating Distribution:")
rating_counts = rating_column.value_counts().sort_index()
for stars, times_given in rating_counts.items():
    share = times_given / total_ratings
    print(f"• {stars} stars: {times_given:,} ratings ({share:.1%})")

# Display sample data: the first 10 ratings with the movie's title and genre attached
print("\n🔍 Sample Ratings:")
ratings_with_movies = ratings_df.merge(movies_df, on='movie_id')
sample_ratings = ratings_with_movies.head(10)
sample_columns = ['user_id', 'title', 'genre', 'rating']
print(sample_ratings[sample_columns])

## 3. Exploratory Data Analysis

In [None]:
# Visualization of rating patterns: a 2x2 grid with the rating distribution,
# ratings per genre, ratings per user and ratings per movie.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('🎬 Movie Rating Analysis', fontsize=16, fontweight='bold')
ax_ratings, ax_genres = axes[0, 0], axes[0, 1]
ax_users, ax_movies = axes[1, 0], axes[1, 1]

# Top-left: how many times each star value was given
rating_counts = ratings_df['rating'].value_counts().sort_index()
bars1 = ax_ratings.bar(rating_counts.index, rating_counts.values, color='skyblue')
ax_ratings.set_title('⭐ Rating Distribution')
ax_ratings.set_xlabel('Rating')
ax_ratings.set_ylabel('Count')
ax_ratings.grid(True, alpha=0.3)

# Write each bar's count centred horizontally, 100 data units above its top
for rect in bars1:
    bar_top = rect.get_height()
    label_x = rect.get_x() + rect.get_width()/2.
    ax_ratings.text(label_x, bar_top + 100, f'{int(bar_top):,}',
                    ha='center', va='bottom', fontweight='bold')

# Top-right: number of ratings received by each genre
genre_ratings = ratings_df.merge(movies_df, on='movie_id')['genre'].value_counts()
genre_positions = range(len(genre_ratings))
bars2 = ax_genres.bar(genre_positions, genre_ratings.values, color='lightcoral')
ax_genres.set_title('🎭 Genre Popularity (by Ratings)')
ax_genres.set_xlabel('Genre')
ax_genres.set_ylabel('Number of Ratings')
ax_genres.set_xticks(genre_positions)
ax_genres.set_xticklabels(genre_ratings.index, rotation=45)
ax_genres.grid(True, alpha=0.3)

# Bottom row: histograms of ratings per user (left) and ratings per movie (right)
user_activity = ratings_df['user_id'].value_counts()
movie_popularity = ratings_df['movie_id'].value_counts()
histogram_specs = [
    (ax_users, user_activity, 'lightgreen', '👥 User Activity Distribution',
     'Number of Ratings per User', 'Number of Users'),
    (ax_movies, movie_popularity, 'gold', '🎬 Movie Popularity Distribution',
     'Number of Ratings per Movie', 'Number of Movies'),
]
for hist_ax, per_entity_counts, fill, heading, x_text, y_text in histogram_specs:
    hist_ax.hist(per_entity_counts.values, bins=30, color=fill, alpha=0.7)
    hist_ax.set_title(heading)
    hist_ax.set_xlabel(x_text)
    hist_ax.set_ylabel(y_text)
    hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text summary of the two bottom-row distributions
print("📊 Key Statistics:")
print(f"• Most active user rated {user_activity.max()} movies")
print(f"• Average user rated {user_activity.mean():.1f} movies")
print(f"• Most popular movie has {movie_popularity.max()} ratings")
print(f"• Average movie has {movie_popularity.mean():.1f} ratings")

In [None]:
# Create user-item matrix, then summarise ratings per genre and per movie
print("🔧 Creating user-item rating matrix...")

# One row per user, one column per movie; user/movie pairs with no rating hold 0
user_item_matrix = ratings_df.pivot_table(
    values='rating',
    index='user_id',
    columns='movie_id',
    fill_value=0,
)

print(f"User-item matrix shape: {user_item_matrix.shape}")
empty_cells = (user_item_matrix == 0).sum().sum()
print(f"Matrix sparsity: {empty_cells / user_item_matrix.size:.1%}")
filled_cells = (user_item_matrix != 0).sum().sum()
print(f"Non-zero entries: {filled_cells:,}")

# Convert to sparse matrix for efficiency
sparse_user_item = csr_matrix(user_item_matrix.values)
print(f"Sparse matrix created with {sparse_user_item.nnz:,} non-zero entries")

# Mean, count and standard deviation of ratings per genre, best-rated genre first
print("\n🎭 Average Ratings by Genre:")
genre_ratings_detailed = ratings_df.merge(movies_df, on='movie_id')
genre_avg_ratings = (
    genre_ratings_detailed.groupby('genre')['rating']
    .agg(['mean', 'count', 'std'])
    .round(3)
    .rename(columns={'mean': 'Avg_Rating', 'count': 'Count', 'std': 'Std_Dev'})
    .sort_values('Avg_Rating', ascending=False)
)

print(genre_avg_ratings)

# Per-movie mean rating and rating count, joined with the movie metadata
movie_stats = (
    ratings_df.groupby('movie_id')['rating']
    .agg(['mean', 'count'])
    .round(2)
    .rename(columns={'mean': 'avg_rating', 'count': 'num_ratings'})
    .merge(movies_df, on='movie_id')
)

ranking_columns = ['title', 'genre', 'avg_rating', 'num_ratings']


def _print_movie_ranking(ranking):
    """Print one bullet per row: title, genre, mean rating and rating count."""
    for _, movie in ranking.iterrows():
        print(f"• {movie['title']} ({movie['genre']}) - {movie['avg_rating']:.1f}⭐ ({movie['num_ratings']} ratings)")


# Most popular movies (by number of ratings)
print("\n🏆 Top 10 Most Popular Movies:")
top_popular = movie_stats.nlargest(10, 'num_ratings')[ranking_columns]
_print_movie_ranking(top_popular)

# Highest rated movies; movies with fewer than 10 ratings are excluded first
print("\n⭐ Top 10 Highest Rated Movies (min 10 ratings):")
enough_ratings = movie_stats['num_ratings'] >= 10
top_rated = movie_stats[enough_ratings].nlargest(10, 'avg_rating')[ranking_columns]
_print_movie_ranking(top_rated)