
# Movie Recommendation System

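This notebook builds a content-based movie recommender from the MovieLens `movies.csv` and `ratings.csv` files. It uses:
- TF-IDF vectors built from each movie's genres
- The average rating of each movie as an additional feature for clustering
- Cosine similarity between the genre TF-IDF vectors to rank recommendations
- PCA (2-D projection) and KMeans (10 clusters) to cluster and visualize the movies
- A Streamlit interface for selecting a movie and listing its top 5 recommendations

Because the cells call Streamlit (`st.*`) directly, the interface elements are meant to be rendered by running the code as a Streamlit app.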

---

## 1. Import Libraries and Load Data


In [None]:

from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Read the two MovieLens tables (movies first, then ratings) from CSV files
# addressed relative to the current working directory.
movies, ratings = (
    pd.read_csv(csv_path) for csv_path in ('movies.csv', 'ratings.csv')
)

# Page header, followed by a confirmation once both files have been read.
st.title("🎬 Movie Recommender System")
st.write("Dataset loaded successfully!")


## 2. EDA: Exploring Movie Genres and Ratings

In [None]:

# Preprocess genres
# Placeholder used for movies without any genre. It is not a genre itself and,
# when split on whitespace, would be miscounted as three separate "genres".
NO_GENRES_LABEL = '(no genres listed)'


def split_genres(genres):
    """Return the individual genre labels contained in a genres string.

    Accepts both the raw pipe-delimited form ("Action|Comedy") and the
    space-delimited form this cell writes back to ``movies['genres']``, so the
    cell gives the same result when it is re-run.

    Missing values and the "(no genres listed)" placeholder yield an empty
    list. Assumes no real genre label contains a space (true for the MovieLens
    genre list -- confirm if another dataset is used).
    """
    if pd.isna(genres):
        return []
    return genres.replace(NO_GENRES_LABEL, ' ').replace('|', ' ').split()


genre_lists = movies['genres'].apply(split_genres)

# regex=False: '|' must be replaced literally, independent of the pandas
# version's default for the `regex` argument.
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)
movies['num_genres'] = genre_lists.apply(len)

# Display genre distribution plot
st.subheader("Distribution of Number of Genres")
fig, ax = plt.subplots()
# The counts are small integers, so draw one bar per integer value instead of
# slicing the range into 20 fractional-width bins.
sns.histplot(movies['num_genres'], discrete=True, ax=ax)
ax.set(xlabel='Number of genres', ylabel='Number of movies')
st.pyplot(fig)

# Top genres
all_genres = [genre for labels in genre_lists for genre in labels]
genre_counts = Counter(all_genres)
genre_df = pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count']).sort_values('Count', ascending=False)

st.subheader("Top Genres")
st.dataframe(genre_df.head(10))


## 3. Feature Extraction from Genres + Ratings

In [None]:

# TF-IDF on genres
# One row per movie, built from the space-separated genres string; missing
# genres are treated as an empty document.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genres'].fillna(''))

# Average rating
avg_ratings = ratings.groupby('movieId')['rating'].mean().reset_index()
avg_ratings.columns = ['movieId', 'avg_rating']

# Drop any 'avg_rating' column left over from a previous run of this cell.
# Otherwise the merge would emit 'avg_rating_x' / 'avg_rating_y' and the
# feature assembly below would fail with a KeyError on 'avg_rating'.
movies = (
    movies
    .drop(columns=['avg_rating'], errors='ignore')
    .merge(avg_ratings, on='movieId', how='left')
)

# The dense TF-IDF rows and the rating column are stacked side by side, so
# they must still describe the same movies in the same order.
assert len(movies) == tfidf_matrix.shape[0], "movies and TF-IDF rows are misaligned"

# Combine features
# NOTE(review): movies without any rating get avg_rating = 0, i.e. they look
# like the worst-rated movies to the scaler and to KMeans -- confirm this is
# intended rather than imputing a neutral value.
features = np.hstack((tfidf_matrix.toarray(), movies[['avg_rating']].fillna(0).values))

# Scale
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


## 4. Clustering Movies using KMeans + PCA

In [None]:

# PCA
# Two components, used only as x/y coordinates for the scatter plot below;
# the clustering itself runs on the full scaled feature matrix.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(features_scaled)

# KMeans
# n_init is pinned explicitly: scikit-learn changed its default (10 -> 'auto'),
# so leaving it unset gives different clusters (and a FutureWarning on some
# versions) depending on the installed release.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features_scaled)

movies['Cluster'] = clusters

# Plot
st.subheader("PCA Clustering Plot")
fig2, ax2 = plt.subplots()
scatter = ax2.scatter(pca_features[:, 0], pca_features[:, 1], c=clusters, cmap='tab10')
ax2.set(xlabel='Principal component 1', ylabel='Principal component 2')
# Attach the colorbar to this axes explicitly instead of relying on pyplot's
# implicit "current figure" state.
plt.colorbar(scatter, ax=ax2, label='Cluster')
st.pyplot(fig2)


## 5. Streamlit Recommender Interface

In [None]:

# Similarity matrix
# Pairwise cosine similarity between the genre TF-IDF rows of all movies.
# NOTE(review): this is a dense n_movies x n_movies matrix and is rebuilt every
# time the cell runs; fine for a small catalogue, costly for a large one.
cos_sim = cosine_similarity(tfidf_matrix)

# Number of recommendations to show
NUM_RECOMMENDATIONS = 5

# Build simple recommender
movie_titles = movies['title'].tolist()
selected_movie = st.selectbox("Select a movie to get recommendations:", movie_titles)

if selected_movie:
    # Positional row of the first movie carrying the selected title. Using the
    # list position (not a DataFrame index label) keeps it aligned with the
    # rows of cos_sim and with movie_titles.
    index = movie_titles.index(selected_movie)

    # (row, score) pairs, most similar first; ties keep ascending row order.
    similarity_scores = sorted(
        enumerate(cos_sim[index]), key=lambda pair: pair[1], reverse=True
    )

    # Exclude the selected movie by position rather than by dropping the first
    # entry: many movies share identical genres and tie at the top score, so
    # the selected movie is not guaranteed to be ranked first.
    top_movies = [
        movie_titles[row] for row, _ in similarity_scores if row != index
    ][:NUM_RECOMMENDATIONS]

    st.write(f"Top {NUM_RECOMMENDATIONS} Recommended Movies:")
    for i, title in enumerate(top_movies, 1):
        st.write(f"{i}. {title}")
