# Movie Recommendation System

This notebook demonstrates an end-to-end recommender using MovieLens (`ml-latest-small`).

Sections:
1. Imports
2. Load data
3. Preprocessing
4. Content-based (TF-IDF on genres)
5. Collaborative (TruncatedSVD on user-item)
6. Hybrid recommendations
7. Evaluation (approximate RMSE on a held-out sample; Precision@K is not implemented yet)
8. Optional sentiment filtering


In [None]:
# 1) Imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math


In [None]:
# 2) Load data
# Both tables are read from CSV files in the local data/ directory.
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

# Quick sanity check: (rows, columns) of each table.
print('movies', movies.shape)
print('ratings', ratings.shape)

# Last expression of the cell, so the notebook renders the first five rows.
movies.head(n=5)


In [None]:
# 3) Preprocessing
# Movies with a missing genre string get an empty one, so the string
# operations below never see NaN.
movies['genres'] = movies['genres'].fillna('')

# Turn the '|' separators into spaces so each genre becomes its own word.
# regex=False is explicit on purpose: '|' is a regex metacharacter and the
# default value of `regex` differs between pandas major versions, so the
# literal replacement must not depend on the installed version.
movies['genres_clean'] = movies['genres'].str.replace('|', ' ', regex=False)

# Last expression of the cell, so the notebook renders the first ratings rows.
ratings.head()


In [None]:
# 4) Content-based: TF-IDF on genres
# Every whitespace-separated entry of genres_clean is one token. The default
# token pattern only keeps runs of word characters, so a hyphenated genre such
# as 'Sci-Fi' would be split into two unrelated features ('sci' and 'fi') and
# count twice in the similarity.
vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\S+')
genre_tfidf = vectorizer.fit_transform(movies['genres_clean'])

# Dense (n_movies x n_movies) matrix; row i holds the genre similarity of
# movie i to every movie, in the row order of `movies`.
content_sim = cosine_similarity(genre_tfidf, genre_tfidf)

# Lookup from movieId to the corresponding index label of `movies`.
movie_idx = pd.Series(movies.index, index=movies['movieId'])
print('Content similarity matrix shape:', content_sim.shape)


In [None]:
# 5) Collaborative: user-item matrix, then item embeddings via TruncatedSVD
# One row per user, one column per movie; unrated cells are filled with 0.
user_item = ratings.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
).fillna(0)
print('User-item shape:', user_item.shape)

# Column order of user_item is the row order of the item embeddings below,
# so this list maps a row position back to its movieId.
item_movieIds = user_item.columns.tolist()

# Transposing puts movies on the rows: each movie gets a 50-component vector.
svd = TruncatedSVD(n_components=50, random_state=42)
item_factors = svd.fit_transform(user_item.T)

# Pairwise cosine similarity between the movie embeddings.
item_sim_latent = cosine_similarity(item_factors)
print('Latent item similarity shape:', item_sim_latent.shape)


In [None]:
# 6) Helper functions

def get_content_recs(movie_title, top_n=10):
    """Return the movies whose genres are most similar to a given title.

    The title is matched as a case-insensitive literal substring against
    ``movies['title']``; the first matching row is used as the query movie.
    Similarities are read from the module-level ``content_sim`` matrix.

    Args:
        movie_title: Text to look for inside the movie titles.
        top_n: Maximum number of recommendations; values <= 0 give no rows.

    Returns:
        A DataFrame with columns ``movieId``, ``title`` and ``genres``, ordered
        from most to least similar and never containing the query movie.
        An empty ``pd.DataFrame()`` is returned when no title matches.
    """
    # na=False keeps the mask boolean even if a title is missing.
    title_mask = movies['title'].str.contains(
        movie_title, case=False, regex=False, na=False
    )
    matches = movies[title_mask]
    if matches.empty:
        return pd.DataFrame()

    mid = matches['movieId'].iloc[0]
    idx = movie_idx[mid]
    sims = content_sim[idx]

    # Stable descending sort: ties keep their original row order.
    ranked = sorted(range(len(sims)), key=lambda i: sims[i], reverse=True)

    # Drop the query movie by position instead of assuming it sorts first:
    # other movies with identical genres tie at similarity 1.0 and may come
    # earlier, which previously let the query movie recommend itself.
    top_indices = [i for i in ranked if i != idx][:max(top_n, 0)]
    return movies.iloc[top_indices][['movieId', 'title', 'genres']]


def recommend_hybrid(fav_movieIds, top_n=10):
    """Recommend movies similar to a set of favourite movies.

    Each candidate's score is the sum of its latent-space similarity
    (module-level ``item_sim_latent``) to every favourite that appears in
    ``item_movieIds``. Favourites themselves are never recommended.
    Note that only the latent similarity is used here; the genre-based
    similarity is not blended in.

    Args:
        fav_movieIds: Iterable of movieIds the user likes. Ids that are not in
            ``item_movieIds`` are ignored; repeated ids count repeatedly.
        top_n: Maximum number of recommendations; values <= 0 give no rows.

    Returns:
        A DataFrame with columns ``movieId``, ``title`` and ``genres``, ordered
        from highest to lowest score. It is empty when none of the favourites
        is known or ``top_n`` is not positive.
    """
    columns = ['movieId', 'title', 'genres']

    # One dict lookup per favourite instead of repeated list scans.
    position_of = {mid: pos for pos, mid in enumerate(item_movieIds)}
    fav_positions = [position_of[mid] for mid in fav_movieIds if mid in position_of]

    # Without a known favourite every score would be 0 and the "top" items
    # would be arbitrary; top_n == 0 would slice the whole array ([-0:]).
    if not fav_positions or top_n <= 0:
        return movies.iloc[0:0][columns]

    scores = np.zeros(item_sim_latent.shape[0])
    for pos in fav_positions:
        scores += item_sim_latent[pos]
    scores[fav_positions] = -np.inf  # mask the favourites themselves

    # Best first; masked favourites are removed so that a large top_n can
    # never bring them back into the result.
    ranked = np.argsort(-scores, kind='stable')
    ranked = ranked[np.isfinite(scores[ranked])][:top_n]
    recs = [item_movieIds[pos] for pos in ranked]

    # isin() returns rows in the order of `movies`; reorder them by rank so
    # the output really is a ranking.
    rank_of = {mid: rank for rank, mid in enumerate(recs)}
    hits = movies[movies['movieId'].isin(recs)]
    order = np.argsort([rank_of[mid] for mid in hits['movieId']], kind='stable')
    return hits.iloc[order][columns]


In [None]:
# Examples
# Content-based: up to five movies closest in genre to the first title
# containing 'Toy Story'.
print('Content-based similar to Toy Story:')
toy_story_recs = get_content_recs('Toy Story', top_n=5)
print(toy_story_recs)

# Latent-similarity recommendations seeded with two favourite movieIds.
print('\nHybrid example (use movieIds):')
favourite_ids = [1, 3114]
print(recommend_hybrid(favourite_ids, top_n=5))


In [None]:
# 7) Evaluation: approximate RMSE on held-out ratings, using a simple
# TruncatedSVD-based rating prediction as a matrix-factorization-style baseline.

# Hold out 20% of the ratings; the model below only sees the training part.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Users on rows, movies on columns, unrated cells filled with 0.
train_ui = train.pivot_table(
    values='rating',
    index='userId',
    columns='movieId',
).fillna(0)

# 20-component embedding for every movie (movies are the rows of train_ui.T).
svd2 = TruncatedSVD(n_components=20, random_state=42)
item_factors2 = svd2.fit_transform(train_ui.T)

# Approximate user vectors: project the rating matrix through the
# pseudo-inverse of the item embeddings, giving one column per user.
# This is a simplification meant for demonstration only.
user_factors_approx = np.dot(np.linalg.pinv(item_factors2), train_ui.T)

# A helper to predict a rating (approx)
def predict_rating(userId, movieId):
    """Predict the rating a user would give a movie, or NaN if unknown.

    The prediction is the dot product of the user's approximate latent vector
    (a column of ``user_factors_approx``) and the movie's latent vector (a row
    of ``item_factors2``). Positions are resolved through the index and
    columns of ``train_ui``.

    Args:
        userId: User identifier as found in ``train_ui.index``.
        movieId: Movie identifier as found in ``train_ui.columns``.

    Returns:
        The predicted rating, or ``np.nan`` when the user or the movie does
        not occur in the training matrix (cold start).
    """
    # Index.get_loc is a hash lookup; the previous list(...).index(...) rebuilt
    # and scanned a Python list of all users/movies on every single call.
    try:
        colidx = train_ui.columns.get_loc(movieId)
        user_idx = train_ui.index.get_loc(userId)
    except (KeyError, TypeError):
        # Only "id not present / not usable as a key" means cold start; any
        # other error is a real bug and should surface instead of becoming NaN.
        return np.nan

    user_vec = user_factors_approx[:, user_idx]
    return user_vec.dot(item_factors2[colidx])

# Apply on test (small sample to save time)
sample = test.sample(frac=0.2, random_state=42)

# Pair every prediction with its true rating, then keep only the pairs for
# which a prediction could be made (NaN marks an unknown user or movie).
scored = [
    (predict_rating(row['userId'], row['movieId']), row['rating'])
    for _, row in sample.iterrows()
]
scored = [(guess, actual) for guess, actual in scored if not np.isnan(guess)]
preds = [guess for guess, _ in scored]
truths = [actual for _, actual in scored]

if not preds:
    print('No predictions could be made on the sample (cold-starts).')
else:
    rmse = math.sqrt(mean_squared_error(truths, preds))
    print('Approx RMSE on sample:', rmse)


## 8) Optional: Sentiment-based filtering

If you have external reviews or tags, use a sentiment analyzer (VADER) to compute sentiment scores per movie and filter recommendations to only show movies with non-negative sentiment. MovieLens `ml-latest-small` doesn't include free-text reviews, so you'd need external sources (OMDb/TMDb).