# Task 5: Movie Recommendation System (Level 2)

## Step 1: Load and Explore Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Load ratings (user_id, movie_id, rating, timestamp)
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Load movies (movie_id, title, ...)
movies = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url'] + [f'genre_{i}' for i in range(19)])

# Merge
data = pd.merge(ratings, movies[['movie_id', 'title']], on='movie_id')

# Explore
print(data.head())
print(data.describe())
print(f"Unique users: {data['user_id'].nunique()}, Movies: {data['movie_id'].nunique()}")

   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)
            user_id       movie_id         rating     timestamp
count  100000.00000  100000.000000  100000.000000  1.000000e+05
mean      462.48475     425.530130       3.529860  8.835289e+08
std       266.61442     330.798356       1.125674  5.343856e+06
min         1.00000       1.000000       1.000000  8.747247e+08
25%       254.00000     175.000000       3.000000  8.794487e+08
50%       447.00000     322.000000       4.000000  8.828269e+08
75%       682.00000     631.000000       4.000000  8.882600e+08
max       943.00000    1682.000000       5.000000  8.932866e+08
Unique users: 94

## Step 2: Create User-Item Matrix

In [2]:
# Create user-item matrix (rows: users, cols: movies, values: ratings)
user_item_matrix = data.pivot_table(index='user_id', columns='movie_id', values='rating')

# Fill NaN with 0 (unrated)
user_item_matrix = user_item_matrix.fillna(0)

print(user_item_matrix.shape)  # (~943, ~1682)

(943, 1682)


## Step 3: Compute User Similarity

In [3]:
# Compute similarity matrix
user_similarity = cosine_similarity(user_item_matrix)

# Convert to DataFrame for ease
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

print(user_similarity_df.head())

user_id       1         2         3         4         5         6         7    \
user_id                                                                         
1        1.000000  0.166931  0.047460  0.064358  0.378475  0.430239  0.440367   
2        0.166931  1.000000  0.110591  0.178121  0.072979  0.245843  0.107328   
3        0.047460  0.110591  1.000000  0.344151  0.021245  0.072415  0.066137   
4        0.064358  0.178121  0.344151  1.000000  0.031804  0.068044  0.091230   
5        0.378475  0.072979  0.021245  0.031804  1.000000  0.237286  0.373600   

user_id       8         9         10   ...       934       935       936  \
user_id                                ...                                 
1        0.319072  0.078138  0.376544  ...  0.369527  0.119482  0.274876   
2        0.103344  0.161048  0.159862  ...  0.156986  0.307942  0.358789   
3        0.083060  0.061040  0.065151  ...  0.031875  0.042753  0.163829   
4        0.188060  0.101284  0.060859  ...  0.052107

## Step 4: Recommend Top-Rated Unseen Movies for a Given User

In [4]:
def recommend_movies(user_id, k=10, n_recommend=5):
    # Get user's ratings
    user_ratings = user_item_matrix.loc[user_id].sort_values(ascending=False)
    
    # Unseen movies
    unseen_movies = user_ratings[user_ratings == 0].index.tolist()
    
    # Predict ratings for unseen
    predicted_ratings = {}
    for movie_id in unseen_movies:
        # Weighted average from similar users
        sim_scores = user_similarity_df[user_id].drop(user_id)  # Exclude self
        movie_ratings = user_item_matrix[movie_id].drop(user_id)  # Other users' ratings for this movie
        # Only users who rated this movie
        common_users = movie_ratings[movie_ratings > 0].index
        if len(common_users) > 0:
            weighted_sum = np.dot(sim_scores[common_users], movie_ratings[common_users])
            sim_sum = sim_scores[common_users].sum()
            predicted = weighted_sum / sim_sum if sim_sum != 0 else 0
            predicted_ratings[movie_id] = predicted
    
    # Top N predicted
    top_predicted = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:n_recommend]
    
    # Get titles
    recommendations = []
    for movie_id, pred_rating in top_predicted:
        title = movies[movies['movie_id'] == movie_id]['title'].iloc[0]
        recommendations.append({'movie_id': movie_id, 'title': title, 'predicted_rating': pred_rating})
    
    return recommendations

# Example for user_id=1
recs = recommend_movies(1, n_recommend=5)
print("Recommendations for User 1:")
for rec in recs:
    print(f"- {rec['title']}: Predicted Rating {rec['predicted_rating']:.2f}")

Recommendations for User 1:
- Great Day in Harlem, A (1994): Predicted Rating 5.00
- They Made Me a Criminal (1939): Predicted Rating 5.00
- Prefontaine (1997): Predicted Rating 5.00
- Marlene Dietrich: Shadow and Light (1996) : Predicted Rating 5.00
- Saint of Fort Washington, The (1993): Predicted Rating 5.00


## Step 5: Evaluate Performance Using Precision@K

In [5]:
def precision_at_k(recommendations, user_id, k=5, threshold=4.0):
    # Get actual high-rated movies for user
    actual_high_rated = set(data[(data['user_id'] == user_id) & (data['rating'] >= threshold)]['movie_id'])
    
    # Recommended movies
    rec_movies = [rec['movie_id'] for rec in recommendations[:k]]
    
    # Precision
    relevant = len(set(rec_movies) & actual_high_rated) / k
    return relevant

# Example (in practice, average over users)
prec = precision_at_k(recs, 1)
print(f"Precision@5 for User 1: {prec:.2f}")

# Average over sample users
sample_users = data['user_id'].unique()[:10]
avg_prec = np.mean([precision_at_k(recommend_movies(u, n_recommend=5), u) for u in sample_users])
print(f"Average Precision@5: {avg_prec:.2f}")  # Expect ~0.20-0.40

Precision@5 for User 1: 0.00
Average Precision@5: 0.00


## Bonus: Item-Based Collaborative Filtering:

In [8]:
# Create user-item matrix
user_item_matrix = data.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

# Item similarity
item_similarity = cosine_similarity(user_item_matrix.T)

# Map movie_ids to matrix indices
movie_id_to_idx = {movie_id: idx for idx, movie_id in enumerate(user_item_matrix.columns)}

def item_based_recommend(user_id, k=10, n_recommend=5):
    user_ratings = user_item_matrix.loc[user_id]
    unseen = user_ratings[user_ratings == 0].index
    
    predicted = {}
    for movie_id in unseen:
        if movie_id in movie_id_to_idx:  # Check if movie_id exists in matrix
            idx = movie_id_to_idx[movie_id]
            sim_scores = item_similarity[idx]  # Get similarity for this item
            rated_items = user_ratings[user_ratings > 0].index
            common = np.intersect1d(rated_items, [mid for mid in user_item_matrix.columns if mid in movie_id_to_idx])
            if len(common) > 0:
                common_indices = [movie_id_to_idx[mid] for mid in common]
                weighted = np.dot(sim_scores[common_indices], user_item_matrix.loc[user_id, common])
                sim_sum = sim_scores[common_indices].sum()
                predicted[movie_id] = weighted / sim_sum if sim_sum != 0 else 0
    
    top = sorted(predicted.items(), key=lambda x: x[1], reverse=True)[:n_recommend]
    return [{'movie_id': m, 'title': movies[movies['movie_id'] == m]['title'].iloc[0], 'predicted': r} for m, r in top]

# Example for user_id=1
item_recs = item_based_recommend(1)
print("Item-Based Recommendations:")
for rec in item_recs:
    print(f"- {rec['title']}: {rec['predicted']:.2f}")

Item-Based Recommendations:
- Cyclo (1995): 4.38
- Office Killer (1997): 4.24
- Little City (1998): 4.23
- Death in Brunswick (1991): 4.22
- Mamma Roma (1962): 4.18


## Bonus: Matrix Factorization (SVD)

In [17]:
# Create user-item matrix
user_item_matrix = data.pivot_table(index='user_id', columns='movie_id', values='rating').fillna(0)

# Convert to sparse matrix
user_item_sparse = csr_matrix(user_item_matrix.values)

# SVD decomposition
U, sigma, Vt = svds(user_item_sparse, k=50)
sigma = np.diag(sigma)
reconstructed = np.dot(np.dot(U, sigma), Vt)

# Predict for user 1
user1_pred = pd.DataFrame(reconstructed[0], index=user_item_matrix.columns, columns=['predicted']).sort_values('predicted', ascending=False)

# Fix: Use column-wise comparison with user 1's row
user1_ratings = user_item_matrix.iloc[0]  # Ratings for user 1
unseen_mask = (user1_ratings == 0).reindex(user1_pred.index, fill_value=True)  # Align indices
unseen_svd = user1_pred[unseen_mask].head(5)

print("SVD Recommendations for User 1:")
for movie_id, pred in unseen_svd.iterrows():
    title = movies[movies['movie_id'] == movie_id]['title'].iloc[0]
    print(f"- {title}: {pred['predicted']:.2f}")

SVD Recommendations for User 1:
- E.T. the Extra-Terrestrial (1982): 3.48
- Batman (1989): 3.11
- Dave (1993): 2.90
- One Flew Over the Cuckoo's Nest (1975): 2.77
- True Lies (1994): 2.61
