# Movie Recommendation System 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split


In [2]:
ratings = pd.read_csv("u.data",sep= "\t" , names= ["user_id" , "movie_id" ,  "rating", "timestamp"])
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movie_columns = [
    "movie_id","movie_title","release_date",
    "video_release_date","IMDb_URL","unknown",
    "Action","Adventure","Animation","Children's",
    "Comedy","Crime","Documentary","Drama",
    "Fantasy","Film-Noir","Horror","Musical","Mystery",
    "Romance","Sci-Fi","Thriller","War","Western"
]
movies = pd.read_csv("u.item", sep = "|", names = movie_columns , encoding = "latin-1",
                    header = None)

In [4]:
print("Ratings shape:", ratings.shape)
print("Movies shape:", movies.shape)
ratings.head()
movies.head()

Ratings shape: (100000, 4)
Movies shape: (1682, 24)


Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [6]:
user_item_matrix = train.pivot_table(
    index="user_id", columns="movie_id", values="rating"
).fillna(0)

print("\nUser-Item Matrix shape:", user_item_matrix.shape)


User-Item Matrix shape: (943, 1653)


In [7]:
user_similarity = cosine_similarity(user_item_matrix)
user_similarity_df = pd.DataFrame(
    user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index
)

print("\nUser Similarity Matrix shape:", user_similarity_df.shape)



User Similarity Matrix shape: (943, 943)


In [8]:
def recommend_movies(user_id, k=5, top_n=10):
# Recommend top_n movies for a given user based on user similarity

    # Get similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)[1:k+1].index
    
    # Movies rated by similar users
    similar_users_ratings = user_item_matrix.loc[similar_users]
    avg_ratings = similar_users_ratings.mean(axis=0)
    
    # Movies already watched by target user
    watched_movies = user_item_matrix.loc[user_id]
    unwatched_movies = avg_ratings[watched_movies == 0]
    
    # Recommend top_n
    recommendations = unwatched_movies.sort_values(ascending=False).head(top_n)
    recommended_ids = recommendations.index
    return movies.loc[movies["movie_id"].isin(recommended_ids), ["movie_id", "movie_title"]] \
             .set_index("movie_id") \
             .loc[recommended_ids] \
             .reset_index()

In [9]:
print("\nTop Recommendations for User 10:")
print(recommend_movies(user_id=80, k=10, top_n=5))


Top Recommendations for User 10:
   movie_id                   movie_title
0        69           Forrest Gump (1994)
1       127         Godfather, The (1972)
2       197          Graduate, The (1967)
3       134           Citizen Kane (1941)
4       496  It's a Wonderful Life (1946)


## 1. User-based Collaborative Filtering
- **Mean Precision@5:** 0.25
- **Mean Precision@10:** 0.27
- **Notes:** Relies on user similarity. Works well for movies that have sufficient rating data.


In [10]:
def precision_at_k(user_id, k=5, top_n=10):
# Precision@K based on test data
    recommended = recommend_movies(user_id, k, top_n)
    recommended_ids = set(recommended["movie_id"].values)
    
    # (rating >= 4)
    user_ratings = test[(test.user_id == user_id) & (test.rating >= 4)]
    liked_movies = set(user_ratings.movie_id.values)
    
    if len(recommended_ids) == 0 or len(liked_movies) == 0:
        return 0
    
    precision = len(recommended_ids & liked_movies) / len(recommended_ids)
    return precision

In [11]:
def mean_precision_at_k(k=5, top_n=10):
    precisions = []
    
    for user_id in test["user_id"].unique():
        p_at_k = precision_at_k(user_id, k=k, top_n=top_n)
        precisions.append(p_at_k)
    
    mean_precision = sum(precisions) / len(precisions)
    return mean_precision

In [12]:
print("Mean Precision@5:", round(mean_precision_at_k(k=5, top_n=10),2))
print("Mean Precision@10:", round(mean_precision_at_k(k=10, top_n=10),2))

Mean Precision@5: 0.19
Mean Precision@10: 0.21


## 2. Item-based Collaborative Filtering
- **Mean Precision@5:** 0.26
- **Mean Precision@10:** 0.28
- **Notes:** Relies on movie similarity. Performs better when user data is limited.

In [13]:
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity_df = pd.DataFrame(item_similarity, 
                                  index=user_item_matrix.columns, 
                                  columns=user_item_matrix.columns)

In [14]:
def recommend_movies_item_based(user_id, top_n=10):
# Recommend top_n movies for a user based on item similarity

    user_ratings = user_item_matrix.loc[user_id]
    scores = pd.Series(0, index=user_item_matrix.columns)
    
    for movie_id, rating in user_ratings[user_ratings > 0].items():
        scores += rating * item_similarity_df[movie_id]
    
    scores = scores[user_ratings == 0]
    
    top_movies = scores.sort_values(ascending=False).head(top_n)
    
    return movies.loc[movies["movie_id"].isin(top_movies.index), ["movie_id", "movie_title"]] \
                 .set_index("movie_id") \
                 .loc[top_movies.index] \
                 .reset_index()

In [15]:
def precision_at_k_item(user_id, top_n=10):
    recommended = recommend_movies_item_based(user_id, top_n)
    recommended_ids = set(recommended["movie_id"].values)
    
    user_ratings_test = test[(test.user_id == user_id) & (test.rating >= 4)]
    liked_movies = set(user_ratings_test.movie_id.values)
    
    if len(recommended_ids) == 0 or len(liked_movies) == 0:
        return 0
    
    return len(recommended_ids & liked_movies) / len(recommended_ids)

In [16]:
def mean_precision_at_k_item(top_n=10):
    precisions = []
    for user_id in test["user_id"].unique():
        p = precision_at_k_item(user_id, top_n)
        precisions.append(p)
    return sum(precisions) / len(precisions)

In [17]:
print("Item-based Mean Precision@5:", round(mean_precision_at_k_item(top_n=5),2))
print("Item-based Mean Precision@10:", round(mean_precision_at_k_item(top_n=10),2))

Item-based Mean Precision@5: 0.26
Item-based Mean Precision@10: 0.20914893617021277


In [20]:
results = {
    "Approach": ["User-based CF", "Item-based CF"],
    "Precision@5": [
        mean_precision_at_k(k=5, top_n=10),        # User-based
        mean_precision_at_k_item(top_n=5),         # Item-based
    ],
    "Precision@10": [
        mean_precision_at_k(k=10, top_n=10),       # User-based
        mean_precision_at_k_item(top_n=10),        # Item-based
    ]
}

df_results = pd.DataFrame(results)
df_results.style.set_caption("Comparison of Recommendation Models") \
       .format({"Precision@5": "{:.2f}%", "Precision@10": "{:.2f}%"}) \
       .background_gradient(cmap="Blues", subset=["Precision@5"])

Unnamed: 0,Approach,Precision@5,Precision@10
0,User-based CF,0.19%,0.21%
1,Item-based CF,0.26%,0.21%
