# RECOMMENDATION SYSTEM

In this lab we build a movie recommendation system using matrix factorization

In the following sections, we'll:
- Preprocess the user-item interaction data
- Build a recommendation system
- Train and optimize the model
- Evaluate model performance
- Generate personalized recommendations

##  Understanding And Cleaning The Data

In [1]:
# Import Libraries
import pandas as pd
import numpy as np

In [2]:
# Import the data sets
movie_df = pd.read_csv('movies.csv')
rating_df = pd.read_csv('ratings.csv')

In [3]:
# Preview the movie dataset
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the rating dataset
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5.0,847117005
1,1,2,3.0,847642142
2,1,10,3.0,847641896
3,1,32,4.0,847642008
4,1,34,4.0,847641956


In [5]:
# Let's combine both datasets to df
df = pd.merge(rating_df,movie_df,on='movieId', how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,5.0,847117005,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,3.0,847642142,Jumanji (1995),Adventure|Children|Fantasy
2,1,10,3.0,847641896,GoldenEye (1995),Action|Adventure|Thriller
3,1,32,4.0,847642008,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,1,34,4.0,847641956,Babe (1995),Children|Drama


In [6]:
# Let's drop timestamp since it doesnot provide much for the recommendation
df.drop('timestamp',axis=1,inplace=True)
df.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,2,3.0,Jumanji (1995),Adventure|Children|Fantasy
2,1,10,3.0,GoldenEye (1995),Action|Adventure|Thriller
3,1,32,4.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
4,1,34,4.0,Babe (1995),Children|Drama


In [7]:
# check for null
df.isnull().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
title,0
genres,0


No null values found.

In [8]:
# Check for duplicates
print("Duplicate rows:", df.duplicated().sum())

Duplicate rows: 0


## Data Preparation

In [11]:
# Create the User-Movie Matrix and fill with zeros
# Group by userId and title and take the mean of the ratings to handle duplicates
movie_user_rating_pivot = df.groupby(['userId', 'title'])['rating'].mean().unstack().fillna(0)
movie_user_rating_pivot.head(10)

title,"""Great Performances"" Cats (1998)",'71 (2014),'Round Midnight (1986),'Til There Was You (1997),'night Mother (1986),(500) Days of Summer (2009),(Untitled) (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),...,Zoom (2006),Zorro the Gay Blade (1981),Zulu (1964),[REC] (2007),[REC]�_ (2009),burbs The (1989),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),��Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,4.0,3.5,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
X = movie_user_rating_pivot.values.T
X.shape

(8913, 718)

In [13]:
# Matrix Factorization Using NumPy (Gradient Descent)
# Convert pivot table to NumPy array
R = movie_user_rating_pivot.values
num_users, num_items = R.shape

def matrix_factorization(R, K=10, steps=100, alpha=0.002, beta=0.02):
    M, N = R.shape
    P = np.random.rand(M, K)
    Q = np.random.rand(N, K)
    Q = Q.T

    for step in range(steps):
        for i in range(M):
            for j in range(N):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i, :], Q[:, j])
                    for k in range(K):
                        P[i][k] += alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] += alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        # Optional: Print error every 10 steps
        if step % 10 == 0:
            error = np.sum((R[R > 0] - np.dot(P, Q) [R > 0])**2)
            print(f"Iteration {step}, Error: {error:.2f}")
    return P, Q.T

## Model Training And Evaluation

In [15]:
# Train the Matrix Factorization Model
P, Q = matrix_factorization(R, K=15, steps=100)
predicted_ratings = np.dot(P, Q.T)

Iteration 0, Error: 98856.65
Iteration 10, Error: 68303.98
Iteration 20, Error: 60841.34
Iteration 30, Error: 54285.42
Iteration 40, Error: 48311.70
Iteration 50, Error: 43392.30
Iteration 60, Error: 39520.45
Iteration 70, Error: 36492.07
Iteration 80, Error: 34110.64
Iteration 90, Error: 32218.09


In [16]:
# Build the Predicted Ratings DataFrame
predicted_df = pd.DataFrame(predicted_ratings,
                            index=movie_user_rating_pivot.index,
                            columns=movie_user_rating_pivot.columns)
predicted_df.head()

title,"""Great Performances"" Cats (1998)",'71 (2014),'Round Midnight (1986),'Til There Was You (1997),'night Mother (1986),(500) Days of Summer (2009),(Untitled) (2009),*batteries not included (1987),...And Justice for All (1979),1-900 (06) (1994),...,Zoom (2006),Zorro the Gay Blade (1981),Zulu (1964),[REC] (2007),[REC]�_ (2009),burbs The (1989),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),��Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.876434,4.306315,3.149288,4.734254,3.779674,4.172657,3.779606,3.465056,3.696487,3.558582,...,3.131938,3.765425,4.890929,4.547983,3.844915,3.024905,3.416042,3.049503,2.754862,3.264372
2,4.396712,4.130923,1.616838,4.267657,4.286687,3.747751,2.194951,2.994449,2.630813,4.162447,...,3.494875,3.058153,3.932663,4.409672,3.561995,2.738534,3.592779,4.184427,2.66313,2.633818
3,4.049966,4.571565,2.726044,4.916663,4.12895,4.118992,3.443803,3.458396,2.666768,4.324758,...,3.228473,3.566603,5.275421,4.937321,3.668753,3.022792,3.416329,4.118838,2.16182,2.346343
4,3.943905,4.507721,2.7371,4.275617,4.101727,3.571163,3.317098,3.252708,2.727393,3.635202,...,3.289716,3.208911,4.704032,4.463547,3.101639,3.464703,3.949287,3.098123,1.915533,2.676638
5,3.508933,3.701057,3.144912,3.670682,4.009374,4.474428,3.949074,3.361045,3.488104,2.894855,...,3.223537,3.490243,3.979745,4.168468,3.039269,3.31984,3.776437,3.083228,2.348762,3.26303


## Personalized Recommendation

In [18]:
# Recommend Top 5 Movies for a User
def recommend_movies(user_id, actual_df, predicted_df, n=5):
    user_row = predicted_df.loc[user_id]
    already_rated = actual_df.loc[user_id]
    unrated = already_rated[already_rated == 0].index
    recommendations = user_row[unrated].sort_values(ascending=False).head(n)
    return recommendations

# Example: Recommend 5 movies to user 10
top_movies = recommend_movies(user_id=10, actual_df=movie_user_rating_pivot, predicted_df=predicted_df, n=5)
print("Top 5 Movie Recommendations for User 10:\n")
print(top_movies)

Top 5 Movie Recommendations for User 10:

title
Lupin III: The Castle Of Cagliostro (Rupan sansei: Kariosutoro no shiro) (1979)    5.441951
Life Is Beautiful (La Vita ̬ bella) (1997)                                         5.408407
RocknRolla (2008)                                                                  5.343276
Vanishing  The (Spoorloos) (1988)                                                  5.328672
Romulus  My Father (2007)                                                          5.304472
Name: 10, dtype: float64


In [20]:
# Recommend Top 5 Movies based on a perticular geners

import pandas as pd
def recommend_movies_by_genre(genre, movie_df, n=5):
    """
    Recommends the top n movies based on a specific genre.

    Args:
        genre (str): The genre to filter movies by.
        movie_df (pd.DataFrame): The DataFrame containing movie information (with 'genres' column).
        n (int): The number of top movies to recommend.

    Returns:
        pd.Series: A Series of movie titles recommended.
    """
    # Filter movies that contain the specified genre in their 'genres' string
    genre_movies = movie_df[movie_df['genres'].str.contains(genre, na=False)]

    # For simplicity, we'll recommend the top-rated movies within that genre.
    # This assumes we have a measure of "top-rated". If not, you might need to
    # use rating data or a different metric.
    # Assuming higher 'movieId' or simply the first ones that appear in the filtered list
    # are sufficient for a basic recommendation within the genre for this example.
    # A more sophisticated approach would involve average ratings, popularity, etc.

    # For a simple recommendation based on genre alone, we'll just take the first n
    # movies found in that genre. If you have rating data associated with the movies
    # themselves (like average ratings), you would sort by that.
    recommended_movies = genre_movies['title'].head(n)

    return recommended_movies

# Example: Recommend Top 5 Action movies
action_genre = "Action"
top_action_movies = recommend_movies_by_genre(genre=action_genre, movie_df=movie_df, n=5)
print(f"Top 5 {action_genre} Movie Recommendations:\n")
print(top_action_movies)

# Example: Recommend Top 5 Comedy movies
comedy_genre = "Comedy"
top_comedy_movies = recommend_movies_by_genre(genre=comedy_genre, movie_df=movie_df, n=5)
print(f"\nTop 5 {comedy_genre} Movie Recommendations:\n")
display(top_comedy_movies)

Top 5 Action Movie Recommendations:

5                 Heat (1995)
8         Sudden Death (1995)
9            GoldenEye (1995)
14    Cutthroat Island (1995)
19         Money Train (1995)
Name: title, dtype: object

Top 5 Comedy Movie Recommendations:



Unnamed: 0,title
0,Toy Story (1995)
2,Grumpier Old Men (1995)
3,Waiting to Exhale (1995)
4,Father of the Bride Part II (1995)
6,Sabrina (1995)


In [22]:
# Recommend Top 5 Movies released on certain year
def recommend_movies_by_year(year, movie_df, n=5):
    """
    Recommends the top n movies released in a specific year.

    Args:
        year (int): The release year to filter movies by.
        movie_df (pd.DataFrame): The DataFrame containing movie information (with 'title' column).
        n (int): The number of top movies to recommend.

    Returns:
        pd.Series: A Series of movie titles recommended.
    """
    # Filter movies that contain the specified year in their 'title' string
    year_movies = movie_df[movie_df['title'].str.contains(f'({year})', na=False)]

    # For a simple recommendation based on year alone, we'll just take the first n
    # movies found for that year. If you have rating data associated with the movies
    # themselves (like average ratings), you would sort by that.
    recommended_movies = year_movies['title'].head(n)

    return recommended_movies

# Example: Recommend Top 5 movies released in 1995
release_year = 1995
top_year_movies = recommend_movies_by_year(year=release_year, movie_df=movie_df, n=5)
print(f"Top 5 Movie Recommendations from {release_year}:\n")
display(top_year_movies)

Top 5 Movie Recommendations from 1995:



  year_movies = movie_df[movie_df['title'].str.contains(f'({year})', na=False)]


Unnamed: 0,title
0,Toy Story (1995)
1,Jumanji (1995)
2,Grumpier Old Men (1995)
3,Waiting to Exhale (1995)
4,Father of the Bride Part II (1995)
