# Movie Recommendation System

- Correlation based recommendation system
- Collaborative filtering
- XGBoost user based recommendation system

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

## Loading the dataset

In [2]:
links= "../Data/links.csv"
movies= "../Data/movies.csv"
ratings= "../Data/ratings.csv"
tags= "../Data/tags.csv"

In [3]:
links = pd.read_csv(links)
movies = pd.read_csv(movies)
ratings = pd.read_csv(ratings)
tags = pd.read_csv(tags)

## Merging `movies.csv` and `ratings.csv`

In [4]:
df = pd.merge(ratings, movies, on='movieId')

In [10]:
print(df.shape)
df.head()

(100836, 6)


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


## Data Exploration

In [11]:
null_count = df.isna().sum()
null_percentages = null_count / len(df) *100
pd.DataFrame({"Null Count:" : null_count,"Null Percentage (%)": null_percentages})

Unnamed: 0,Null Count:,Null Percentage (%)
userId,0,0.0
movieId,0,0.0
rating,0,0.0
timestamp,0,0.0
title,0,0.0
genres,0,0.0


In [12]:
print('Unique Users:',len(df['userId'].unique()))
print('Unique Items:',len(df['movieId'].value_counts()))

Unique Users: 610
Unique Items: 9724


we have `610` diffrent user<br>
we have `9724` diffrent movie<br>

## Data Preprocessing
Here we will drop the `timestamp` column from the dataset.<br>
We will devide the `genres` column into diffrent columns and then we will drop the `genres` column.<br>

In [5]:
df.drop(columns=['timestamp'],inplace = True)

In [6]:
# How many unique genres?
unique_genres = set()
for genres in df['genres'].str.split('|'):
  unique_genres.update(genres)
print(f"Number of unique genres: {len(unique_genres)}")
print('Unique Genres:',unique_genres)


Number of unique genres: 20
Unique Genres: {'Children', 'Adventure', 'Western', 'Film-Noir', 'Thriller', 'Documentary', 'Action', 'Drama', 'Comedy', 'Animation', 'Musical', '(no genres listed)', 'Sci-Fi', 'Fantasy', 'War', 'Romance', 'Mystery', 'IMAX', 'Horror', 'Crime'}


In [7]:
# Make a column for each unique_genres
for genre in unique_genres:
  df[genre] = 0

for index, row in df.iterrows():
  for genre in row['genres'].split('|'):
    df.at[index, genre] = 1


In [8]:
df.drop(columns=['genres'],inplace = True)

In [23]:
df.head(3)

Unnamed: 0,userId,movieId,rating,title,Thriller,Comedy,IMAX,Romance,Musical,Film-Noir,...,Documentary,Action,Horror,War,Western,Crime,(no genres listed),Children,Mystery,Drama
0,1,1,4.0,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,3,4.0,Grumpier Old Men (1995),0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,6,4.0,Heat (1995),1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


## Correlation based recommendation system


In [25]:
movie_user_rating_pivot = df.pivot_table(index='userId', columns='title', values='rating')
movie_user_rating_pivot.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [9]:
ratings_ = pd.DataFrame(df.groupby('title')['rating'].mean())
ratings_['number_of_ratings'] = df.groupby('title')['rating'].count()
ratings_.head()

Unnamed: 0_level_0,rating,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [63]:
similar_to_movie_m = movie_user_rating_pivot.corr(method='pearson')
similar_to_movie_m.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,


In [28]:
def get_recommendation(movie_name):
    movie_user_ratings = movie_user_rating_pivot[movie_name]
    similar_to_movie = movie_user_rating_pivot.corrwith(movie_user_ratings)
    corr_movie = pd.DataFrame(similar_to_movie, columns=['correlation'])
    corr_movie.dropna(inplace=True)
    corr_movie = corr_movie.join(ratings_['number_of_ratings'])
    recommendation = corr_movie[corr_movie['number_of_ratings'] > 100].sort_values(by='correlation', ascending=False).head(10)
    return recommendation


get_recommendation('Toy Story (1995)')

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0_level_0,correlation,number_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1.0,215
"Incredibles, The (2004)",0.643301,125
Finding Nemo (2003),0.618701,141
Aladdin (1992),0.611892,183
"Monsters, Inc. (2001)",0.490231,132
Mrs. Doubtfire (1993),0.446261,144
"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",0.438237,120
American Pie (1999),0.420117,103
Die Hard: With a Vengeance (1995),0.410939,144
E.T. the Extra-Terrestrial (1982),0.409216,122


In [67]:
def get_recommendation(matrix, movie_name):
  similar_scores = matrix[movie_name]
  similar_scores = similar_scores.sort_values(ascending=False)
  
  # only return movies with more than 100 ratings
  similar_scores = similar_scores[similar_scores.index.isin(ratings[ratings_['number_of_ratings'] > 100].index)]
  return similar_scores

movie_name = 'Aladdin (1992)'
recommendations = get_recommendation(similar_to_movie_m, movie_name)
print(f"Recommendations for {movie_name}:")

for i in range(1, 10):
  print(f"{i}: {recommendations.index[i]}")


Recommendations for Aladdin (1992):
1: Toy Story (1995)
2: Lion King, The (1994)
3: Beauty and the Beast (1991)
4: Truman Show, The (1998)
5: Finding Nemo (2003)
6: Back to the Future (1985)
7: Shrek (2001)
8: Ferris Bueller's Day Off (1986)
9: Forrest Gump (1994)


## Saving Movie Similarity Matrix (item based model)

In [123]:
with open('../Models/Item_based_matrix.pkl', 'wb') as file:
    pickle.dump(similar_to_movie_m, file)

## User based Collaborative filtering

In [68]:
user_movie_rating_pivot = df.pivot_table(index='userId', columns='title', values='rating')
user_movie_rating_pivot.head()


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [69]:
user_movie_rating_pivot = user_movie_rating_pivot.fillna(0)


user_similarity = cosine_similarity(user_movie_rating_pivot)
user_similarity = pd.DataFrame(user_similarity, index=user_movie_rating_pivot.index, columns=user_movie_rating_pivot.index)
user_similarity.head()


userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.027283,0.05972,0.194395,0.12908,0.128152,0.158744,0.136968,0.064263,0.016875,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.0,0.0,0.003726,0.016614,0.025333,0.027585,0.027257,0.0,0.067445,...,0.202671,0.016866,0.011997,0.0,0.0,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.05972,0.0,1.0,0.002251,0.00502,0.003936,0.0,0.004941,0.0,0.0,...,0.005048,0.004892,0.024992,0.0,0.010694,0.012993,0.019247,0.021128,0.0,0.032119
4,0.194395,0.003726,0.002251,1.0,0.128659,0.088491,0.11512,0.062969,0.011361,0.031163,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.12908,0.016614,0.00502,0.128659,1.0,0.300349,0.108342,0.429075,0.0,0.030611,...,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792


In [113]:
"""
Here we have a user similarity matrix. We can use this matrix to get similar users to a given user.
Then we can recommend movies that these similar users have rated highly but the given user has not seen.
we only get the 10 most similar users.
and we only recommend the top 4 movies from each similar user.
"""


def get_user_recommendation(Matrix,user_id):
    user = Matrix[user_id]
    user = user.sort_values(ascending=False)
    # now we have a series of user similarities
    # we only want to recommend movies that the user has not seen
    # so we need to filter out movies that the user has seen
    user_seen_movies = df[df['userId'] == user_id]['title']


    # Now we loop through user and get top 10 recommendations
    recommendations = []
    print(len(user.index))
    for U in user.index[1:10]:
        # get all rated movies by user U
        movies = df[df['userId'] == U]['title']
        movies = movies[~movies.isin(user_seen_movies)]

        # get all movies that U has rated 4 or higher
        movies = movies[df['rating'] >= 4]
        # sort by rating
        movies = movies.sort_values(ascending=False)
        for movie in movies[:4]:   
            if movie not in recommendations:
                recommendations.append(movie) 

        # add to recommendations
        # if we have 10 recommendations, break

       
    return recommendations

    
    



user_id = 123
recommendations = get_user_recommendation(user_similarity,user_id)
print(f"Recommendations for user {user_id}:")
recommendations


610
Recommendations for user 123:


['WALL·E (2008)',
 'The Martian (2015)',
 'Star Wars: Episode VII - The Force Awakens (2015)',
 'Star Trek (2009)',
 'Saving Private Ryan (1998)',
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
 'Minority Report (2002)',
 'Indiana Jones and the Last Crusade (1989)',
 'Zootopia (2016)',
 'Wolf of Wall Street, The (2013)',
 'Wizard of Oz, The (1939)',
 'Willy Wonka & the Chocolate Factory (1971)',
 'Zoolander (2001)',
 'Zombieland (2009)',
 'V for Vendetta (2006)',
 'Tropic Thunder (2008)',
 'Whiplash (2013)',
 'Vanilla Sky (2001)',
 'Truman Show, The (1998)',
 'Trainspotting (1996)',
 'Wreck-It Ralph (2012)',
 'Up (2009)',
 'The Lego Movie (2014)',
 'Usual Suspects, The (1995)',
 'Toy Story 3 (2010)',
 'Sword in the Stone, The (1963)']

## Saving User Similarity Matrix (user based model)

In [124]:
with open('../Models/User_based_matrix.pkl', 'wb') as file:
    pickle.dump(user_similarity, file)

# More Advanced Techniques
- XGBoost 
- Neural Collaborative Filtering

## XGBoost Model

## Feature Engineering
Adding a feature `number_of_ratings` & `average_rating` to the dataset. <br>
`number_of_ratings:` will represent the number of ratings given to this movie by all users.<br>
`average_rating:` will represent the average rating given to this movie by all users.<br>


In [10]:
n = df['movieId'].value_counts()

movie_ratings = df.groupby('movieId')['rating'].mean()
movie_ratings = pd.DataFrame(movie_ratings)
movie_ratings['number_of_ratings'] = n
movie_ratings = movie_ratings.reset_index()
movie_ratings = movie_ratings.rename(columns={'rating':'average_rating'})
movie_ratings.head()


Unnamed: 0,movieId,average_rating,number_of_ratings
0,1,3.92093,215
1,2,3.431818,110
2,3,3.259615,52
3,4,2.357143,7
4,5,3.071429,49


In [12]:
df = pd.merge(df, movie_ratings, on='movieId')

## Splitting data (Leave one out)

In [14]:
df['Train'] = (df.groupby(by = 'userId').cumcount(ascending=False) != 0).replace({True:1, False:0})

  df['Train'] = (df.groupby(by = 'userId').cumcount(ascending=False) != 0).replace({True:1, False:0})


In [15]:
train = df[df['Train'] == 1]
test = df[df['Train'] == 0]

# X_train = train.drop(columns=['rating', 'Train', 'title'])
# y_train = train['rating']

# X_test = test.drop(columns=['rating', 'Train', 'title'])
# y_test = test['rating']


## Saving our database

In [28]:
df

Unnamed: 0,userId,movieId,rating,title,Children,Adventure,Western,Film-Noir,Thriller,Documentary,...,Fantasy,War,Romance,Mystery,IMAX,Horror,Crime,average_rating,number_of_ratings,Train
0,1,1,4.0,Toy Story (1995),1,1,0,0,0,0,...,1,0,0,0,0,0,0,3.920930,215,1
1,1,3,4.0,Grumpier Old Men (1995),0,0,0,0,0,0,...,0,0,1,0,0,0,0,3.259615,52,1
2,1,6,4.0,Heat (1995),0,0,0,0,1,0,...,0,0,0,0,0,0,1,3.946078,102,1
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),0,0,0,0,1,0,...,0,0,0,1,0,0,0,3.975369,203,1
4,1,50,5.0,"Usual Suspects, The (1995)",0,0,0,0,1,0,...,0,0,0,1,0,0,1,4.237745,204,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,Split (2017),0,0,0,0,1,0,...,0,0,0,0,0,1,0,3.333333,6,1
100832,610,168248,5.0,John Wick: Chapter Two (2017),0,0,0,0,1,0,...,0,0,0,0,0,0,1,4.142857,7,1
100833,610,168250,5.0,Get Out (2017),0,0,0,0,0,0,...,0,0,0,0,0,1,0,3.633333,15,1
100834,610,168252,5.0,Logan (2017),0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.280000,25,1


In [20]:
df.to_csv('../Data/XGBoost_database.csv', index=False)

During inference, we will use this database to get the recommendations for the user. and we will drop the `train` column from the database.<br>

## Model functions

In [26]:
def train_model(data,user_id, test=None, eval = False):


    # select only user data
    train_user = data[data['userId']==user_id]


    X_train = train_user.drop(columns=['userId','rating', 'Train', 'title'])
    y_train = train_user['rating']

    model = XGBRegressor()
    model.fit(X_train,y_train)

    if eval:
        test_user = test[test['userId']== user_id]
        X_test = test_user.drop(columns=['userId','rating', 'Train', 'title'])
        y_test = test_user['rating']
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        print(f'RMSE: {rmse:.4f}')
    # Model evaluation
    # print("Predected rating:", y_pred)
    # print("Actual rating:",y_test)
    # print(X_test)

    return model

def get_user_recommendation(all_moves,model, user_id, n=10):
    # get all movies that the user has not seen
    user_seen_movies = all_moves[all_moves['userId'] == user_id]['title']
    user_unseen_movies = all_moves[~all_moves['title'].isin(user_seen_movies)]

    # drop duplicates
    user_unseen_movies = user_unseen_movies.drop_duplicates(subset=['title'])

    # make predictions
    user_unseen_movies['Pred_rating'] = model.predict(user_unseen_movies.drop(columns=['userId', 'rating', 'Train', 'title']))

    # only return movies with more than 100 ratings

    # get top 10 recommendations
    recommendations = user_unseen_movies.sort_values(by='Pred_rating', ascending=False).head(n)['title']
    return recommendations ,user_seen_movies

user_id = 123
# model = train_model(df, user_id)   
model = train_model(train,user_id,test, eval=True)   
recommendations , user_seen_movies  = get_user_recommendation(df, model, user_id)
print(f"Recommendations for user {user_id}:")
recommendations

RMSE: 0.1215
Recommendations for user 123:


9142                         Seven Sisters (2017)
27243                             The Post (2017)
52697               All That Heaven Allows (1955)
10231                     Running on Empty (1988)
511                                 L.I.E. (2001)
74285                  Written on the Wind (1956)
49810                            Slow West (2015)
24296    Man Who Shot Liberty Valance, The (1962)
9140                           Baby Driver (2017)
14531                            Near Dark (1987)
Name: title, dtype: object

## Neural Collaborative Filtering