# Movie recommender system
## Importing database
https://www.kaggle.com/prajitdatta/movielens-100k-dataset

In [110]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

import datetime as dt 
import re
import pprint

from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine, correlation

In [111]:
# Movies dataframe: only the first three '|'-separated fields of u.item are read
movies_cols = ['movie_id', 'title', 'release_date']
movies = pd.read_csv(
    'ml-100k/u.item',
    sep='|',
    names=movies_cols,
    usecols=range(3),
    encoding='latin-1',
)

# Lower-cased copy of the title, used later for case-insensitive title search
movies['lowercase_title'] = movies['title'].str.lower()

# Remove year from title
# The pattern matches a space followed by a four-digit number in parentheses, e.g. ' (1995)'
year_type_re = re.compile(r' [(]\d{4}[)]', re.IGNORECASE)


def rm_year_from_title(title):
    """Return `title` with every ' (YYYY)' occurrence removed."""
    stripped_title = year_type_re.sub('', title)
    return stripped_title
# Strip the year from the display title and from its lower-case copy
for title_col in ('title', 'lowercase_title'):
    movies[title_col] = movies[title_col].apply(rm_year_from_title)

# Users dataframe
users_col = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code'
]
# None of the columns read here is a date, so no date parsing is requested
users = pd.read_csv('ml-100k/u.user', sep='|', names=users_col, encoding='latin-1')

# Ratings dataframe: only the first three tab-separated fields of u.data are read
ratings_col = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'ml-100k/u.data',
    sep='\t',
    names=ratings_col,
    usecols=range(3),
    encoding='latin-1',
)

# Merge the three dataframes into one: ratings are attached to movies on
# 'movie_id', then the user details are attached on 'user_id'.
# The keys are spelled out so the merge does not silently depend on which
# column names the frames happen to share.
movie_ratings = pd.merge(movies, ratings, on='movie_id')
df = pd.merge(movie_ratings, users, on='user_id')

In [112]:
df.head()

Unnamed: 0,movie_id,title,release_date,lowercase_title,user_id,rating,age,sex,occupation,zip_code
0,1,Toy Story,01-Jan-1995,toy story,308,4,60,M,retired,95076
1,4,Get Shorty,01-Jan-1995,get shorty,308,5,60,M,retired,95076
2,5,Copycat,01-Jan-1995,copycat,308,4,60,M,retired,95076
3,7,Twelve Monkeys,01-Jan-1995,twelve monkeys,308,4,60,M,retired,95076
4,8,Babe,01-Jan-1995,babe,308,5,60,M,retired,95076


In [113]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   movie_id         100000 non-null  int64 
 1   title            100000 non-null  object
 2   release_date     99991 non-null   object
 3   lowercase_title  100000 non-null  object
 4   user_id          100000 non-null  int64 
 5   rating           100000 non-null  int64 
 6   age              100000 non-null  int64 
 7   sex              100000 non-null  object
 8   occupation       100000 non-null  object
 9   zip_code         100000 non-null  object
dtypes: int64(4), object(6)
memory usage: 8.4+ MB


In [114]:
# Count duplicated rows (True marks a row that repeats an earlier one)
df.duplicated().value_counts()

False    100000
dtype: int64

In [115]:
df.head()

Unnamed: 0,movie_id,title,release_date,lowercase_title,user_id,rating,age,sex,occupation,zip_code
0,1,Toy Story,01-Jan-1995,toy story,308,4,60,M,retired,95076
1,4,Get Shorty,01-Jan-1995,get shorty,308,5,60,M,retired,95076
2,5,Copycat,01-Jan-1995,copycat,308,4,60,M,retired,95076
3,7,Twelve Monkeys,01-Jan-1995,twelve monkeys,308,4,60,M,retired,95076
4,8,Babe,01-Jan-1995,babe,308,5,60,M,retired,95076


In [116]:
# Number of ratings each movie has received
count_rate = (
    df.groupby('movie_id')
      .agg({'rating': [np.size]})
      .reset_index()
)
count_rate

Unnamed: 0_level_0,movie_id,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,size
0,1,452
1,2,131
2,3,90
3,4,209
4,5,86
...,...,...
1677,1678,1
1678,1679,1
1679,1680,1
1680,1681,1


In [117]:
# Join the per-movie rating count to the global df.
# count_rate has two-level columns while df has single-level ones, and pandas
# warns (older versions) or raises (2.x) when joining frames whose columns have
# a different number of levels. The count column labels are therefore flattened
# to plain tuples first; the joined column is still addressed as ('rating', 'size').
rating_counts = count_rate.set_index('movie_id')
rating_counts.columns = rating_counts.columns.to_flat_index()
df = df.join(rating_counts, on='movie_id', how='left')



In [118]:
# Spot-check one movie (movie_id 50): all of its rows should show the same rating count
df.loc[df['movie_id'] == 50]

Unnamed: 0,movie_id,title,release_date,lowercase_title,user_id,rating,age,sex,occupation,zip_code,"(rating, size)"
26,50,Star Wars,01-Jan-1977,star wars,308,5,60,M,retired,95076,583
403,50,Star Wars,01-Jan-1977,star wars,287,5,21,M,salesman,31211,583
462,50,Star Wars,01-Jan-1977,star wars,148,5,33,M,engineer,97006,583
541,50,Star Wars,01-Jan-1977,star wars,280,3,30,F,librarian,22903,583
788,50,Star Wars,01-Jan-1977,star wars,66,5,23,M,student,80521,583
...,...,...,...,...,...,...,...,...,...,...,...
94846,50,Star Wars,01-Jan-1977,star wars,507,5,18,F,writer,28450,583
94904,50,Star Wars,01-Jan-1977,star wars,127,4,33,M,none,73439,583
94927,50,Star Wars,01-Jan-1977,star wars,781,5,20,M,student,48825,583
94969,50,Star Wars,01-Jan-1977,star wars,183,2,33,M,scientist,27708,583


In [119]:
# Drop all movies that have been rated fewer than MIN_RATINGS times
MIN_RATINGS = 50
df = df[df[('rating', 'size')] >= MIN_RATINGS]

In [120]:
df.describe()

Unnamed: 0,movie_id,user_id,rating,age,"(rating, size)"
count,83715.0,83715.0,83715.0,83715.0,83715.0
mean,340.040793,463.684119,3.629111,33.095968,195.296554
std,248.431534,267.570213,1.079863,11.47202,114.577354
min,1.0,1.0,1.0,7.0,50.0
25%,153.0,254.0,3.0,24.0,104.0
50%,276.0,450.0,4.0,30.0,170.0
75%,496.0,684.0,4.0,40.0,256.0
max,1119.0,943.0,5.0,73.0,583.0


## Pivot table : User ratings vs movies

In [109]:
# One row per movie, one column per user; a missing rating is stored as 0
df_pivoted = (
    df.pivot_table(index=['movie_id'], columns=['user_id'], values='rating')
      .fillna(0)
)
df_pivoted

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
2,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1074,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
1101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### sklearn.metrics.pairwise_distances:
Compute the distance matrix from a vector array X and optional Y.

This method takes either a vector array or a distance matrix, and returns a distance matrix. If the input is a vector array, the distances are computed. If the input is a distances matrix, it is returned instead.

This method provides a safe way to take a distance matrix as input, while preserving compatibility with many other algorithms that take a vector array.

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html

Metrics: `pairwise_distances` also accepts the metrics from `scipy.spatial.distance`, which compute pairwise distances between observations in n-dimensional space:

https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

In [17]:
# Generate similarity matrix to compare movies based on user ratings
# (cosine similarity = 1 - cosine distance between the movies' rating rows)
matrix_similarities = 1 - pairwise_distances(df_pivoted, metric="cosine")
np.fill_diagonal(matrix_similarities, 0)  # Avoid saying that a movie is similar to itself

# Label rows and columns with the movie ids at construction time, so no helper
# column has to be added and removed again (which would turn the integer
# column labels into generic objects).
df_similarities = pd.DataFrame(
    matrix_similarities,
    index=pd.Index(df_pivoted.index, name='movies_id'),
    columns=list(df_pivoted.index),
)

df_similarities

Unnamed: 0_level_0,1,2,4,7,8,9,11,12,13,14,...,845,866,879,895,926,928,1012,1016,1028,1047
movies_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000000,0.402382,0.454938,0.620979,0.481114,0.496288,0.468291,0.460392,0.417509,0.347678,...,0.476322,0.379421,0.206159,0.188549,0.394245,0.360728,0.349093,0.415372,0.422974,0.455343
2,0.402382,0.000000,0.502571,0.383403,0.337002,0.255252,0.468506,0.459946,0.213972,0.125463,...,0.254058,0.214651,0.164033,0.185087,0.305089,0.243020,0.206733,0.317261,0.282294,0.284650
4,0.454938,0.502571,0.000000,0.489283,0.490236,0.419044,0.588337,0.584884,0.397251,0.266764,...,0.303499,0.246952,0.177039,0.198952,0.257198,0.247254,0.306567,0.385563,0.283786,0.323819
7,0.620979,0.383403,0.489283,0.000000,0.423515,0.527462,0.568844,0.568886,0.467391,0.330617,...,0.357101,0.302525,0.197934,0.220151,0.334532,0.373421,0.381370,0.369631,0.344687,0.421239
8,0.481114,0.337002,0.490236,0.423515,0.000000,0.424429,0.435477,0.504906,0.348291,0.302722,...,0.326620,0.256586,0.158162,0.125731,0.199094,0.217191,0.268811,0.307688,0.283091,0.236382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
928,0.360728,0.243020,0.247254,0.373421,0.217191,0.214574,0.339508,0.291698,0.220538,0.111821,...,0.295799,0.246986,0.101302,0.182888,0.368132,0.000000,0.210616,0.258896,0.373994,0.422626
1012,0.349093,0.206733,0.306567,0.381370,0.268811,0.283574,0.294105,0.285705,0.266640,0.186282,...,0.277110,0.234425,0.113430,0.141172,0.214802,0.210616,0.000000,0.357965,0.303062,0.275603
1016,0.415372,0.317261,0.385563,0.369631,0.307688,0.262824,0.411792,0.353192,0.193393,0.129069,...,0.331196,0.304402,0.247521,0.233304,0.347783,0.258896,0.357965,0.000000,0.323799,0.384170
1028,0.422974,0.282294,0.283786,0.344687,0.283091,0.287939,0.304968,0.260714,0.219150,0.125556,...,0.354275,0.483718,0.141465,0.130989,0.430962,0.373994,0.303062,0.323799,0.000000,0.484262


## User ratings

In [165]:
select_title = 'Net'

# Find matching results: case-insensitive plain-substring search on the title.
# regex=False keeps characters such as '(' or '+' in the search text literal,
# and na=False treats a missing title as "no match".
title_matches = df['lowercase_title'].str.contains(select_title.lower(), regex=False, na=False)
df.loc[title_matches, ['title', 'release_date', 'movie_id']].drop_duplicates()

Unnamed: 0,title,release_date,movie_id
204,Forbidden Planet,01-Jan-1956,434
539,"Net, The",01-Jan-1995,38


In [166]:
# Ratings entered by the user: key = movie_id, value = rating given
input_data = {
    50: 5,    # Star Wars
    181: 5,   # Return of the Jedi
    38: 2,    # Net, The
}

## Recommender system

In [167]:
def find_movie_id(id):
    """Return the title of the first row of `df` whose movie_id equals `id`.

    Despite its name, this maps a movie id to a title. An IndexError is raised
    by `.iloc[0]` when no row of `df` has that movie_id.
    """
    matching_titles = df.loc[df['movie_id'] == id, 'title']
    return matching_titles.iloc[0]

In [168]:
# Re-key the entered ratings by movie title instead of movie_id
my_ratings = pd.Series({
    find_movie_id(movie_id): rating
    for movie_id, rating in input_data.items()
})

In [169]:
# Check compliance:
print('Your ratings are...')
my_ratings

Your ratings are...


Star Wars             5
Return of the Jedi    5
Net, The              2
dtype: int64

In [170]:
# Transform a movie title back into its movie id
def find_id_movie(movie):
    """Return the movie_id of the first row of `movies` whose title equals `movie`.

    Titles in `movies` have had their release year stripped, so several rows
    may share the same title; in that case the first matching row is used
    instead of failing on the multi-element selection.

    Parameters
    ----------
    movie : str
        Title to look up, compared for exact equality with `movies['title']`.

    Returns
    -------
    int
        The movie_id of the first matching row.

    Raises
    ------
    KeyError
        If no row of `movies` has that title.
    """
    matching_ids = movies.loc[movies['title'] == movie, 'movie_id']
    if matching_ids.empty:
        raise KeyError(f'No movie with title {movie!r}')
    return int(matching_ids.iloc[0])

def recommender(user_ratings):
    """Return the ten best-scoring movies for a set of user ratings.

    Each candidate movie's score is the sum, over the movies the user rated,
    of (user rating) x (similarity between the candidate and the rated movie),
    with similarities read from `df_similarities`.

    Parameters
    ----------
    user_ratings : pd.Series
        Ratings indexed by movie title; titles are converted to movie ids
        with `find_id_movie`.

    Returns
    -------
    pd.DataFrame
        At most ten rows, best score first, with the columns 'title',
        'release_date', 'movie_id', 'similarity' and ('rating', 'size').
        Movies the user has already rated are not returned.

    Raises
    ------
    KeyError
        If a rated movie has no column in `df_similarities`.
    """
    # Transform the user rating series index to movie id
    ratings_by_id = {
        find_id_movie(title): rating
        for title, rating in user_ratings.items()
    }

    # For every rated movie, weight its similarity column by the user's rating.
    # Based on code from https://www.udemy.com/course/data-science-and-machine-learning-with-python-hands-on/
    weighted_scores = [
        df_similarities[movie_id].dropna() * rating
        for movie_id, rating in ratings_by_id.items()
    ]

    # Concatenate once (Series.append no longer exists in pandas 2.x and is
    # quadratic when called in a loop); pd.concat rejects an empty list.
    if weighted_scores:
        similar_candidates = pd.concat(weighted_scores)
    else:
        similar_candidates = pd.Series(dtype='float64')

    # Sum similarities from movies appearing several times (similar to more than 1 movie rated by the user)
    similar_candidates = similar_candidates.groupby(level=0).sum()

    # Do not recommend movies the user has already rated
    similar_candidates = similar_candidates.drop(labels=list(ratings_by_id), errors='ignore')

    # Sort movies by score, best first
    similar_candidates = similar_candidates.sort_values(ascending=False)

    df_result = pd.DataFrame({
        'similarity': similar_candidates.to_numpy(),
        'movie_id': similar_candidates.index,
    })

    # One descriptive row per movie, taken from the global df, so the merge
    # below does not multiply each candidate by its number of ratings
    movie_info = df[[
        'title',
        'release_date',
        'movie_id',
        ('rating', 'size')
    ]].drop_duplicates()

    # Merge to get title, release date and rating count (left order, i.e. the
    # score ranking, is preserved)
    df_result = pd.merge(df_result, movie_info, on='movie_id', how='inner')[[
        'title',
        'release_date',
        'movie_id',
        'similarity',
        ('rating', 'size')
    ]]

    return df_result.head(10)

In [171]:
recommender(my_ratings)

Unnamed: 0,title,release_date,movie_id,similarity,"(rating, size)"
0,Raiders of the Lost Ark,01-Jan-1981,174,8.425414,420
420,"Empire Strikes Back, The",01-Jan-1980,172,8.387856,367
787,Toy Story,01-Jan-1995,1,7.909325,452
1239,Indiana Jones and the Last Crusade,01-Jan-1989,210,7.884062,331
1570,Independence Day (ID4),03-Jul-1996,121,7.855092,429
1999,Back to the Future,01-Jan-1985,204,7.644042,350
2349,Star Trek: First Contact,22-Nov-1996,222,7.484834,365
2714,"Fugitive, The",01-Jan-1993,79,7.470028,336
3050,"Terminator, The",01-Jan-1984,195,7.460332,301
3351,Terminator 2: Judgment Day,01-Jan-1991,96,7.334983,295
