# Content-based Movie Recommender

In [1]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load the MovieLens data

You can download the file `ml-latest.zip`[here](https://grouplens.org/datasets/movielens/) and then unzip into the `data/` directory.

In [2]:
# Read dataframes
df_movies = pd.read_csv('data/movies.csv')
df_links = pd.read_csv('data/links.csv')
df_ratings = pd.read_csv('data/ratings.csv')
df_genome_tags = pd.read_csv('data/genome-tags.csv')
df_genome_scores = pd.read_csv('data/genome-scores.csv')

# Merge scores and tags
df_movie_tags_in_text = pd.merge(df_genome_scores, df_genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

# Only keep tags with relevance higher than 0.3
df_movie_tags = df_genome_scores[df_genome_scores.relevance > 0.3][['movieId', 'tagId']]

### Which is movie with Id 1?

In [3]:
df_movies[df_movies.movieId == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


### Let's have a look at a few of the tags Toy Story 

In [4]:
df_movie_tags[df_movie_tags['movieId'] == 1].merge(df_genome_tags, on='tagId').sample(10)

Unnamed: 0,movieId,tagId,tag
150,1,844,redemption
129,1,719,not funny
14,1,93,awesome
195,1,1108,whimsical
94,1,481,gunfight
155,1,897,secrets
164,1,967,stereotypes
158,1,921,simple
154,1,886,sci fi
85,1,452,good soundtrack


### Encode features

In [5]:
df_movie_tags

Unnamed: 0,movieId,tagId
10,1,11
18,1,19
19,1,20
28,1,29
29,1,30
...,...,...
18472095,288167,1096
18472100,288167,1101
18472107,288167,1108
18472113,288167,1114


In [6]:
df_genome_tags

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s
...,...,...
1123,1124,writing
1124,1125,wuxia
1125,1126,wwii
1126,1127,zombie


In [7]:
df_tags_to_movies = pd.merge(df_movie_tags, df_genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
df_tags_to_movies['tagId'] = df_tags_to_movies.tagId.astype(str)
df_tags_to_movies

Unnamed: 0,movieId,tagId
0,1,11
1,1,19
2,1,20
3,1,29
4,1,30
...,...,...
1697797,288167,1096
1697798,288167,1101
1697799,288167,1108
1697800,288167,1114


In [8]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [9]:
df_tags_per_movie = df_tags_to_movies.groupby('movieId')['tagId'].agg(_concatenate_tags_of_movie)
df_tags_per_movie.name = 'movie_tags'
df_tags_per_movie = df_tags_per_movie.reset_index()

In [10]:
df_tags_per_movie[df_tags_per_movie['movieId'] == 1]

Unnamed: 0,movieId,movie_tags
0,1,138 1051 1034 353 388 382 1007 215 747 942 45 ...


In [11]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119
...,...,...,...,...
33832157,330975,8340,2.0,1091583256
33832158,330975,8493,2.5,1091585709
33832159,330975,8622,4.0,1091581777
33832160,330975,8665,3.0,1091581765


In [12]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings 

Unnamed: 0_level_0,mean,median,size
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.893508,4.0,76813
2,3.278179,3.0,30209
3,3.171271,3.0,15820
4,2.868395,3.0,3028
5,3.076957,3.0,15801
...,...,...,...
288967,3.500000,3.5,1
288971,0.500000,0.5,1
288975,4.000000,4.0,1
288977,3.000000,3.0,1


In [13]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings.columns = ['rating_mean', 'rating_median', 'num_ratingsdf_tags_per_movie']
df_avg_ratings = df_avg_ratings.reset_index()

In [14]:
df_movies_with_ratings = pd.merge(df_movies, df_avg_ratings, how='left', on='movieId')

In [15]:
df_data = pd.merge(df_movies_with_ratings, df_tags_per_movie, how='left', on='movieId')

In [16]:
df_data_with_tags = df_data[~df_data.movie_tags.isnull()].reset_index(drop=True)
df_data_with_tags

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.893508,4.0,76813.0,138 1051 1034 353 388 382 1007 215 747 942 45 ...
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.278179,3.0,30209.0,98 388 382 304 844 761 971 378 646 776 266 497...
2,3,Grumpier Old Men (1995),Comedy|Romance,3.171271,3.0,15820.0,920 465 919 1102 1057 387 388 445 867 926 230 ...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.868395,3.0,3028.0,1062 201 900 387 388 445 867 614 913 374 1070 ...
4,5,Father of the Bride Part II (1995),Comedy,3.076957,3.0,15801.0,138 355 900 1102 455 387 388 445 867 926 602 2...
...,...,...,...,...,...,...,...
16371,286901,The Flash (2023),Action|Adventure|Sci-Fi,3.038043,3.0,92.0,1051 353 919 388 913 1027 304 844 864 971 1071...
16372,286905,Indiana Jones and the Dial of Destiny (2023),Action|Adventure,3.276119,3.5,134.0,353 919 388 382 913 942 844 761 524 774 832 55...
16373,287377,Fast X (2023),Action|Crime,2.815217,3.0,92.0,919 382 913 732 844 258 498 829 160 971 609 37...
16374,287633,Asteroid City (2023),Comedy|Drama|Romance|Sci-Fi,3.620690,3.5,145.0,138 311 844 258 829 971 337 378 646 266 889 91...


### TF-IDF vectors

In [17]:
tf_idf = TfidfVectorizer()

In [18]:
len(df_data_with_tags.movie_tags[0])

797

In [19]:
df_data_with_tags.movie_tags

0        138 1051 1034 353 388 382 1007 215 747 942 45 ...
1        98 388 382 304 844 761 971 378 646 776 266 497...
2        920 465 919 1102 1057 387 388 445 867 926 230 ...
3        1062 201 900 387 388 445 867 614 913 374 1070 ...
4        138 355 900 1102 455 387 388 445 867 926 602 2...
                               ...                        
16371    1051 353 919 388 913 1027 304 844 864 971 1071...
16372    353 919 388 382 913 942 844 761 524 774 832 55...
16373    919 382 913 732 844 258 498 829 160 971 609 37...
16374    138 311 844 258 829 971 337 378 646 266 889 91...
16375    1051 1034 164 353 382 70 913 1007 844 258 498 ...
Name: movie_tags, Length: 16376, dtype: object

In [20]:
df_movies_tf_idf_described = tf_idf.fit_transform(df_data_with_tags.movie_tags)

In [21]:
df_movies_tf_idf_described

<16376x1119 sparse matrix of type '<class 'numpy.float64'>'
	with 1690520 stored elements in Compressed Sparse Row format>

In [22]:
m2m = cosine_similarity(df_movies_tf_idf_described)

In [23]:
m2m[0]

array([1.        , 0.37962821, 0.14451215, ..., 0.25534355, 0.2249172 ,
       0.3087429 ])

In [24]:
df_tfidf_m2m = pd.DataFrame(cosine_similarity(df_movies_tf_idf_described))

In [25]:
df_tfidf_m2m

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16366,16367,16368,16369,16370,16371,16372,16373,16374,16375
0,1.000000,0.379628,0.144512,0.137713,0.178240,0.298222,0.245527,0.178135,0.096275,0.225245,...,0.431068,0.256292,0.303731,0.145661,0.494795,0.285858,0.363925,0.255344,0.224917,0.308743
1,0.379628,1.000000,0.137079,0.159790,0.170910,0.100493,0.206269,0.245843,0.134397,0.204641,...,0.399514,0.131929,0.252955,0.118443,0.298011,0.329786,0.404351,0.308838,0.197086,0.273383
2,0.144512,0.137079,1.000000,0.180735,0.467717,0.062162,0.291692,0.125420,0.085931,0.140051,...,0.205660,0.120554,0.199574,0.096373,0.153900,0.311616,0.206411,0.164879,0.121409,0.138584
3,0.137713,0.159790,0.180735,1.000000,0.231167,0.063929,0.327313,0.237519,0.047743,0.062415,...,0.168045,0.129887,0.153314,0.074440,0.170369,0.143986,0.176181,0.099562,0.133688,0.098377
4,0.178240,0.170910,0.467717,0.231167,1.000000,0.055294,0.275303,0.120743,0.117387,0.103979,...,0.230024,0.120001,0.189535,0.123149,0.220936,0.221167,0.230767,0.114495,0.102969,0.147752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16371,0.285858,0.329786,0.311616,0.143986,0.221167,0.144519,0.164069,0.138332,0.140364,0.231748,...,0.402947,0.160706,0.395326,0.136327,0.315797,1.000000,0.427665,0.364402,0.170626,0.331611
16372,0.363925,0.404351,0.206411,0.176181,0.230767,0.215755,0.252928,0.164444,0.202562,0.334906,...,0.562953,0.236918,0.419097,0.214603,0.407445,0.427665,1.000000,0.388149,0.189176,0.490294
16373,0.255344,0.308838,0.164879,0.099562,0.114495,0.221061,0.158150,0.088949,0.204815,0.308057,...,0.434861,0.276426,0.392325,0.144863,0.348712,0.364402,0.388149,1.000000,0.214254,0.398785
16374,0.224917,0.197086,0.121409,0.133688,0.102969,0.225534,0.198766,0.183798,0.052413,0.133732,...,0.320372,0.376673,0.239804,0.295528,0.272746,0.170626,0.189176,0.214254,1.000000,0.165940


In [26]:
index_to_movie_id = df_data_with_tags['movieId']


In [27]:
df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]

In [28]:
df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]

In [29]:
df_tfidf_m2m

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,286131,286355,286357,286777,286897,286901,286905,287377,287633,288167
1,1.000000,0.379628,0.144512,0.137713,0.178240,0.298222,0.245527,0.178135,0.096275,0.225245,...,0.431068,0.256292,0.303731,0.145661,0.494795,0.285858,0.363925,0.255344,0.224917,0.308743
2,0.379628,1.000000,0.137079,0.159790,0.170910,0.100493,0.206269,0.245843,0.134397,0.204641,...,0.399514,0.131929,0.252955,0.118443,0.298011,0.329786,0.404351,0.308838,0.197086,0.273383
3,0.144512,0.137079,1.000000,0.180735,0.467717,0.062162,0.291692,0.125420,0.085931,0.140051,...,0.205660,0.120554,0.199574,0.096373,0.153900,0.311616,0.206411,0.164879,0.121409,0.138584
4,0.137713,0.159790,0.180735,1.000000,0.231167,0.063929,0.327313,0.237519,0.047743,0.062415,...,0.168045,0.129887,0.153314,0.074440,0.170369,0.143986,0.176181,0.099562,0.133688,0.098377
5,0.178240,0.170910,0.467717,0.231167,1.000000,0.055294,0.275303,0.120743,0.117387,0.103979,...,0.230024,0.120001,0.189535,0.123149,0.220936,0.221167,0.230767,0.114495,0.102969,0.147752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286901,0.285858,0.329786,0.311616,0.143986,0.221167,0.144519,0.164069,0.138332,0.140364,0.231748,...,0.402947,0.160706,0.395326,0.136327,0.315797,1.000000,0.427665,0.364402,0.170626,0.331611
286905,0.363925,0.404351,0.206411,0.176181,0.230767,0.215755,0.252928,0.164444,0.202562,0.334906,...,0.562953,0.236918,0.419097,0.214603,0.407445,0.427665,1.000000,0.388149,0.189176,0.490294
287377,0.255344,0.308838,0.164879,0.099562,0.114495,0.221061,0.158150,0.088949,0.204815,0.308057,...,0.434861,0.276426,0.392325,0.144863,0.348712,0.364402,0.388149,1.000000,0.214254,0.398785
287633,0.224917,0.197086,0.121409,0.133688,0.102969,0.225534,0.198766,0.183798,0.052413,0.133732,...,0.320372,0.376673,0.239804,0.295528,0.272746,0.170626,0.189176,0.214254,1.000000,0.165940


### Most similar movies to Toy Story

In [30]:
df_tfidf_m2m.iloc[0].sort_values(ascending=False)[:10]

1        1.000000
3114     0.728797
78499    0.728585
4886     0.673833
8961     0.652712
2355     0.642130
6377     0.637588
68954    0.636667
50872    0.614647
97913    0.606383
Name: 1, dtype: float64

In [31]:
df_data_with_tags[df_data_with_tags.movieId == 3114]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
2820,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.809132,4.0,34495.0,138 1051 1046 388 382 1007 215 304 844 761 971...


In [32]:
df_data_with_tags[df_data_with_tags.movieId == 4886]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
4429,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.840528,4.0,48441.0,620 977 138 1051 919 388 382 461 1007 215 844 ...
