In [1]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [2]:
# Read dataframes
df_movies = pd.read_csv('ml-latest-small/movies.csv')
df_links = pd.read_csv('ml-latest-small/links.csv')
df_ratings = pd.read_csv('ml-latest-small/ratings.csv')
df_genome_tags = pd.read_csv('data/genome-tags.csv')
df_genome_scores = pd.read_csv('data/genome-scores.csv')

# Merge scores and tags
df_movie_tags_in_text = pd.merge(df_genome_scores, df_genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

# Only keep tags with relevance higher than 0.3
df_movie_tags = df_genome_scores[df_genome_scores.relevance > 0.3][['movieId', 'tagId']]

In [3]:
df_movie_tags

Unnamed: 0,movieId,tagId
10,1,11
18,1,19
20,1,21
21,1,22
28,1,29
...,...,...
14862490,187595,1091
14862491,187595,1092
14862495,187595,1096
14862500,187595,1101


In [4]:
df_movies[df_movies.movieId == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
df_movie_tags[df_movie_tags['movieId'] == 1].merge(df_genome_tags, on='tagId').sample(10)

Unnamed: 0,movieId,tagId,tag
161,1,982,stylized
164,1,999,suspense
107,1,602,life philosophy
128,1,752,oscar (best effects - visual effects)
81,1,454,good versus evil
176,1,1064,unusual plot structure
24,1,193,cgi
53,1,353,enormously long battle scene
73,1,420,futuristic
49,1,323,drama


In [6]:
df_tags_to_movies = pd.merge(df_movie_tags, df_genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
df_tags_to_movies['tagId'] = df_tags_to_movies.tagId.astype(str)

In [7]:
df_tags_to_movies['tagId']

0            11
1            19
2            21
3            22
4            29
           ... 
1471041    1091
1471042    1092
1471043    1096
1471044    1101
1471045    1114
Name: tagId, Length: 1471046, dtype: object

In [8]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [9]:
df_tags_per_movie = df_tags_to_movies.groupby('movieId')['tagId'].agg(_concatenate_tags_of_movie)
df_tags_per_movie.name = 'movie_tags'
df_tags_per_movie = df_tags_per_movie.reset_index()

In [10]:
df_tags_per_movie

Unnamed: 0,movieId,movie_tags
0,1,11 338 445 719 29 33 323 371 675 61 368 21 951...
1,2,445 719 29 371 61 21 951 382 662 245 881 389 6...
2,3,777 865 445 505 1057 29 807 277 926 1102 292 1...
3,4,497 640 22 445 967 807 1116 299 439 128 179 21...
4,5,777 849 497 865 445 967 926 807 277 808 1102 1...
...,...,...
13171,185435,1123 445 926 21 100 389 333 335 1040 971 990 7...
13172,185585,777 886 332 719 29 807 277 889 933 982 1116 88...
13173,186587,177 338 45 426 323 887 951 929 100 536 353 333...
13174,187593,191 323 21 780 108 382 536 547 51 1040 311 797...


In [11]:
df_tags_per_movie[df_tags_per_movie['movieId'] == 1]

Unnamed: 0,movieId,movie_tags
0,1,11 338 445 719 29 33 323 371 675 61 368 21 951...


In [12]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings.columns = ['rating_mean', 'rating_median', 'num_ratingsdf_tags_per_movie']
df_avg_ratings = df_avg_ratings.reset_index()

In [13]:
df_avg_ratings

Unnamed: 0,movieId,rating_mean,rating_median,num_ratingsdf_tags_per_movie
0,1,3.920930,4.0,215
1,2,3.431818,3.5,110
2,3,3.259615,3.0,52
3,4,2.357143,3.0,7
4,5,3.071429,3.0,49
...,...,...,...,...
9719,193581,4.000000,4.0,1
9720,193583,3.500000,3.5,1
9721,193585,3.500000,3.5,1
9722,193587,3.500000,3.5,1


In [14]:
df_movies_with_ratings = pd.merge(df_movies, df_avg_ratings, how='left', on='movieId')

In [15]:
df_movies_with_ratings

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,4.0,215.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.5,110.0
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,52.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,3.0,7.0
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,49.0
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,4.0,1.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,3.5,1.0
9739,193585,Flint (2017),Drama,3.500000,3.5,1.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,3.5,1.0


In [16]:
df_data = pd.merge(df_movies_with_ratings, df_tags_per_movie, how='left', on='movieId')

In [17]:
df_data

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,4.0,215.0,11 338 445 719 29 33 323 371 675 61 368 21 951...
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.5,110.0,445 719 29 371 61 21 951 382 662 245 881 389 6...
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.0,52.0,777 865 445 505 1057 29 807 277 926 1102 292 1...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,3.0,7.0,497 640 22 445 967 807 1116 299 439 128 179 21...
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.0,49.0,777 849 497 865 445 967 926 807 277 808 1102 1...
...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000000,4.0,1.0,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500000,3.5,1.0,
9739,193585,Flint (2017),Drama,3.500000,3.5,1.0,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500000,3.5,1.0,


In [18]:
df_data_with_tags = df_data[~df_data.movie_tags.isnull()].reset_index(drop=True)

In [19]:
df_data_with_tags

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.920930,4.00,215.0,11 338 445 719 29 33 323 371 675 61 368 21 951...
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818,3.50,110.0,445 719 29 371 61 21 951 382 662 245 881 389 6...
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615,3.00,52.0,777 865 445 505 1057 29 807 277 926 1102 292 1...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143,3.00,7.0,497 640 22 445 967 807 1116 299 439 128 179 21...
4,5,Father of the Bride Part II (1995),Comedy,3.071429,3.00,49.0,777 849 497 865 445 967 926 807 277 808 1102 1...
...,...,...,...,...,...,...,...
8741,185435,"Game Over, Man! (2018)",Action|Comedy,3.000000,3.00,1.0,1123 445 926 21 100 389 333 335 1040 971 990 7...
8742,185585,Pacific Rim: Uprising (2018),Action|Fantasy|Sci-Fi,2.750000,2.75,2.0,777 886 332 719 29 807 277 889 933 982 1116 88...
8743,186587,Rampage (2018),Action|Adventure|Sci-Fi,3.000000,3.00,1.0,177 338 45 426 323 887 951 929 100 536 353 333...
8744,187593,Deadpool 2 (2018),Action|Comedy|Sci-Fi,3.875000,4.00,12.0,191 323 21 780 108 382 536 547 51 1040 311 797...


In [20]:
tf_idf = TfidfVectorizer()

In [21]:
df_movies_tf_idf_described = tf_idf.fit_transform(df_data_with_tags.movie_tags)

In [None]:
m2m = cosine_similarity(df_movies_tf_idf_described)

In [27]:
df_movies_tf_idf_described.shape

(13176, 1119)

In [28]:
df_tfidf_m2m = pd.DataFrame(cosine_similarity(df_movies_tf_idf_described))

In [29]:
index_to_movie_id = df_data_with_tags['movieId']

In [30]:
df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]

In [31]:
df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]

In [35]:
df_tfidf_m2m.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,184987,184997,185029,185135,185425,185435,185585,186587,187593,187595
1,1.0,0.359995,0.140584,0.163904,0.197146,0.267026,0.240104,0.233925,0.075557,0.223134,...,0.231415,0.323718,0.449159,0.415062,0.115754,0.126076,0.154487,0.238729,0.376306,0.339774
2,0.359995,1.0,0.116658,0.123059,0.119013,0.090835,0.215883,0.221415,0.167558,0.22194,...,0.309822,0.231912,0.207119,0.253158,0.151519,0.122537,0.183769,0.244879,0.20468,0.287186
3,0.140584,0.116658,1.0,0.192486,0.407801,0.090215,0.246536,0.151995,0.077091,0.142224,...,0.118169,0.198064,0.173156,0.146563,0.090056,0.131115,0.226738,0.147758,0.202069,0.204408
4,0.163904,0.123059,0.192486,1.0,0.278716,0.07574,0.334642,0.200485,0.049504,0.079378,...,0.151011,0.195374,0.211978,0.181477,0.214305,0.168674,0.164492,0.162263,0.170269,0.199378
5,0.197146,0.119013,0.407801,0.278716,1.0,0.085531,0.309019,0.151632,0.067623,0.109039,...,0.14701,0.264331,0.18241,0.163857,0.117392,0.133729,0.15714,0.148886,0.199741,0.16228


In [42]:
df_tfidf_m2m.iloc[0].sort_values(ascending=False)[1:11]

3114     0.785583
4886     0.773566
78499    0.747440
2355     0.728273
6377     0.705095
68954    0.668650
8961     0.660274
50872    0.653690
4306     0.652549
364      0.632700
Name: 1, dtype: float64

In [37]:
df_data_with_tags[df_data_with_tags.movieId == 3114]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
2809,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.809977,4.0,29820.0,538 338 664 1114 519 743 472 19 244 777 62 61 ...


In [25]:
df_data_with_tags[df_data_with_tags.movieId == 4886]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
4331,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.861679,4.0,8708.0,216 215 669 664 663 765 136 497 490 493 690 10...


In [26]:
df_data_with_tags[df_data_with_tags.title.str.contains('Terminator 2')]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
555,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.947648,4.0,16093.0,216 768 215 212 1085 668 452 761 663 767 132 8...


In [44]:
df_tfidf_m2m.iloc[3114][555]

0.06361854820409465

In [66]:
df_user_ratings = df_ratings[df_ratings.userId == 2000]

In [67]:
df_user_data_with_tags = df_data_with_tags.reset_index().merge(df_user_ratings, on='movieId')

In [70]:
df_user_data_with_tags[['title', 'rating']]

Unnamed: 0,title,rating
0,Toy Story (1995),4.0
1,Jumanji (1995),3.5
2,Ace Ventura: When Nature Calls (1995),1.0
3,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),4.0
4,Babe (1995),3.5
...,...,...
237,Star Wars: Episode VII - The Force Awakens (2015),3.0
238,The Martian (2015),3.0
239,Inside Out (2015),4.0
240,Spotlight (2015),4.5


In [71]:
df_user_data_with_tags['weight'] = df_user_data_with_tags['rating']/5.

In [88]:
df_user_data_with_tags

Unnamed: 0,index,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags,userId,rating,timestamp,weight
0,0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.886649,4.0,68469.0,982 338 1093 664 830 990 1114 519 743 472 19 2...,2000,4.0,1455860873,0.8
1,1,2,Jumanji (1995),Adventure|Children|Fantasy,3.246583,3.0,27143.0,112 1093 396 1113 502 743 19 244 51 777 889 62...,2000,3.5,1456119319,0.7
2,18,19,Ace Ventura: When Nature Calls (1995),Comedy,2.642014,3.0,24913.0,763 558 594 893 216 418 777 801 62 265 264 807...,2000,1.0,1455861164,0.2
3,31,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,3.890193,4.0,54846.0,538 982 52 222 558 1093 243 396 547 46 715 830...,2000,4.0,1455998568,0.8
4,33,34,Babe (1995),Children|Drama,3.613834,4.0,35903.0,338 46 1114 244 62 61 603 497 22 378 469 468 6...,2000,3.5,1455861041,0.7
...,...,...,...,...,...,...,...,...,...,...,...,...
237,12105,122886,Star Wars: Episode VII - The Force Awakens (2015),Action|Adventure|Fantasy|Sci-Fi|IMAX,3.794736,4.0,12747.0,982 52 222 833 1005 338 850 1093 396 911 502 9...,2000,3.0,1455823417,0.6
238,12337,134130,The Martian (2015),Adventure|Drama|Sci-Fi,4.043812,4.0,16160.0,555 338 1093 243 396 46 830 945 1114 519 743 1...,2000,3.0,1455999133,0.6
239,12357,134853,Inside Out (2015),Adventure|Animation|Children|Comedy|Drama|Fantasy,3.960429,4.0,13659.0,52 338 1093 664 46 871 1114 519 743 1024 216 4...,2000,4.0,1455998854,0.8
240,12526,142488,Spotlight (2015),Thriller,4.079351,4.0,5211.0,982 833 262 555 558 338 850 243 715 567 682 11...,2000,4.5,1455823423,0.9


In [72]:
user_profile = np.dot(df_movies_tf_idf_described[df_user_data_with_tags['index'].values].toarray().T, df_user_data_with_tags['weight'].values)

In [75]:
C = cosine_similarity(atleast_2d(user_profile), df_movies_tf_idf_described)

In [76]:
R = argsort(C)[:, ::-1]

In [78]:
recommendations = [i for i in R[0] if i not in df_user_data_with_tags['index'].values]

In [93]:
df_data_with_tags['title'][recommendations].head(10)

13157                                 Isle of Dogs (2018)
12858                         Over the Garden Wall (2013)
13081                            Blade Runner 2049 (2017)
13106                                         Coco (2017)
12395          George Carlin: You Are All Diseased (1999)
1136                                   Stand by Me (1986)
13100    Three Billboards Outside Ebbing, Missouri (2017)
820                                 Apartment, The (1960)
13040             The Godfather Trilogy: 1972-1990 (1992)
12689                    Hunt for the Wilderpeople (2016)
Name: title, dtype: object