In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the movie catalogue from movies.csv (path is relative to the working directory).
movies = pd.read_csv('movies.csv')
# Preview the first rows to check which columns were read.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Combine each movie's title and genre string into one space-separated text
# field for the TF-IDF vectoriser. Missing values are replaced with empty
# strings first so a NaN in either column cannot make the combined text NaN.
# Series.str.cat works on the frame's own columns, so the result keeps the
# index of `movies` and lines up row-for-row even when that index is not a
# default 0..n-1 range.
movies['genre_title'] = (
    movies['title'].fillna('')
    .str.cat(movies['genres'].fillna(''), sep=' ')
)

In [4]:
# Display the combined title/genre text column as a sanity check.
movies['genre_title']

0        Toy Story (1995) Adventure|Animation|Children|...
1                Jumanji (1995) Adventure|Children|Fantasy
2                   Grumpier Old Men (1995) Comedy|Romance
3            Waiting to Exhale (1995) Comedy|Drama|Romance
4                Father of the Bride Part II (1995) Comedy
                               ...                        
62418                                      We (2018) Drama
62419                Window of the Soul (2001) Documentary
62420                        Bad Poems (2018) Comedy|Drama
62421               A Girl Thing (2001) (no genres listed)
62422    Women of Devil's Island (1962) Action|Adventur...
Name: genre_title, Length: 62423, dtype: object

In [5]:
# Word-level TF-IDF over unigrams and bigrams of the combined title/genre text.
# stop_words='english' removes scikit-learn's built-in English stop-word list;
# min_df=0.0 sets the minimum document-frequency proportion to zero, so no
# term is dropped for being rare.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0.0,
    stop_words='english',
)

# Learn the vocabulary and vectorise every movie's text in one pass.
tf_authTags_matrix = tf.fit_transform(movies['genre_title'])

In [6]:
# Pairwise cosine similarity between every pair of movie vectors. Passing the
# matrix on its own compares it against itself, so the result is square.
# NOTE(review): movies['genre_title'] is shown with 62423 rows; a dense float64
# result of that size is roughly 62423**2 * 8 bytes, i.e. about 31 GB. Confirm
# the kernel has that much memory before running this cell.
cosine_sim_authTags = cosine_similarity(tf_authTags_matrix)
print(cosine_sim_authTags.shape)

: 

In [3]:
# Load the tag-genome relevance scores from genome_scores.csv.
genome_scores = pd.read_csv('genome_scores.csv')
# Report the (rows, columns) shape, then preview the first rows.
print(genome_scores.shape)
genome_scores.head()

(15584448, 3)


Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [12]:
# Attach the tag text to every (movieId, tagId, relevance) row.
# genome_tags is loaded in this cell because the merge needs it and, reading
# this notebook section top to bottom, no earlier cell defines it; without the
# load a fresh "Restart & Run All" stops here with a NameError.
genome_tags = pd.read_csv('genome_tags.csv')
genome_combined = pd.merge(genome_scores, genome_tags, on='tagId', how='inner')
# Show the rows from least to most relevant (display only; the sorted frame
# is not stored).
genome_combined.sort_values(by='relevance', ascending=True)

Unnamed: 0,movieId,tagId,relevance,tag
5839607,5789,1080,0.00025,vietnam war
14332586,147940,219,0.00025,claymation
6835631,6923,1080,0.00025,vietnam war
14053625,137062,1002,0.00025,swedish
11407682,74131,219,0.00050,claymation
...,...,...,...,...
13670349,125916,118,1.00000,bdsm
7344409,7569,2,1.00000,007 (series)
4033729,3984,2,1.00000,007 (series)
1998874,2010,59,1.00000,android(s)/cyborg(s)


In [10]:
# Mean relevance for each (movieId, tagId) pair, ordered from the most to the
# least relevant pair. The result is indexed by (movieId, tagId).
grouped = (
    genome_combined[['movieId', 'tagId', 'relevance']]
    .groupby(['movieId', 'tagId'])
    .mean()
    .sort_values(by='relevance', ascending=False)
)
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,relevance
movieId,tagId,Unnamed: 2_level_1
3593,891,1.00000
4734,1081,1.00000
2762,436,1.00000
1610,1031,1.00000
1639,1081,1.00000
...,...,...
25812,119,0.00050
5789,1080,0.00025
6923,1080,0.00025
137062,1002,0.00025


In [4]:
# Load the tagId -> tag text lookup table from genome_tags.csv.
genome_tags = pd.read_csv('genome_tags.csv')
# Report the (rows, columns) shape, then preview the first rows.
print(genome_tags.shape)
genome_tags.head()

(1128, 2)


Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [13]:
# Load the per-movie IMDb metadata from imdb_data.csv.
imdb_data = pd.read_csv('imdb_data.csv')
# Display the frame; pandas shows a truncated head/tail view for large frames.
imdb_data

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game
2,3,Walter Matthau|Jack Lemmon|Sophia Loren|Ann-Ma...,Mark Steven Johnson,101.0,"$25,000,000",boat|lake|neighbor|rivalry
3,4,Whitney Houston|Angela Bassett|Loretta Devine|...,Terry McMillan,124.0,"$16,000,000",black american|husband wife relationship|betra...
4,5,Steve Martin|Diane Keaton|Martin Short|Kimberl...,Albert Hackett,106.0,"$30,000,000",fatherhood|doberman|dog|mansion
...,...,...,...,...,...,...
27273,131254,Franz Dinda|Florian Lukas|Axel Stein|Kailas Ma...,Carsten Funke,85.0,,man wrapped in a towel|man wears a thong|male ...
27274,131256,Rick Kavanian|Axel Stein|Eva Habermann|Christo...,Matthias Dinter,83.0,"DEM5,800,000",ski|ski resort|ampersand in title|drink in title
27275,131258,Nam-gil Kim|Ye-jin Son|Hae-Jin Yoo|Kyeong-yeon...,Seong-il Cheon,130.0,,pirate|sword fight|korea|bandit
27276,131260,Martti Suosalo|Ilkka Koivula|Vexi Salmi|Riitta...,Timo Koivusalo,102.0,,friend|friendship|television show|restaurant


In [14]:
# Load the movieId -> external id mapping from links.csv.
links = pd.read_csv('links.csv')
# Preview the first rows to check which columns were read.
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [18]:
# Load the test set from test.csv.
test = pd.read_csv('test.csv')
# Show its (rows, columns) shape.
test.shape

(5000019, 2)

In [21]:
# Load the training ratings from train.csv.
train = pd.read_csv('train.csv')
# Report the (rows, columns) shape, then preview the first rows.
print(train.shape)
train.head()

(10000038, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739
2,146790,5459,5.0,1076215539
3,106362,32296,2.0,1423042565
4,9041,366,3.0,833375837


In [29]:
# Count the missing values in each column of the IMDb metadata.
imdb_data.isna().sum()

movieId              0
title_cast       10068
director          9874
runtime          12089
budget           19372
plot_keywords    11078
dtype: int64