In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from pprint import pprint

In [2]:
## Reading the CSVs
movie_titles_df = pd.read_csv('Title_basics.tsv', dtype={"isAdult": "string"}, sep='\t')
movie_ratings_df = pd.read_csv('Title_ratings.tsv', sep='\t')
user_ratings_df = pd.DataFrame(np.load('User_ratings.npy'))

## convert loaded np array to pd dataframe
user_ratings_df[['userID', 'titleID', 'rating', 'date']] = user_ratings_df[0].str.split(',', expand=True)
user_ratings_df = user_ratings_df.drop(0, axis=1)

In [3]:
movie_titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9689096,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9689097,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9689098,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9689099,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [4]:
movie_titles_df = movie_titles_df.drop(movie_titles_df[movie_titles_df.titleType.isin(["tvEpisode", "videoGame", "video", "tvSpecial", "tvShort", "tv", "tvSeries", "tvMiniSeries", "tvPilot", "short"])].index)

In [5]:
movie_titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,100,"Documentary,News,Sport"
498,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
...,...,...,...,...,...,...,...,...,...
9689019,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,\N,100,Documentary
9689024,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0,2015,\N,66,Drama
9689031,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy
9689041,tt9916730,movie,6 Gunn,6 Gunn,0,2017,\N,116,\N


In [6]:
movie_titles_df = movie_titles_df.drop(['titleType', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres'], axis=1)

In [7]:
movie_titles_df

Unnamed: 0,tconst,primaryTitle
8,tt0000009,Miss Jerry
144,tt0000147,The Corbett-Fitzsimmons Fight
498,tt0000502,Bohemios
570,tt0000574,The Story of the Kelly Gang
587,tt0000591,The Prodigal Son
...,...,...
9689019,tt9916680,De la ilusión al desconcierto: cine colombiano...
9689024,tt9916692,Teatroteka: Czlowiek bez twarzy
9689031,tt9916706,Dankyavar Danka
9689041,tt9916730,6 Gunn


In [8]:
user_ratings_df

Unnamed: 0,userID,titleID,rating,date
0,ur4592644,tt0120884,10,16 January 2005
1,ur3174947,tt0118688,3,16 January 2005
2,ur3780035,tt0387887,8,16 January 2005
3,ur4592628,tt0346491,1,16 January 2005
4,ur3174947,tt0094721,8,16 January 2005
...,...,...,...,...
4669815,ur0581842,tt0107977,6,16 January 2005
4669816,ur3174947,tt0103776,8,16 January 2005
4669817,ur4592639,tt0107423,9,16 January 2005
4669818,ur4581944,tt0102614,8,16 January 2005


In [9]:
user_ratings_df = user_ratings_df.drop('date', axis=1)

In [10]:
user_ratings_df

Unnamed: 0,userID,titleID,rating
0,ur4592644,tt0120884,10
1,ur3174947,tt0118688,3
2,ur3780035,tt0387887,8
3,ur4592628,tt0346491,1
4,ur3174947,tt0094721,8
...,...,...,...
4669815,ur0581842,tt0107977,6
4669816,ur3174947,tt0103776,8
4669817,ur4592639,tt0107423,9
4669818,ur4581944,tt0102614,8


In [11]:
movie_titles_df.rename(columns={'tconst': 'titleID'}, inplace=True)

In [12]:
movie_titles_df

Unnamed: 0,titleID,primaryTitle
8,tt0000009,Miss Jerry
144,tt0000147,The Corbett-Fitzsimmons Fight
498,tt0000502,Bohemios
570,tt0000574,The Story of the Kelly Gang
587,tt0000591,The Prodigal Son
...,...,...
9689019,tt9916680,De la ilusión al desconcierto: cine colombiano...
9689024,tt9916692,Teatroteka: Czlowiek bez twarzy
9689031,tt9916706,Dankyavar Danka
9689041,tt9916730,6 Gunn


In [13]:
combined_df = pd.merge(movie_titles_df, user_ratings_df)

In [14]:
combined_df

Unnamed: 0,titleID,primaryTitle,userID,rating
0,tt0000147,The Corbett-Fitzsimmons Fight,ur2483625,8
1,tt0000147,The Corbett-Fitzsimmons Fight,ur1234929,8
2,tt0000574,The Story of the Kelly Gang,ur1609079,10
3,tt0000574,The Story of the Kelly Gang,ur13283282,10
4,tt0000574,The Story of the Kelly Gang,ur10334028,9
...,...,...,...,...
3559107,tt9916270,Il talento del calabrone,ur0430892,5
3559108,tt9916270,Il talento del calabrone,ur126474842,2
3559109,tt9916270,Il talento del calabrone,ur24536688,6
3559110,tt9916362,Coven,ur66663169,10


In [15]:
combined_df.userID.isnull().values.any()

False

In [16]:
combined_df['titleID'].nunique()

158968

In [17]:
movie_titles_df['titleID'].nunique()

779228

In [18]:
user_ratings_df['titleID'].nunique()

351109

In [19]:
refined_dataset = combined_df.groupby(by=['titleID','userID','primaryTitle'], as_index=False).agg({"rating":"mean"})

In [20]:
refined_dataset

Unnamed: 0,titleID,userID,primaryTitle,rating
0,tt0000147,ur1234929,The Corbett-Fitzsimmons Fight,8.0
1,tt0000147,ur2483625,The Corbett-Fitzsimmons Fight,8.0
2,tt0000574,ur0258737,The Story of the Kelly Gang,10.0
3,tt0000574,ur10334028,The Story of the Kelly Gang,9.0
4,tt0000574,ur13283282,The Story of the Kelly Gang,10.0
...,...,...,...,...
3545019,tt9916270,ur126474842,Il talento del calabrone,2.0
3545020,tt9916270,ur24536688,Il talento del calabrone,6.0
3545021,tt9916270,ur27958996,Il talento del calabrone,3.0
3545022,tt9916362,ur66663169,Coven,10.0


In [21]:
movieUser_df = refined_dataset.pivot_table(
    index='userID',
     columns='primaryTitle',
      ## Replacing all movies users haven't rated with a rating of 0
      values='rating').fillna(0)

  num_cells = num_rows * num_columns


IndexError: index 875914235 is out of bounds for axis 0 with size 875909652