In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

from pprint import pprint

In [2]:
## Load the inputs: title metadata from a CSV, raw user ratings from a NumPy array file
movie_titles_df = pd.read_csv('title_merge_ratings_500000.csv')
user_ratings_df = pd.DataFrame(np.load('User_ratings.npy'))

## Column 0 holds one comma-separated record per row. Split it into named
## columns, then discard the packed original. All four new columns are strings.
RATING_FIELDS = ['userID', 'titleID', 'rating', 'date']
user_ratings_df[RATING_FIELDS] = user_ratings_df[0].str.split(pat=',', expand=True)
user_ratings_df = user_ratings_df.drop(columns=[0])

In [3]:
## Keep only the join key and the display title. Selecting the wanted columns
## (rather than dropping every unwanted one by name) makes this cell safe to
## re-run and independent of which extra columns the CSV happens to carry.
## rename() ignores a missing 'tconst', so a second run is a no-op.
movie_titles_df = (
    movie_titles_df
    .rename(columns={'tconst': 'titleID'})
    .loc[:, ['titleID', 'primaryTitle']]
)

## Drop the review date by keeping the three remaining rating fields.
user_ratings_df = user_ratings_df.loc[:, ['userID', 'titleID', 'rating']]

In [4]:
## Show the title table; after the column clean-up it carries only titleID and primaryTitle.
movie_titles_df

Unnamed: 0,titleID,primaryTitle
0,tt0034583,Casablanca
1,tt0050083,12 Angry Men
2,tt0054215,Psycho
3,tt0060196,"The Good, the Bad and the Ugly"
4,tt0062622,2001: A Space Odyssey
...,...,...
288,tt6966692,Green Book
289,tt7131622,Once Upon a Time in Hollywood
290,tt7286456,Joker
291,tt8579674,1917


In [5]:
## Show the ratings table (userID, titleID, rating) now that the date column is gone.
user_ratings_df

Unnamed: 0,userID,titleID,rating
0,ur4592644,tt0120884,10
1,ur3174947,tt0118688,3
2,ur3780035,tt0387887,8
3,ur4592628,tt0346491,1
4,ur3174947,tt0094721,8
...,...,...,...
4669815,ur0581842,tt0107977,6
4669816,ur3174947,tt0103776,8
4669817,ur4592639,tt0107423,9
4669818,ur4581944,tt0102614,8


In [6]:
## Keep only users with at least MIN_RATINGS_PER_USER ratings (the same cut-off
## as `len(x) > 14`). A per-row group size from transform() is vectorised, whereas
## groupby().filter(lambda ...) invokes a Python callback once per user.
MIN_RATINGS_PER_USER = 15
ratings_per_user = user_ratings_df.groupby('userID')['userID'].transform('size')
user_ratings_df_trimmed = user_ratings_df[ratings_per_user >= MIN_RATINGS_PER_USER]

In [7]:
## How many distinct users remain after the minimum-ratings filter?
user_ratings_df_trimmed['userID'].nunique()

29521

In [8]:
## Show the ratings kept by the per-user filter; the original row index is preserved.
user_ratings_df_trimmed

Unnamed: 0,userID,titleID,rating
1,ur3174947,tt0118688,3
4,ur3174947,tt0094721,8
5,ur1162550,tt0114891,9
8,ur3174947,tt0120601,8
10,ur2035667,tt0335245,10
...,...,...,...
4669813,ur0581842,tt0068424,4
4669814,ur3174947,tt0096895,8
4669815,ur0581842,tt0107977,6
4669816,ur3174947,tt0103776,8


In [9]:
## Attach the title to every rating. The join key and join type are spelled out
## so that a future shared column name cannot silently become part of the key.
## With an inner join, titles that have no rating from the retained users, and
## ratings for titles outside the title table, drop out of the result.
combined_df = pd.merge(
    movie_titles_df,
    user_ratings_df_trimmed,
    on='titleID',
    how='inner',
)

In [10]:
## Show the merged table: one row per rating, with the movie title attached.
combined_df

Unnamed: 0,titleID,primaryTitle,userID,rating
0,tt0034583,Casablanca,ur2375356,10
1,tt0034583,Casablanca,ur3212364,9
2,tt0034583,Casablanca,ur2615539,10
3,tt0034583,Casablanca,ur4862994,10
4,tt0034583,Casablanca,ur1293485,10
...,...,...,...,...
155232,tt8946378,Knives Out,ur110879431,8
155233,tt8946378,Knives Out,ur23827727,1
155234,tt8946378,Knives Out,ur109019708,9
155235,tt8946378,Knives Out,ur94666240,10


In [11]:
## Distinct titles that survive the merge (titles with at least one rating from a retained user).
combined_df['titleID'].nunique()

287

In [12]:
## Distinct titles in the title table, for comparison with the merged count.
movie_titles_df['titleID'].nunique()

293

In [13]:
## Distinct titles in the full, unfiltered ratings data.
user_ratings_df['titleID'].nunique()

351109

In [14]:
## Collapse repeated (title, user) pairs into a single row holding the mean rating.
## The ratings were produced by str.split and are still text, so they are converted
## to numbers first: averaging strings raises on current pandas and, on older
## versions, concatenates the digits before dividing (e.g. "8","8" -> 88 / 2).
## to_numeric raises on a malformed rating rather than hiding it.
refined_dataset = (
    combined_df
    .assign(rating=pd.to_numeric(combined_df['rating']))
    .groupby(by=['titleID', 'userID', 'primaryTitle'], as_index=False)
    .agg({"rating": "mean"})
)

In [15]:
## Show the aggregated dataset: one row per (titleID, userID, primaryTitle) with its mean rating.
refined_dataset

Unnamed: 0,titleID,userID,primaryTitle,rating
0,tt0034583,ur0011104,Casablanca,1.0
1,tt0034583,ur0011762,Casablanca,10.0
2,tt0034583,ur0012295,Casablanca,7.0
3,tt0034583,ur0013691,Casablanca,10.0
4,tt0034583,ur0016344,Casablanca,10.0
...,...,...,...,...
154285,tt8946378,ur9927546,Knives Out,8.0
154286,tt8946378,ur99622833,Knives Out,9.0
154287,tt8946378,ur99708365,Knives Out,7.0
154288,tt8946378,ur99877336,Knives Out,8.0
