In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import csr_matrix, save_npz

In [2]:
# Load the three source tables. For movies.csv and ratings.csv only the
# columns that later cells use are read, which keeps memory usage down.
ratings = pd.read_csv(
    'data/ratings.csv',
    usecols=["movieId", "userId", "rating"],
)
movies = pd.read_csv('data/movies.csv', usecols=["movieId"])
genome_scores = pd.read_csv('data/genome-scores.csv')

# Collaborative filtering
## Prepare user - movie - rating matrices

In [3]:
# Number of distinct movie ids in each table, in the order:
# movies df, ratings df, genome_scores df.
tmm, tmr, tmg = (
    frame["movieId"].nunique() for frame in (movies, ratings, genome_scores)
)
print("The total number of movies in movies, ratings and genome score dataframes, respectively :", tmm, tmr, tmg)

The total number of movies in movies, ratings and genome score dataframes, respectively : 27278 26744 10381


In [3]:
# Build a movie - user - rating data frame restricted to movies that are
# present in every source table.
# Ids of the movies that appear in genome_scores (index of the value counts).
tagged_movies = genome_scores['movieId'].value_counts().index
# Inner join on movieId keeps only movies found in both movies and ratings.
mov_rat = movies.merge(ratings, on="movieId")

In [15]:
# Keep only the ratings of movies that also appear in genome_scores.
final_df = mov_rat.loc[mov_rat["movieId"].isin(tagged_movies)]

In [16]:
# Reduce the dataframe by removing unpopular movies and inactive users, then
# build the user x movie rating matrix (dense, NaN-preserving and sparse forms).
# NOTE: this cell rebinds `final_df`; re-running it without first re-running
# the cell that creates `final_df` filters the already-filtered data again.

# Inclusive thresholds: anything with at most this many ratings is dropped.
RARE_MOVIE_MAX_RATINGS = 50
LAZY_USER_MAX_RATINGS = 1000

# Shrinking movies
# Filter on the value_counts Series itself instead of on a column of a
# DataFrame built from it: that column is labelled 'movieId' before pandas 2.0
# but 'count' from 2.0 on, so a lookup by 'movieId' raises KeyError there.
movie_counts = final_df['movieId'].value_counts()
gf = pd.DataFrame(movie_counts)  # name kept in case other cells still read it
rare_movies = movie_counts[movie_counts <= RARE_MOVIE_MAX_RATINGS].index
final_df = final_df[~final_df["movieId"].isin(rare_movies)]
print('Out of total of ', movie_counts.shape[0], ' movies, ', rare_movies.shape[0], ' are considered rare and will be removed.')
print('The final number of movies is ', final_df["movieId"].nunique())

# Shrinking users (ratings are counted after the rare movies were removed)
user_counts = final_df['userId'].value_counts()
udf = pd.DataFrame(user_counts)  # name kept in case other cells still read it
lazy_users = user_counts[user_counts <= LAZY_USER_MAX_RATINGS].index
final_df = final_df[~final_df["userId"].isin(lazy_users)]
print('Out of total of ', user_counts.shape[0], ' users, ', lazy_users.shape[0], ' are considered lazy and will be removed.')
print('The final number of users is ', final_df["userId"].nunique())

# Create the user -> movie rating matrix (rows: userId, columns: movieId).
# pivot_na keeps NaN where a user has not rated a movie; pivot is the same
# matrix with those gaps filled with zeros. fillna returns a new frame, so
# pivot_na is left untouched without needing an explicit copy.
pivot_na = final_df.pivot_table(index="userId", columns="movieId", values="rating")
pivot = pivot_na.fillna(0)
# header=True writes the movieId column labels. A plain string is not a valid
# header alias (pandas expects a bool or a list of str), so the previous
# header="userId" only acted as a truthy flag and produced the same file.
pivot.to_csv('matrices/pivot.csv', index=True, header=True)

# Estimate sparsity: share of cells in the zero-filled matrix that are zero
sparsity = 1.0 - (np.count_nonzero(pivot) / float(pivot.size))
print("The resulting sparcity of the matrix is:", sparsity)

# Store the zero-filled matrix in compressed sparse row (CSR) format
csr_data = csr_matrix(pivot.values)

Out of total of  10370  movies,  439  are considered rare and will be removed.
The final number of movies is  9931
Out of total of  138493  users,  136700  are considered lazy and will be removed.
The final number of users is  1793
The resulting sparcity of the matrix is: 0.8535309137791419


In [7]:
# Per-movie summary statistics of the ratings. pivot_na is used because it
# still holds NaN for missing ratings, so they are skipped instead of being
# counted as zeros.
pivot_na.describe()

movieId,1,2,3,4,5,6,7,9,10,11,...,102125,104841,106487,106489,106782,106916,106920,109374,111759,112852
count,4870.0,3695.0,1905.0,539.0,1702.0,3444.0,1843.0,574.0,3626.0,2663.0,...,605.0,750.0,554.0,508.0,545.0,486.0,499.0,563.0,491.0,549.0
mean,3.875154,2.930582,2.858268,2.564007,2.654818,3.792538,3.030657,2.432927,3.287645,3.425272,...,3.501653,3.878,3.557762,3.591535,3.693578,3.634774,3.910822,3.924512,3.792261,3.835155
std,0.85208,0.862414,0.95877,0.97938,0.939203,0.852636,0.929087,0.868458,0.843063,0.878649,...,0.743068,0.814248,0.797432,0.819503,0.839979,0.766465,0.78679,0.770383,0.698848,0.754344
min,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
25%,3.5,2.5,2.0,2.0,2.0,3.0,2.5,2.0,3.0,3.0,...,3.0,3.5,3.0,3.0,3.5,3.0,3.5,3.5,3.5,3.5
50%,4.0,3.0,3.0,3.0,3.0,4.0,3.0,2.5,3.5,3.5,...,3.5,4.0,3.5,3.5,4.0,3.5,4.0,4.0,4.0,4.0
75%,4.5,3.5,3.5,3.0,3.0,4.5,3.5,3.0,4.0,4.0,...,4.0,4.5,4.0,4.0,4.0,4.0,4.5,4.5,4.0,4.5
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [8]:
# Persist the filtered movie - user - rating rows; the row index is not written.
final_df.to_csv('matrices/final_ratings.csv', index=False)

In [8]:
# Preview the first rows of the filtered ratings.
final_df.head()

Unnamed: 0,movieId,userId,rating
18,1,54,4.0
19,1,58,5.0
27,1,91,4.0
37,1,116,3.0
45,1,134,4.0


In [14]:
# Save the NaN-preserving user x movie rating matrix.
# header=True writes the movieId column labels. A plain string is not a valid
# header alias (pandas expects a bool or a list of str), so the previous
# header="userId" only acted as a truthy flag and produced the same file.
pivot_na.to_csv('matrices/pivot_na.csv', index=True, header=True)

In [10]:
# Write the CSR rating matrix to disk in SciPy's .npz sparse format.
save_npz("matrices/sparse_ratings.npz", csr_data)