In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.sparse import csr_matrix, save_npz

In [2]:
# Load the three source tables. Only the columns listed in `usecols` are read
# from movies.csv and ratings.csv; genome-scores.csv is read in full.
genome_scores = pd.read_csv("data/genome-scores.csv")
movies = pd.read_csv("data/movies.csv", usecols=["movieId"])
ratings = pd.read_csv(
    "data/ratings.csv",
    usecols=["movieId", "userId", "rating"],
)

# Collaborative filtering
## Prepare user - movie - rating matrices

In [3]:
# Count movies in the movies df
tmm = movies["movieId"].nunique()
# Count movies in the matings df
tmr = ratings["movieId"].nunique()
# Count movies in the genome_score df
tmg = genome_scores["movieId"].nunique()
print("The total number of movies in movies, ratings and genome score dataframes, respectively :", tmm, tmr, tmg)

The total number of movies in movies, ratings and genome score dataframes, respectively : 27278 26744 10381


In [3]:
# Build the movie - user - rating frame.
# `tagged_movies` holds every movieId that occurs in genome_scores; it is used
# afterwards to keep only movies present in all the dataframes.
tagged_movies = genome_scores["movieId"].value_counts().index
# Default (inner) merge: only movieIds found in both movies and ratings survive.
mov_rat = movies.merge(ratings, on="movieId")

In [10]:
# Keep only the rating rows whose movieId also occurs in genome_scores.
final_df = mov_rat.loc[mov_rat["movieId"].isin(tagged_movies)]

In [11]:
# Reduce the ratings frame by removing unpopular movies and inactive users,
# then build the user x movie rating matrix in dense (NaN and zero-filled) and
# sparse (CSR) forms.
# NOTE: this cell rebinds `final_df`; re-running it without first re-running
# the cell that creates `final_df` filters the already-filtered frame again.

# Movies / users with at most this many ratings are dropped.
RARE_MOVIE_MAX_RATINGS = 500
LAZY_USER_MAX_RATINGS = 500

# Shrinking movies
# Filter on the counts Series itself: the column produced by
# pd.DataFrame(value_counts()) is named 'movieId' before pandas 2.0 but
# 'count' from 2.0 on, so looking it up by name is not portable.
movie_counts = final_df['movieId'].value_counts()
gf = pd.DataFrame(movie_counts)
rare_movies = movie_counts[movie_counts <= RARE_MOVIE_MAX_RATINGS].index
final_df = final_df[~final_df["movieId"].isin(rare_movies)]
print('Out of total of ', gf.shape[0] , ' movies, ', rare_movies.shape[0], ' are considered rare and will be removed.')
print('The final number of movies is ', final_df["movieId"].nunique())

# Shrinking users (counted after the rare movies have been removed)
user_counts = final_df['userId'].value_counts()
udf = pd.DataFrame(user_counts)
lazy_users = user_counts[user_counts <= LAZY_USER_MAX_RATINGS].index
final_df = final_df[~final_df["userId"].isin(lazy_users)]
print('Out of total of ', udf.shape[0] , ' users, ', lazy_users.shape[0], ' are considered lazy and will be removed.')
print('The final number of users is ', final_df["userId"].nunique())

# Create the user -> movie rating matrix. pivot_table leaves NaN wherever a
# user has not rated a movie; `pivot_na` keeps those NaN values.
pivot_na = final_df.pivot_table(index="userId", columns="movieId", values="rating")
# Zero-filled version. The fill is required: NaN counts as non-zero in
# np.count_nonzero and would be stored as an explicit entry by csr_matrix,
# which would make the sparsity 0 and the "sparse" matrix fully populated.
pivot = pivot_na.fillna(0)

# Estimate sparsity: the share of matrix cells that hold no rating.
sparsity = 1.0 - ( np.count_nonzero(pivot.values) / float(pivot.size) )
print("The resulting sparcity of the matrix is:", sparsity)

# Create the sparse (CSR) representation of the zero-filled matrix
csr_data = csr_matrix(pivot.values)

# Save the pivot matrix with NaN for further use.
# header=True writes the movieId column labels; the previous string value was
# only ever interpreted as a truthy flag, so the output is unchanged.
pivot_na.to_csv('matrices/pivot_na.csv', index=True, header=True)

Out of total of  10370  movies,  5894  are considered rare and will be removed.
The final number of movies is  4476
Out of total of  138493  users,  131909  are considered lazy and will be removed.
The final number of users is  6584
The resulting sparcity of the matrix is: 0.8186343433372749


In [7]:
#final_df.to_csv('matrices/final_ratings.csv', index=False)

In [None]:
#save_npz("matrices/sparse_ratings.npz", csr_data)