# Netflix

In [1]:
import os
import math
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from collections import deque

# Seed NumPy's global random generator once, before any data is loaded.
# NOTE(review): the later DataFrame.sample(...) calls and the Surprise
# split/fit pass no random_state of their own, so they presumably rely on
# this global seed for reproducibility -- confirm.
seed = 1234
np.random.seed(seed)

## Data Reading

In [2]:
# Load the first raw ratings file. Only the first two comma-separated fields
# are kept (named 'User' and 'Rating'); Rating is stored as float.
df1 = pd.read_csv(
    '../input/netflix-prize-data/combined_data_1.txt',
    header=None,
    names=['User', 'Rating'],
    usecols=[0, 1],
).astype({'Rating': float})
print('Dataset 1 shape: {}'.format(df1.shape))
# The working frame starts out as an alias of part 1.
df = df1

Dataset 1 shape: (24058263, 2)


In [3]:
def _read_ratings_part(part):
    """Read ``combined_data_<part>.txt`` as a two-column ratings frame.

    Parameters
    ----------
    part : int
        Number of the raw file to read (2, 3 or 4 in this notebook).

    Returns
    -------
    pandas.DataFrame
        Columns 'User' and 'Rating' (the first two comma-separated fields of
        each line), with 'Rating' cast to float.
    """
    path = '../input/netflix-prize-data/combined_data_{}.txt'.format(part)
    frame = pd.read_csv(path, header=None, names=['User', 'Rating'], usecols=[0, 1])
    frame['Rating'] = frame['Rating'].astype(float)
    return frame


# Parts 2-4 are read with the same settings as part 1 (already held in df1).
df2, df3, df4 = (_read_ratings_part(part) for part in (2, 3, 4))

# ignore_index=True renumbers the stacked rows 0..len(df)-1, so the row labels
# are unique across the four parts.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
print('Full dataset shape: {}'.format(df.shape))

Full dataset shape: (100498277, 2)


In [4]:
# Rows with a missing Rating are treated as movie headers: the 'User' field
# minus its last character is parsed as the integer movie id, and the row
# label marks where that movie's ratings start.
header_rows = df.loc[df['Rating'].isna(), 'User'].reset_index()
movie_indices = [[row_label, int(raw_id[:-1])] for row_label, raw_id in header_rows.values]

# Pair every header with the one that follows it; the last header is paired
# with the first one (list rotated left by one position).
rotated = movie_indices[1:] + movie_indices[:1]

# For each header, take the rows strictly between it and the next header
# (or through the end of the frame when the "next" header is not further
# down) and tag them with the movie id.
user_data = [
    df.loc[slice(start + 1, nxt - 1 if start < nxt else None)].assign(Movie=movie_id)
    for (start, movie_id), (nxt, _unused_id) in zip(movie_indices, rotated)
]

df = pd.concat(user_data)
del user_data, header_rows, rotated, movie_indices
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(100480507, 3)


Unnamed: 0,User,Rating,Movie
69976929,1064587,5.0,12675
61414863,1587376,3.0,11184
79617643,1301583,4.0,14382
40298114,2457206,5.0,7155
21571233,809832,3.0,4055


In [5]:
# 2000, 20
# NOTE(review): the note above does not match min_user_ratings = 200 below --
# confirm which user threshold is intended.

# Movies that have strictly more than `min_movie_ratings` ratings.
min_movie_ratings = 2000
movie_counts = df['Movie'].value_counts()
filter_movies = movie_counts[movie_counts > min_movie_ratings].index.tolist()

# Users that have strictly more than `min_user_ratings` ratings.
min_user_ratings = 200
user_counts = df['User'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index.tolist()

# Keep a rating only when both its movie and its user pass the thresholds.
keep_rows = df['Movie'].isin(filter_movies) & df['User'].isin(filter_users)
df_filtered = df[keep_rows]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
del movie_counts, user_counts, keep_rows
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

Shape User-Ratings unfiltered:	(100480507, 3)
Shape User-Ratings filtered:	(72868166, 3)


In [6]:
# Compare how many distinct movie ids exist before and after filtering.
print("Number of unique movies unfiltered:", df['Movie'].nunique())
print("Number of unique movies filtered:", df_filtered['Movie'].nunique())

Number of unique movies unfiltered: 17770
Number of unique movies filtered: 5264


In [7]:
# Reshape the filtered ratings into a matrix with one row per User and one
# column per Movie, holding the Rating values.
df_p = pd.pivot_table(df_filtered, index='User', columns='Movie', values='Rating')
print(df_p.shape)

(150245, 5264)


In [8]:
# Movie metadata: the first three fields of each line are read as
# Id, Year and Name, and the frame is indexed by Id.
movie_titles = pd.read_csv(
    '../input/netflix-prize-data/movie_titles.csv',
    header=None,
    names=['Id', 'Year', 'Name'],
    usecols=range(3),
    encoding='ISO-8859-1',
)
movie_titles = movie_titles.set_index('Id')
print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.sample(5)

Shape Movie-Titles:	(17770, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
11967,1992.0,Batman: The Animated Series: The Legend Begins
712,1998.0,Homicide: Life on the Street: Season 7
4473,2002.0,Kermit's Swamp Years
9280,2004.0,Teacher's Pet
6508,2002.0,The Salton Sea


In [9]:
# Select the title rows for the movies that survived filtering (the columns of
# df_p). The lookup is by Id label rather than by position, so it does not
# depend on movie k being stored at row k-1 of movie_titles. A plain list is
# passed so the result keeps movie_titles' own index name ('Id') in the
# printout and in the CSV header.
filtered_movies = movie_titles.loc[df_p.columns.tolist()]  # ['Name']
print(filtered_movies)
filtered_movies.to_csv('movies.csv')

         Year                        Name
Id                                       
3      1997.0                   Character
8      2004.0  What the #$*! Do We Know!?
16     1996.0                   Screamers
17     2005.0                   7 Seconds
18     1994.0            Immortal Beloved
...       ...                         ...
17758  1979.0                    Prophecy
17761  2003.0                      Levity
17762  1997.0                     Gattaca
17764  1998.0         Shakespeare in Love
17769  2003.0                 The Company

[5264 rows x 2 columns]


## Pearson R Correlation

In [10]:
def recommend(movie_title, top_n=10, ratings=None, titles=None):
    """Print the movies whose ratings correlate most with ``movie_title``.

    The title is resolved to a movie id through the 'Name' column of the
    title table (first match wins), that movie's column of the rating matrix
    is correlated against every column with ``DataFrame.corrwith``, and the
    ``top_n`` highest correlations are printed together with their titles.
    The queried movie itself is part of the ranking.

    Parameters
    ----------
    movie_title : str
        Title to look up; it must equal a 'Name' entry exactly.
    top_n : int, optional
        Number of rows to print (default 10).
    ratings : pandas.DataFrame, optional
        Rating matrix with one column per movie id. Defaults to the
        notebook-level ``df_p``.
    titles : pandas.DataFrame, optional
        Title table indexed by movie id with a 'Name' column. Defaults to the
        notebook-level ``movie_titles``.

    Returns
    -------
    None
        The result is printed, not returned.

    Raises
    ------
    IndexError
        If no row of the title table has this name.
    KeyError
        If the movie exists in the title table but has no column in the
        rating matrix.
    """
    ratings = df_p if ratings is None else ratings
    titles = movie_titles if titles is None else titles

    # Resolve the title before printing anything, so a failed lookup does not
    # leave a dangling header in the output.
    matches = titles.index[titles['Name'] == movie_title]
    if len(matches) == 0:
        raise IndexError(
            "No movie titled {!r} in the title table".format(movie_title))
    movie_id = int(matches[0])
    if movie_id not in ratings.columns:
        raise KeyError(
            "Movie {!r} (id {}) has no column in the rating matrix".format(
                movie_title, movie_id))

    print("For movie {}".format(movie_title))
    print("Top {} movies recommended: ".format(top_n))

    corr_target = (
        ratings.corrwith(ratings[movie_id])
        .to_frame('PearsonR')
        .dropna()  # drop movies for which no correlation could be computed
        .sort_values('PearsonR', ascending=False)
    )
    # Make the ids plain ints so they line up with the title table's index.
    corr_target.index = corr_target.index.map(int)
    corr_target = corr_target.join(titles)[['PearsonR', 'Name']]
    print(corr_target[:top_n].to_string(index=False))

# Example query; the string is compared for equality against the 'Name'
# column of movie_titles inside recommend().
recommend("Lilo and Stitch")

For movie Lilo and Stitch
Top 10 movies recommended: 
 PearsonR                                  Name
 1.000000                       Lilo and Stitch
 0.479502                     Stitch! The Movie
 0.458240                     Lilo and Stitch 2
 0.427557           Rugrats in Paris: The Movie
 0.395200                       Rugrats Go Wild
 0.387868                      Oliver & Company
 0.387045                              Hercules
 0.383723         Scooby-Doo's Creepiest Capers
 0.381166 Scooby-Doo and the Reluctant Werewolf
 0.379675                     The Rugrats Movie


In [11]:
# Second example query, using the same exact-title lookup.
recommend("Superman: The Movie")

For movie Superman: The Movie
Top 10 movies recommended: 
 PearsonR                             Name
 1.000000              Superman: The Movie
 0.639680                      Superman II
 0.476632                     Superman III
 0.405939           The Karate Kid Part II
 0.403611 Superman IV: The Quest for Peace
 0.391602                            Benji
 0.385704       Herbie Goes to Monte Carlo
 0.385315        The Batman Superman Movie
 0.383454             The Towering Inferno
 0.381919                           Batman


In [12]:
# predictions = {}
# for i in df_p.columns:
#     target = df_p[i]
#     similar_to_target = df_p.corrwith(target)
#     corr_target = pd.DataFrame(similar_to_target, columns = ['PearsonR'])
#     corr_target.dropna(inplace = True)
#     corr_target = corr_target.sort_values('PearsonR', ascending = False)
#     corr_target.index = corr_target.index.map(int)
#     predictions[i] = corr_target.index.values.tolist()[:50]
#     if i%10==0:
#         print(i)

In [13]:
# predictions = {k: v['Name'].tolist()[:10] for k,v in predictions.items()}
# print(predictions[8]['Name'].tolist()[:10])

In [14]:
# p = pd.DataFrame.from_dict(predictions, orient='index')
# p.to_csv('predictions.csv')
# df_p.to_csv('df.csv')

## Matrix Factorization

In [None]:
# import surprise as sp
# from surprise.model_selection import cross_validate
# data = sp.Dataset.load_from_df(df_filtered[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

In [15]:
import surprise as sp
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

In [17]:
# Fit an SVD model on a 20,000-row sample of the filtered ratings and score it
# on a 20% hold-out split. Every stochastic step receives the notebook seed
# explicitly, so re-running this cell on its own gives the same numbers.
ratings_sample = df_filtered[['User', 'Movie', 'Rating']].sample(20000, random_state=seed)
dataset = sp.Dataset.load_from_df(ratings_sample, sp.Reader())
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=seed)
model = SVD(random_state=seed)
model.fit(trainset)
predictions = model.test(testset)
# rmse() prints the score itself unless verbose=False; it is printed once,
# at full precision, on the next line.
rmse_score = rmse(predictions, verbose=False)
print("RMSE:", rmse_score)

RMSE: 1.0486
RMSE: 1.048616410569879


In [20]:
# Estimated rating from the fitted model for user id 3 and item (movie) id 290.
print(model.predict(3, 290))

user: 3          item: 290        r_ui = None   est = 3.71   {'was_impossible': False}
