In [2]:
import pandas as pd


In [2]:
# Read the four raw rating files, keeping only their first two columns
# (named Cust_Id and Rating here). The files have no header row.
data_all = []
for i in range(4):
    path = '/home/aprosvetov/netflix/combined_data_' + str(i+1)+'.txt'
    data = pd.read_csv(
        path,
        header=None,
        names=['Cust_Id', 'Rating'],
        usecols=[0, 1],
    )
    data_all.append(data)

In [3]:
# Stack the per-file frames into one frame; each part keeps its own original
# index labels at this point.
df = pd.concat(data_all)

In [4]:
import numpy as np
# Replace the per-file index labels with a single running 0..len(df)-1 index.
df.index = np.arange(len(df))

print('Full dataset shape: {}'.format(df.shape))
print('-Dataset examples-')
# Show every 5,000,000th row as a quick sanity check.
print(df.iloc[::5000000])

Full dataset shape: (100498277, 2)
-Dataset examples-
           Cust_Id  Rating
0               1:     NaN
5000000    2560324     4.0
10000000   2271935     2.0
15000000   1921803     2.0
20000000   1933327     3.0
25000000   1465002     3.0
30000000    961023     4.0
35000000   1372532     5.0
40000000    854274     5.0
45000000    116334     3.0
50000000    768483     3.0
55000000   1331144     5.0
60000000   1609324     2.0
65000000   1699240     3.0
70000000   1776418     4.0
75000000   1643826     5.0
80000000    932047     4.0
85000000   2292868     4.0
90000000    932191     4.0
95000000   1815101     3.0
100000000   872339     4.0


In [None]:
import numpy as np
# Rows with a missing Rating are treated as per-movie header rows (the "1:"
# style line visible in the dataset preview); every row up to the next header
# is a rating that belongs to that movie.
df_nan = pd.DataFrame(pd.isnull(df.Rating))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

# Positions of the header rows. This relies on df carrying a 0..len(df)-1
# index, so that index labels double as row positions.
header_pos = df_nan['index'].values

# Number of rating rows between each header and the next one; the last movie
# runs to the end of the frame.
next_header_pos = np.append(header_pos[1:], len(df))
ratings_per_movie = next_header_pos - header_pos - 1

# Movie ids are assigned by order of appearance, starting at 1 (the header
# text itself is not parsed).
# The whole array is built with a single np.repeat call: growing it with
# np.append inside a per-movie loop copies the accumulated array on every
# iteration, which is quadratic in the number of rows.
# The result is an integer array with one entry per rating row.
movie_np = np.repeat(np.arange(1, len(header_pos) + 1), ratings_per_movie)

In [None]:
# Drop the movie-header rows (missing Rating). .copy() makes the result an
# independent frame, so the column assignments below operate on it directly
# instead of on a slice of the old frame (avoids SettingWithCopyWarning).
df = df[pd.notnull(df['Rating'])].copy()

# movie_np is expected to hold one movie id per remaining rating row, in row
# order; pandas raises if the lengths differ.
df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)
print('-Dataset examples-')
print(df.sample(3))

In [None]:
# Cache the prepared ratings as a ';'-separated file; index = None is falsy,
# so the row index is not written.
df.to_csv('/srv/aprosvetov/netflix/data_prep.csv', sep=';', index = None)

In [3]:
import pandas as pd
# Reload the ';'-separated ratings file written by the preparation step, so
# the raw-file parsing does not have to be repeated.
df = pd.read_csv('/srv/aprosvetov/netflix/data_prep.csv', sep=';')

In [10]:
# Number of distinct movies in the prepared ratings.
df['Movie_Id'].nunique()

17770

In [56]:
# Number of distinct customers in the prepared ratings.
df['Cust_Id'].nunique()

480189

In [4]:
# Draw customer ids from 10000 randomly chosen rating rows. The seed is fixed
# (value chosen arbitrarily) so that re-running the notebook selects the same
# customers.
# NOTE(review): this samples rating rows, not distinct customers, so ids can
# repeat and heavy raters are more likely to be picked — confirm that is
# intended.
cust_sample = df.Cust_Id.sample(10000, random_state=0)

In [11]:
# Draw movie ids from 5000 randomly chosen rating rows. The seed is fixed
# (value chosen arbitrarily) so that re-running the notebook selects the same
# movies; keep it different from any seed used to sample other columns of df,
# otherwise the same rows would be drawn for both samples.
# NOTE(review): this samples rating rows, not distinct movies, so ids can
# repeat and frequently rated movies are more likely to be picked.
movie_sample = df.Movie_Id.sample(5000, random_state=1)

In [3]:
import surprise

In [7]:
from surprise import Reader, Dataset

In [12]:
# Keep only ratings whose customer AND movie both appear in the sampled ids.
in_cust_sample = df['Cust_Id'].isin(cust_sample)
in_movie_sample = df['Movie_Id'].isin(movie_sample)
ratings_subset = df.loc[in_cust_sample & in_movie_sample,
                        ['Cust_Id', 'Movie_Id', 'Rating']]

# NOTE(review): the rating previews in this notebook only show values between
# 2.0 and 5.0 — confirm that a lower bound of 0.5 matches the data.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_subset, reader)

In [13]:
from surprise import KNNBasic

# Item-based (user_based=False) kNN with cosine similarity.
sim_options = dict(name='cosine', user_based=False)

# Train on every rating in `data`; nothing is held out.
trainingSet = data.build_full_trainset()
knn = KNNBasic(sim_options=sim_options)

In [14]:
# Fit the kNN model on the full training set; per the cell output this
# computes the cosine similarity matrix.
knn.fit(trainingSet)



Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fcf281fe160>

In [None]:
# Build the "anti" test set from the training set and score it with the
# fitted model.
# NOTE(review): presumably this enumerates every (user, item) pair that is
# NOT rated in trainingSet, which can be very large for this sample size —
# confirm against the Surprise documentation.
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)

In [16]:
# Movie lookup table (id, year, title); the file has no header row, so the
# column names are supplied here.
titles = pd.read_csv(
    'movie_titles.csv',
    header=None,
    names=['Movie_Id', 'Year', 'Name'],
    encoding="ISO-8859-1",
)

In [18]:
# Peek at the first four predictions.
predictions[:4]

[Prediction(uid=2503129, iid=44, r_ui=3.5881227434677645, est=4.474997108906492, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=2503129, iid=47, r_ui=3.5881227434677645, est=3.9245326082785117, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=2503129, iid=76, r_ui=3.5881227434677645, est=4.424598056472339, details={'actual_k': 40, 'was_impossible': False}),
 Prediction(uid=2503129, iid=83, r_ui=3.5881227434677645, est=4.374795599566862, details={'actual_k': 40, 'was_impossible': False})]

In [19]:
from collections import defaultdict
 
def get_top3_recommendations(predictions, topN = 3):
    """Return the ``topN`` highest-estimated items for every user.

    Parameters
    ----------
    predictions : iterable
        Records that unpack into five fields
        ``(uid, iid, true_r, est, details)``; only ``uid``, ``iid`` and
        ``est`` are used.
    topN : int, optional
        Maximum number of items kept per user (default 3).

    Returns
    -------
    collections.defaultdict
        Maps each ``uid`` to a list of ``(iid, est)`` tuples ordered by
        ``est`` descending; ties keep their input order. Because this is a
        ``defaultdict(list)``, looking up an unknown ``uid`` yields (and
        stores) an empty list.
    """
    import heapq
    from operator import itemgetter

    by_estimate = itemgetter(1)

    # Group the (item, estimate) pairs by user.
    top_recs = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        top_recs[uid].append((iid, est))

    # Only existing keys are reassigned here, so the dict size does not
    # change while it is being iterated.
    for uid, user_ratings in top_recs.items():
        if isinstance(topN, int) and topN >= 0:
            # Partial selection: avoids fully sorting a user's candidate list
            # when only a handful of entries are kept. Gives the same result
            # as sorted(..., reverse=True)[:topN].
            top_recs[uid] = heapq.nlargest(topN, user_ratings, key=by_estimate)
        else:
            # None / negative / non-int topN: keep plain slice semantics.
            top_recs[uid] = sorted(user_ratings, key=by_estimate, reverse=True)[:topN]

    return top_recs

In [24]:
# Per-user top recommendations (default topN of 3) built from the model's
# predictions.
top3_recommendations = get_top3_recommendations(predictions)

In [38]:
import numpy as np
def print_recs(i):
    """Print the title and rounded estimate of each recommendation for user ``i``.

    Reads the notebook-level ``top3_recommendations`` mapping and the
    ``titles`` frame. Raises IndexError if a recommended movie id has no row
    in ``titles``.
    """
    for movie_id, estimate in top3_recommendations[i]:
        matching_names = titles.loc[titles['Movie_Id'] == movie_id, 'Name'].values
        print(matching_names[0], np.round(estimate, 2))

In [39]:
# Show the recommendations for one specific user.
# NOTE(review): this id only has recommendations if it was drawn into the
# customer sample; top3_recommendations is a defaultdict(list), so an unknown
# id prints nothing instead of raising.
i = 262149
print_recs(i)

The Blair Witch Project 4.65
Lone Wolf and Cub: Sword of Vengeance 4.63
The Basketball Diaries 4.63


In [54]:
# Pick one user at random among those with recommendations and show them.
candidate_users = list(top3_recommendations)
i = np.random.choice(candidate_users)
print_recs(i)

Family Guy: Vol. 1: Seasons 1-2 4.32
Family Guy: Vol. 2: Season 3 4.32
Team America: World Police 4.3


In [55]:
# Titles of the movies that user `i` rated 5, for comparison with the
# recommendations printed for that user.
five_star_mask = (data.df['Cust_Id'] == i) & (data.df['Rating'] == 5)
films = data.df.loc[five_star_mask, 'Movie_Id'].values
titles.loc[titles['Movie_Id'].isin(films), 'Name'].values

array(['The Deer Hunter', 'Dogma', 'Kill Bill: Vol. 2',
       'The Matrix: Revolutions', 'Boyz N the Hood',
       'Dragon: The Bruce Lee Story', 'The Sandlot', 'Man on Fire',
       'Hook', 'Casino: 10th Anniversary Edition',
       'Die Hard 2: Die Harder', "Cheech & Chong's Up in Smoke",
       'Lord of the Rings: The Fellowship of the Ring', 'Braveheart',
       'Elf', 'Half Baked', 'Lost: Season 1',
       'Fear and Loathing in Las Vegas', 'The Matrix: Reloaded', 'Signs',
       'Patch Adams', 'Army of Darkness',
       'South Park: Passion of the Jew', 'Mission: Impossible',
       'Tommy Boy', 'Jay and Silent Bob Strike Back',
       'Jackass: The Movie',
       'Star Wars: Episode V: The Empire Strikes Back',
       'GoodFellas: Special Edition', 'Snatch', 'Fight Club',
       'Scarface: 20th Anniversary Edition', 'Awakenings', 'Michael',
       'The Cable Guy', 'Natural Born Killers',
       'Lord of the Rings: The Two Towers: Extended Edition',
       'The Lord of the Rings: