In [237]:
import pandas as pd
import numpy as np
import scipy as sp
import re, os, math, sklearn, datetime, pickle

In [238]:
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [239]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [240]:
df_ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### 1. Filter movies with 5 or more ratings

#### Create a dataframe (df_movie_ratings_count_min5) that stores each movie that has more than 4 ratings along with the number of its ratings

Group the ratings_df by the movieId and count the rows with the same movieId

In [241]:
df_movie_ratings_count = df_ratings.groupby('movieId').count()

Keep only the index and the first column and rename the first column for beautifying purposes

In [242]:
df_movie_ratings_count = df_movie_ratings_count.iloc[:,:1]

df_movie_ratings_count.rename(columns = {'userId':'count'}, inplace = True)

In [243]:
df_movie_ratings_count.head(33)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


#### Remove movies with fewer than 5 ratings

In [244]:
df_movie_ratings_count_min5= df_movie_ratings_count.loc[df_movie_ratings_count['count'] > 4]

In [245]:
df_movie_ratings_count_min5.head(40)

Unnamed: 0_level_0,count
movieId,Unnamed: 1_level_1
1,215
2,110
3,52
4,7
5,49
6,102
7,54
8,8
9,16
10,132


In [246]:
df_ratings_min5 = df_ratings.loc[df_ratings['movieId'].isin(df_movie_ratings_count_min5.index)]

In [247]:
df_movies_min5 = df_movies.loc[df_movies['movieId'].isin(df_movie_ratings_count_min5.index)]

In [248]:
print(df_ratings_min5.shape)
print(df_ratings.shape)
print(df_movies_min5.shape)
print(df_movies.shape)
print(df_movies.shape[0]-df_movies_min5.shape[0], "movies have been removed")

(90274, 4)
(100836, 4)
(3650, 3)
(9742, 3)
6092 movies have been removed


#### Check that movies with fewer than 5 ratings have indeed been removed

In [249]:
df_movies.loc[df_movies['movieId']==40]

Unnamed: 0,movieId,title,genres
36,40,"Cry, the Beloved Country (1995)",Drama


In [250]:
df_ratings.loc[df_ratings['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp
47991,311,40,3.5,1057854804
84607,544,40,5.0,850688776


In [251]:
df_movies_min5.loc[df_movies_min5['movieId']==40]

Unnamed: 0,movieId,title,genres


In [252]:
df_ratings_min5.loc[df_ratings_min5['movieId']==40]

Unnamed: 0,userId,movieId,rating,timestamp


### 2. Create the pivot table

In [254]:
merged = df_ratings_min5.merge(df_movies_min5, left_on = 'movieId', right_on = 'movieId', suffixes= ['_user', ''])
merged.drop(columns = ['timestamp','genres'], inplace = True)

In [255]:
merged[300:309]

Unnamed: 0,userId,movieId,rating,title
300,202,6,5.0,Heat (1995)
301,217,6,2.0,Heat (1995)
302,219,6,3.5,Heat (1995)
303,220,6,3.5,Heat (1995)
304,239,6,5.0,Heat (1995)
305,244,6,5.0,Heat (1995)
306,266,6,4.0,Heat (1995)
307,269,6,5.0,Heat (1995)
308,270,6,3.0,Heat (1995)


In [256]:
merged.isnull().values.any()

False

In [257]:
piv = merged.pivot_table(index=['userId'], columns=['title'], values='rating')

In [259]:
piv.iloc[10:20]

title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),[REC] (2007),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11,,,,,,,,,,,...,,,,,,,,,,
12,5.0,,,,5.0,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
14,,,,,,,,,,,...,,,,,,,,,,
15,,4.0,,5.0,,,,1.5,,,...,,,,,3.0,,,,,
16,,,,,,,,,,4.0,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
18,,4.0,,,,,,,,5.0,...,,4.5,,3.0,,,,,,
19,2.0,,,,3.0,,1.0,,,,...,3.0,,,,,,,,,2.0
20,,,,,,,3.0,4.0,,,...,,,,3.5,,,,0.5,,


In [262]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list
import operator

def similar_user_recs(user):
    
    if user not in piv_adj.columns:
        return('No data available on user {}'.format(user))
    
    sim_users =user_sim_adj_df.sort_values(by=user, ascending=False).index[1:5]
    print(sim_users)
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_adj.loc[:, i].max()
        print(max_score)
        best.append(piv_adj[piv_adj.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:20]    

In [263]:
piv.loc[41,'Lost in Translation (2003)']

3.5

#### 3. Define prediction functions

In [268]:
"""
Item-based prediction
calculates the weighted average of k similar movies to determine a potential rating for an input user and movie
movie_values: list of the similarities (with values 0-1) of the k nearest movies 
Parameters: movie, user, movie's k nearest neighbours, movie-by-movie similarity table
Returns: item-based prediction
"""
def ib_predicted_rating(movie_name, user, k, item_sim_df):
    sim_movies = item_sim_df.sort_values(by=movie_name, ascending=False).index[1:k+1]  #start index at 1 to avoid including the searched movie in its neighbors
    movie_values = item_sim_df.sort_values(by=movie_name, ascending=False).loc[:,movie_name].tolist()[1:k+1]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_movies):
        rating = piv.loc[user, i]
        similarity = movie_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    
    #sum(weight_list)==0 means that the user has not rated any of the k nearest movies 
    if (sum(weight_list)==0):  
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)    

In [266]:
"""
User-based prediction
calculates the weighted average of k similar users to determine a potential rating for an input user and movie
Parameters: movie, user, user's k nearest neighbours, user-by-user similarity table
Returns: user-based prediction
"""
def ub_predicted_rating(movie_name, user, k, user_sim_df):
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:k+1]
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:k+1]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, movie_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
            
    if (sum(weight_list)==0):
        return np.nan
    else:
        return sum(rating_list)/sum(weight_list)  

In [270]:
piv.loc[331,'Lost in Translation (2003)']

5.0

#### 4. Define similarity functions

For the calculation of adjusted cosine similarity the sklearn's cosine similarity will be used but with an adjusted dataset

In [271]:
"""
Given a pivot table P (dataframe) with users as rows and items as columns and ratings on items as P(i,j)
Removes from every user column the user's average rating and then replaces NaN with 0s. 
Transposes P.
Creates the sparse matrix from the transposed P and calculates the cosine similarith of this matrix
"""
from sklearn.metrics.pairwise import cosine_similarity

def adjusted_cosine_similarity(piv):
    
    piv_adj = piv.copy()
    piv_adj = piv_adj.apply(lambda x: (x-np.mean(x)), axis=1) # the average of the row is taken, even if axis=1 
    piv_adj.fillna(0, inplace=True)
    piv_adj_sparse = sp.sparse.csr_matrix(piv_adj.to_numpy())
    return cosine_similarity(piv_adj_sparse)

adjusted_cosine_similarity(dev_piv).shape


(610, 610)

In [29]:
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform

def jaccard_similarity(piv):
    piv_jac = piv.copy()
    piv_jac.fillna(0, inplace=True)
    piv_jac.to_numpy()
    jaccard_distances = pdist(piv_jac, metric='jaccard')
    jaccard_distances = squareform(jaccard_distances)
    jaccard_similarity = 1-jaccard_distances
    return jaccard_similarity

In [30]:
"""
Parameters: piv (pivot table), experiment ('jaccard|adjusted','adjusted|adjusted')
Defines: two dataframes; 1.showing the similarity between items and 2.showing the similarity between users
"""
def define_similarity(piv,experiment):
    
    if (experiment=='jaccard|adjusted'):
    
        item_similarity_jaccard = jaccard_similarity(piv.T)
        item_sim_jac_df = pd.DataFrame(item_similarity_jaccard, index = piv.columns, columns = piv.columns)
        user_similarity_adjusted = adjusted_cosine_similarity(piv)
        user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv.index, columns = piv.index)
        print("jac item df shape",item_sim_jac_df.shape)
        print("adj user df shape",user_sim_adj_df.shape)
        
        return (item_sim_jac_df, user_sim_adj_df)
        
    elif (experiment=='adjusted|adjusted'):
        
        item_similarity_adjusted = adjusted_cosine_similarity(piv.T)
        item_sim_adj_df = pd.DataFrame(item_similarity_adjusted, index = piv.columns, columns = piv.columns)
        user_similarity_adjusted = adjusted_cosine_similarity(piv)
        user_sim_adj_df = pd.DataFrame(user_similarity_adjusted, index = piv.index, columns = piv.index)
        
        print("adj user df shape",user_sim_adj_df.shape)
        print("adj item df shape",item_sim_adj_df.shape)
        
        return (item_sim_adj_df, user_sim_adj_df)


### 5. Create training sets

#### But first, we need some helper functions

#### In order to create the different training tables (10%, 20%, ... , 90%) a function that calculates the percentage of the table that is filled with ratings will be defined.

In [40]:
"""
Calculates for each column (movie) the completeness of the column (values that are not nan) as a percentage,
then it calculates the mean of all the columns
Parameters: a pivot table
Returns: a percentage of the cells of the pivot table that are not NaN
"""
def table_completeness(piv):
    not_nans=[]
    for j in piv.columns:
        not_nans.append(round(piv.loc[:,j].count()/piv.shape[0]*100, 1)) #divide with the total users number
    return round(np.mean(not_nans),3)

#### At first, a training table that has the 90% of the ratings of the initial pivot table (randomly selected) will be created. The leftover 10% ratings will be used to create the test table which will be the same throught all the experiments

In [236]:
"""
Creates an approximate 90/10 split. This is done by randomly selecting 10% of the ratings in the initial pivot table and removing them
from training table and moving them to the testing table.
With propability 1-(train_percentage + fixing_coefficient) and if a cell is not empty, then, empty the cell of the pivot table
Fixing coefficint empirically calculated to be 0.005 and is added to training percentage to compensate for the
fact that a lot of the randomly chosen cells will be empty in the initial table and thus cannot be removed in
the training table therefore resulting in a larger training table than the one we want
Parameters: pivot table
Returns: 90/10 train-test tables
"""
import random

def initial_split(piv):
    train_piv = piv.copy()
    test_piv = pd.DataFrame().reindex_like(piv)

    for i in train_piv.index-1:
        for j in range(len(train_piv.columns)):
            rand = random.random()
            if (rand<0.095 and not math.isnan(piv.iloc[i,j])):
                train_piv.iloc[i,j] = np.nan
                test_piv.iloc[i,j] = piv.iloc[i,j]
    print("Inital completeness:",table_completeness(piv))
    print("Train90 completeness:",table_completeness(train_piv))
    print("Test10 completeness:",table_completeness(test_piv))

    return (train_piv, test_piv)

In [235]:
train90, test10 = initial_split(piv)

Inital completeness: 4.05
Train90 completeness: 3.668
Test10 completeness: 0.393


#### The (10%, 20%, ... 80%) training tables will be created from the 90% training table by randomly selecting ratings from it. It is crucial that the (10%, 20%, ... 80%) training tables be created from the 90% and not from the initial in order to avoid data leakage from the test table

In [146]:
def keep_percentage(train_piv, train_percentage):
    subtrain_piv = train_piv.copy()
    
    fixing_coef = 0.005*(10-train_percentage*10)
    prop = 0.9-train_percentage+fixing_coef
    print("Propability",prop)
    for i in train_piv.index-1:
        for j in range(len(train_piv.columns)):
            rand = random.random()
            if (rand<(0.9-train_percentage+fixing_coef) and not math.isnan(train_piv.iloc[i,j])):
                subtrain_piv.iloc[i,j] = np.nan
                
    print("Train"+ str(int(train_percentage*100))+" completeness:",table_completeness(subtrain_piv))
    
    return subtrain_piv

In [143]:
train80 = keep_percentage(train90, 0.8)
train70 = keep_percentage(train90, 0.7)

fg 2.0
0.01
t 0.10999999999999997
Train80 completeness: 3.265
fg 3.0
0.015
t 0.21500000000000008
Train70 completeness: 2.881


In [144]:
train60 = keep_percentage(train90, 0.6)

fg 4.0
0.02
t 0.32000000000000006
Train60 completeness: 2.496


In [147]:
train50 = keep_percentage(train90, 0.5)

Propability 0.42500000000000004
Train50 completeness: 2.107


In [148]:
train40 = keep_percentage(train90, 0.4)

Propability 0.53
Train40 completeness: 1.734


In [149]:
train30 = keep_percentage(train90, 0.3)

Propability 0.6350000000000001
Train30 completeness: 1.348


In [150]:
train20 = keep_percentage(train90, 0.2)

Propability 0.74
Train20 completeness: 0.963


In [151]:
train10 = keep_percentage(train90, 0.1)

Propability 0.8450000000000001
Train10 completeness: 0.565


#### Core functions

In [222]:

def make_predictions(train, test, k, item_sim_df, user_sim_df):
    pred = []
    true = []
    
    print(train.index)
    for i in train.index:
        print("~~~~~~~~~~~~ User " +str(i) +" of 610 ~~~~~~~~~~~~")
        for j in train.columns:
            #print(j)
            if (math.isnan(train.loc[i,j]) and not math.isnan(test.loc[i,j])):
                if (ib_predicted_rating(j, i, k, item_sim_df)>2.5):
                    pred.append(ub_predicted_rating(j, i, k, user_sim_df))
                    true.append(test.loc[i,j])
    return (pred,true)

In [218]:
"""
Parameters: piv (pivot table - users on rows, movies on columns), train percentage (from total users) , k nearest neighbours,
sim_func ('adjusted cosine','jaccard')
Returns: prediction list, real test list
"""
def create_model(train, test, k, experiment):
    
    item_sim_df, user_sim_df = define_similarity(train,experiment) 
    print("item_sim df shape",item_sim_df.shape)
    print("user_sim df shape",user_sim_df.shape)
    
    return (make_predictions(train, test, k, item_sim_df, user_sim_df))

#### Evaluation helper functions

In [227]:
"""
From 2 lists that their indices correspond creates 2 new 'same' lists with no NaN values,
e.g. if it is found that predicted rating for movie1 by user1 is y_pred[0]==NaN (because 
it was <2.5, we didn't care to make a prediction) and true rating is y_true[0]==3,
y_pred_watch and y_test_watch must not contain this rating

Parameters: predictions list, true test list
Returns: 2 lists
"""
def remove_nan(y_pred,y_true):
    y_true_watch = []
    y_pred_watch = []
    for i in range(len(y_true)):    #len(y_test)==len(y_true)
            if (not math.isnan(y_true[i]) and not math.isnan(y_pred[i])):
                y_true_watch.append(y_true[i])
                y_pred_watch.append(y_pred[i])
                
    return (y_pred_watch, y_true_watch)

In [34]:
"""
Create 2 lists in order be compared with sklearns's binary classification metrics
Parameters: predictions list, true test list
Returns: 2 lists like the given but with values in the binary range
"""

def prepare_for_binary_evaluation(y_pred,y_test):
    binary_y_test_watch = []
    binary_y_pred_watch = []
    for i in range(len(y_test_watch)):
        if (y_test_watch[i]>3.5):
            binary_y_test_watch.append(1)
        else:
            binary_y_test_watch.append(0)
        if (y_pred_watch[i]>3.5):
            binary_y_pred_watch.append(1)
        else:
            binary_y_pred_watch.append(0)
            
    return (binary_y_pred_watch, binary_y_test_watch)

<br />

### 5. Experiments

#### 1A) K = 20, train = 10%

In [233]:
y_pred1A, y_true1A = create_model(train10,test10,20,'jaccard|adjusted')

KeyboardInterrupt: 

In [None]:
y_pred_watch1A, y_true_watch1A = remove_nan(y_pred1A,y_true1A)
binary_y_pred_watch, binary_y_true_watch = prepare_for_binary_evaluation(y_pred_watch1A,y_true_watch1A)

In [None]:
from sklearn.metrics import mean_absolute_error, precision_score, recall_score

print(mean_absolute_error(y_true_watch1A, y_pred_watch1A))
print(precision_score(binary_y_true_watch, binary_y_pred_watch, average='weighted'))
print(recall_score(binary_y_true_watch, binary_y_pred_watch, average='weighted'))

#### 1B) K = 20, train = 20%

In [231]:
y_pred1B, y_true1B = create_model(train20,test10,20,'jaccard|adjusted')

jac item df shape (3650, 3650)
adj user df shape (610, 610)
item_sim df shape (3650, 3650)
user_sim df shape (610, 610)
Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            601, 602, 603, 604, 605, 606, 607, 608, 609, 610],
           dtype='int64', name='userId', length=610)
~~~~~~~~~~~~ User 1 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 2 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 3 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 4 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 5 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 6 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 7 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 8 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 9 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 10 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 11 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 12 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 13 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 14 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 15 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 16 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 17 of 610 ~~~~~~~~~~~

~~~~~~~~~~~~ User 192 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 193 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 194 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 195 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 196 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 197 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 198 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 199 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 200 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 201 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 202 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 203 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 204 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 205 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 206 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 207 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 208 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 209 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 210 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 211 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 212 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 213 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 214 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 215 of 610 ~~~~~

KeyboardInterrupt: 

In [228]:
y_pred_watch1B, y_true_watch1B = remove_nan(y_pred1B,y_true1B)
binary_y_pred_watch, binary_y_true_watch = prepare_for_binary_evaluation(y_pred_watch1B,y_true_watch1B)

In [229]:
from sklearn.metrics import mean_absolute_error, precision_score, recall_score

print(mean_absolute_error(y_true_watch1B, y_pred_watch1B))
print(precision_score(binary_y_true_watch, binary_y_pred_watch, average='weighted'))
print(recall_score(binary_y_true_watch, binary_y_pred_watch, average='weighted'))

0.8106574463068166
0.6192607464523696
0.6209549671871464


#### Implementing the zero rule in order to make an assesment about the MAE value

In [230]:
zero_pred = [np.mean(y_true_watch1A)] * len(y_true_watch1A)
print(mean_absolute_error(y_true_watch1A, zero_pred))

0.7645046915997343


1A) K = 20, train = 20%

In [None]:
y_pred1A, y_true1A = create_model(train10,test10,20,'jaccard|adjusted')

In [158]:
item_sim_df, user_sim_df = define_similarity(train10,'jaccard|adjusted') 
print("item_sim df shape",item_sim_df.shape)
print("user_sim df shape",user_sim_df.shape)

jac item df shape (3650, 3650)
adj user df shape (610, 610)
item_sim df shape (3650, 3650)
user_sim df shape (610, 610)


In [201]:
len(y_true)

5690

In [193]:
y_pred, y_true = make_predictions(train10, test10, 20, item_sim_df, user_sim_df)

~~~~~~~~~~~~ User 1 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 2 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 3 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 4 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 5 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 6 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 7 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 8 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 9 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 10 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 11 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 12 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 13 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 14 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 15 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 16 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 17 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 18 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 19 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 20 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 21 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 22 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 23 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 24 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 25 of 6

~~~~~~~~~~~~ User 199 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 200 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 201 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 202 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 203 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 204 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 205 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 206 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 207 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 208 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 209 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 210 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 211 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 212 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 213 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 214 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 215 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 216 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 217 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 218 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 219 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 220 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 221 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 222 of 610 ~~~~~

~~~~~~~~~~~~ User 395 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 396 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 397 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 398 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 399 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 400 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 401 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 402 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 403 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 404 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 405 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 406 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 407 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 408 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 409 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 410 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 411 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 412 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 413 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 414 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 415 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 416 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 417 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 418 of 610 ~~~~~

~~~~~~~~~~~~ User 591 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 592 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 593 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 594 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 595 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 596 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 597 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 598 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 599 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 600 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 601 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 602 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 603 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 604 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 605 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 606 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 607 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 608 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 609 of 610 ~~~~~~~~~~~~
~~~~~~~~~~~~ User 610 of 610 ~~~~~~~~~~~~
