In [1]:
import heapq

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import pairwise_distances

In [2]:
def read_file_csv(file_name):
    return pd.read_csv(file_name, low_memory=False, encoding="ISO-8859-1")

def weighted_average_rating(counts_df, m, rating_mean_df, rating_all_mean):
    return (counts_df / (counts_df + m)) * rating_mean_df + (m / (counts_df + m)) * rating_all_mean

def get_top_k_recommendations(df, counts_df, k, columns, m):
    return df[counts_df >= m].nlargest(k, columns)

def get_age_range(age):
    if age % 10 == 0:
        age -= 1

    lower_bound = age - ((age % 10) - 1)
    upper_bound = age + (10 - (age % 10))

    return lower_bound, upper_bound

In [3]:
books_df = read_file_csv('./data/books.csv')
books_tags_df = read_file_csv('./data/books_tags.csv')
users_df = read_file_csv('./data/users.csv')
ratings_df = read_file_csv('./data/ratings.csv')
tags_df = read_file_csv('./data/tags.csv')
test_df = read_file_csv('./data/test.csv')

In [4]:
# b_r_u_df = book_rating_user_df
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [5]:
title_df = b_r_u_df[['book_id', 'title']].drop_duplicates(subset='book_id')
title_df.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [6]:
nb_voters_data = b_r_u_df['book_id'].value_counts()
nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})
nb_voters_df.head()

Unnamed: 0,book_id,counts
0,1,1764
1,4,1747
2,5,1655
3,2,1634
4,26,1597


In [7]:
rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})
rating_mean_df.head()

Unnamed: 0,book_id,rating_mean
0,1,4.238662
1,2,4.05814
2,3,3.376528
3,4,4.364625
4,5,3.748036


In [8]:
m = nb_voters_data.quantile(0.90)
print(m)

202.0


In [9]:
## Ask Dubi is this should be the mean of the ratings or the mean of the rating means?

# rating_all_mean = b_r_u_df['rating'].mean()
rating_all_mean = rating_mean_df['rating_mean'].mean() # Ask Dubi about this.
print(rating_all_mean)

3.7395773123055873


In [10]:
df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
df = pd.merge(df, rating_mean_df, on='book_id', how='inner')
df.head()

Unnamed: 0,book_id,title,counts,rating_mean
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662
1,4,To Kill a Mockingbird,1747,4.364625
2,5,The Great Gatsby,1655,3.748036
3,8,The Catcher in the Rye,1535,3.76873
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193


In [11]:
df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
df.head()

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
2,5,The Great Gatsby,1655,3.748036,3.747116
3,8,The Catcher in the Rye,1535,3.76873,3.765339
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193,4.011436


In [12]:
result = get_top_k_recommendations(df, df['counts'], 10, ['weighted_average_rating'], m)
result.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.338028
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
506,102,Where the Wild Things Are,612,4.449346,4.273212
370,85,The Giving Tree,815,4.364417,4.240309
364,50,Where the Sidewalk Ends,976,4.343238,4.239724
8,31,The Help,1265,4.318577,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",557,4.396768,4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,1394,4.28264,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",533,4.348968,4.181489


# Non-Personalized

In [13]:
global b_r_u_df
# b_r_u_df = book_rating_user_df
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')

In [14]:
def merge_tables(b_r_u_df):
    # Dataframe that contains distribution of votes by book ID.
    nb_voters_data = b_r_u_df['book_id'].value_counts()
    nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})

    # Dataframe that contains distribution of rate averages by book ID.
    rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
    rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})
    
    return nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data

In [15]:
def get_voters_and_means(nb_voters_data, rating_mean_df):
    m = nb_voters_data.quantile(0.90)
    rating_all_mean = rating_mean_df['rating_mean'].mean()
    
    return m, rating_all_mean

In [16]:
def calculate_WAR_and_top_k(df, m, rating_all_mean, k, columns):
    df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
    return get_top_k_recommendations(df, df['counts'], k, columns, m)

## get_simply_recommendation

In [17]:
def get_simply_recommendation(k):
    global b_r_u_df
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [18]:
recommendation_df = get_simply_recommendation(10)
recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.338028
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
506,102,Where the Wild Things Are,612,4.449346,4.273212
370,85,The Giving Tree,815,4.364417,4.240309
364,50,Where the Sidewalk Ends,976,4.343238,4.239724
8,31,The Help,1265,4.318577,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",557,4.396768,4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,1394,4.28264,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",533,4.348968,4.181489


## get_simply_place_recommendation

In [19]:
def get_simply_place_recommendation(place, k):
    global b_r_u_df
    
    b_r_u_place_df = b_r_u_df[b_r_u_df['location'] == place]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_place_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [20]:
place_recommendation_df = get_simply_place_recommendation('Ohio', 10)
place_recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
310,126,Dune (Dune Chronicles #1),12,4.75,4.367963
602,143,All the Light We Cannot See,15,4.6,4.317786
356,144,"Unbroken: A World War II Story of Survival, Re...",8,4.75,4.266087
190,24,Harry Potter and the Goblet of Fire (Harry Pot...,25,4.4,4.249728
424,102,Where the Wild Things Are,15,4.466667,4.226877
889,490,Maus I: A Survivor's Tale: My Father Bleeds Hi...,7,4.714286,4.213664
1191,1462,The Orphan Master's Son,7,4.714286,4.213664
1554,983,Between the World and Me,7,4.714286,4.213664
884,119,The Handmaid's Tale,13,4.461538,4.199565
800,89,The Princess Bride,14,4.428571,4.190062


## get_simply_age_recommendation

In [21]:
def get_simply_age_recommendation(age, k):
    global b_r_u_df
    
    lower_bound, upper_bound = get_age_range(age)
    b_r_u_age_df = b_r_u_df[(b_r_u_df['age'] >= lower_bound) & (b_r_u_df['age'] <= upper_bound)]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_age_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [22]:
age_recommendation_df = get_simply_age_recommendation(28, 10)
age_recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
502,25,Harry Potter and the Deathly Hallows (Harry Po...,186,4.413978,4.326251
1,4,To Kill a Mockingbird,216,4.365741,4.294203
368,85,The Giving Tree,99,4.444444,4.289614
948,89,The Princess Bride,69,4.449275,4.244702
659,133,"Anne of Green Gables (Anne of Green Gables, #1)",73,4.410959,4.224914
362,50,Where the Sidewalk Ends,137,4.313869,4.216411
504,102,Where the Wild Things Are,78,4.371795,4.20468
418,70,"Ender's Game (Ender's Saga, #1)",93,4.344086,4.204095
8,31,The Help,143,4.293706,4.202891
209,21,Harry Potter and the Order of the Phoenix (Har...,169,4.272189,4.196385


# Collaborative Filtering

## Part 1

In [23]:
# b_r_u_df = book_rating_user_df
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [24]:
items = b_r_u_df[['book_id', 'title']].drop_duplicates(subset='book_id')
items.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [26]:
ratings = b_r_u_df[['user_id', 'book_id', 'rating']]
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,3634,1,4
1,3634,4,5
2,3634,5,3
3,3634,8,5
4,3634,17,4


In [27]:
def keep_top_k(arr, k): 
    smallest = heapq.nlargest(k, arr)[-1] 
    arr[arr < smallest] = 0 # replace anything lower than the cut off with 0
    return arr

# For debug purposes - appends rating r for items on behalf of a debug user
def new_user_ratings(items, r, new_user_id):
  to_append=pd.DataFrame(items, columns=['book_id'])
  to_append[['user_id','rating']] = new_user_id,r
  return to_append

In [28]:
def get_top_rated(data_matrix_row, items, k=20):
  srt_idx = np.argsort(-data_matrix_row)
  #print(~np.isnan(data_matrix_row[srt_idx]))
  srt_idx_not_nan = srt_idx[~np.isnan(data_matrix_row[srt_idx])]
  return items['title'].iloc[srt_idx_not_nan][:k]  

In [29]:
dic = dict()

book_ids = ratings.book_id.unique()
counter = 0
for value in sorted(book_ids):
    dic[value] = counter
    counter += 1

In [36]:
#calculate the number of unique users and movies.
#n_users = ratings.user_id.unique().shape[0]
#n_items = ratings.book_id.unique().shape[0]

n_users = ratings.user_id.max()
n_items = ratings.book_id.max()

#create ranking table - that table is sparse
data_matrix = np.empty((n_users, n_items))
data_matrix[:] = np.nan
for line in ratings.itertuples():
    user = line[1]-1
    book = line[2]-1
    rating = line[3]
    data_matrix[user, book] = rating

#calc mean
mean_user_rating = np.nanmean(data_matrix, axis=1).reshape(-1, 1)

ratings_diff = (data_matrix - mean_user_rating)
#replace nan -> 0
ratings_diff[np.isnan(ratings_diff)]=0

In [37]:
#calculate user x user similarity matrix
user_similarity = 1-pairwise_distances(ratings_diff, metric='cosine')
print(user_similarity.shape)

# For each user (i.e., for each row) keep only k most similar users, set the rest to 0. 
# Note that the user has the highest similarity to themselves.
k=10
user_similarity = np.array([keep_top_k(np.array(arr),k) for arr  in user_similarity])
print(user_similarity.shape)

(5000, 5000)
(5000, 5000)


In [38]:
# since n-k users have similarity=0, for each user only k most similar users contribute to the predicted ratings
pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T
#pred.round(2)

In [46]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(predicted_ratings_row, data_matrix_row, items, k=5):

    predicted_ratings_unrated = predicted_ratings_row[np.isnan(data_matrix_row)]
    #print(predicted_ratings_unrated)

    idx = np.argsort(-predicted_ratings_unrated)
    print(idx)
    sim_scores = idx[0:k]
    #print(sim_scores)

    # Return top k movies
    return items['title'].iloc[sim_scores]

In [62]:
predicted_ratings_unrated = predicted_ratings_row[np.isnan(data_matrix_row)]
#print(predicted_ratings_unrated)

idx = np.argsort(-predicted_ratings_unrated)
print(idx)
sim_scores = idx[0:k]
#print(sim_scores)

lst = []
for book_id in idx:
    book_df_index = items[items['book_id'] == book_id]
    #print(f'book_id: {book_id}, book_df_index: {book_df_index}\n')
    #print(f'{book_df_index.index[0]}\n\n')
    lst.append(book_df_index.index)
sim_scores = np.array(lst)

# Return top k movies
a = items['title'].iloc[sim_scores]
b = 5

print(a)
print()
print(b)

[   2   21   72 ...   22 2738   11]


ValueError: setting an array element with a sequence.

In [47]:
user = 510
predicted_ratings_row = pred[user]
data_matrix_row=data_matrix[user]

print("Top rated movies by test user:")
#print(get_top_rated(data_matrix_row,items))

print('****** test user - user_prediction ******')
print(get_recommendations(predicted_ratings_row, data_matrix_row, items,k=10))

Top rated movies by test user:
****** test user - user_prediction ******
[   2   21   72 ...   22 2738   11]
    book_id                                              title
2         5                                   The Great Gatsby
21      139  Miss PeregrineÃÂ¢ÃÂÃÂs Home for Peculiar ...
72     4189  Princess: A True Story of Life Behind the Veil...
76     4513                                      Light on Snow
0         1            The Hunger Games (The Hunger Games, #1)
1         4                              To Kill a Mockingbird
23      180                                         Siddhartha
20      115                                          Middlesex
14       47                                     The Book Thief
13       46                                Water for Elephants


In [48]:
items[items['book_id'] == 5]

Unnamed: 0,book_id,title
2,5,The Great Gatsby


### Contact Based Filtering