In [1]:
import heapq

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def read_file_csv(file_name):
    return pd.read_csv(file_name, low_memory=False, encoding="ISO-8859-1")

def weighted_average_rating(counts_df, m, rating_mean_df, rating_all_mean):
    return (counts_df / (counts_df + m)) * rating_mean_df + (m / (counts_df + m)) * rating_all_mean

def get_top_k_recommendations(df, counts_df, k, columns, m):
    return df[counts_df >= m].nlargest(k, columns)

def get_age_range(age):
    if age % 10 == 0:
        age -= 1

    lower_bound = age - ((age % 10) - 1)
    upper_bound = age + (10 - (age % 10))

    return lower_bound, upper_bound

In [3]:
books_df = read_file_csv('./data/books.csv')
books_tags_df = read_file_csv('./data/books_tags.csv')
users_df = read_file_csv('./data/users.csv')
ratings_df = read_file_csv('./data/ratings.csv')
tags_df = read_file_csv('./data/tags.csv')
test_df = read_file_csv('./data/test.csv')

In [4]:
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [5]:
items_df = b_r_u_df[['book_id', 'title']].drop_duplicates(subset='book_id')
items_df.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [6]:
nb_voters_data = b_r_u_df['book_id'].value_counts()
nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})
nb_voters_df.head()

Unnamed: 0,book_id,counts
0,1,1764
1,4,1747
2,5,1655
3,2,1634
4,26,1597


In [7]:
rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})
rating_mean_df.head()

Unnamed: 0,book_id,rating_mean
0,1,4.238662
1,2,4.05814
2,3,3.376528
3,4,4.364625
4,5,3.748036


In [8]:
m = nb_voters_data.quantile(0.90)
print(m)

202.0


In [9]:
## Ask Dubi is this should be the mean of the ratings or the mean of the rating means?

# rating_all_mean = b_r_u_df['rating'].mean()
rating_all_mean = rating_mean_df['rating_mean'].mean() # Ask Dubi about this.
print(rating_all_mean)

3.7395773123055873


In [10]:
df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
df = pd.merge(df, rating_mean_df, on='book_id', how='inner')
df.head()

Unnamed: 0,book_id,title,counts,rating_mean
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662
1,4,To Kill a Mockingbird,1747,4.364625
2,5,The Great Gatsby,1655,3.748036
3,8,The Catcher in the Rye,1535,3.76873
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193


In [11]:
df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
df.head()

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
2,5,The Great Gatsby,1655,3.748036,3.747116
3,8,The Catcher in the Rye,1535,3.76873,3.765339
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193,4.011436


In [12]:
result = get_top_k_recommendations(df, df['counts'], 10, ['weighted_average_rating'], m)
result.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.338028
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
506,102,Where the Wild Things Are,612,4.449346,4.273212
370,85,The Giving Tree,815,4.364417,4.240309
364,50,Where the Sidewalk Ends,976,4.343238,4.239724
8,31,The Help,1265,4.318577,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",557,4.396768,4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,1394,4.28264,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",533,4.348968,4.181489


# Non-Personalized

In [13]:
global b_r_u_df
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')

In [14]:
def merge_tables(b_r_u_df):
    # Dataframe that contains distribution of votes by book ID.
    nb_voters_data = b_r_u_df['book_id'].value_counts()
    nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})

    # Dataframe that contains distribution of rate averages by book ID.
    rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
    rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})
    
    return nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data

In [15]:
def get_voters_and_means(nb_voters_data, rating_mean_df):
    m = nb_voters_data.quantile(0.90)
    rating_all_mean = rating_mean_df['rating_mean'].mean()
    
    return m, rating_all_mean

In [16]:
def calculate_WAR_and_top_k(df, m, rating_all_mean, k, columns):
    df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
    return get_top_k_recommendations(df, df['counts'], k, columns, m)

## get_simply_recommendation

In [17]:
def get_simply_recommendation(k):
    global b_r_u_df
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [18]:
recommendation_df = get_simply_recommendation(10)
recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,4.338028
1,4,To Kill a Mockingbird,4.299843
506,102,Where the Wild Things Are,4.273212
370,85,The Giving Tree,4.240309
364,50,Where the Sidewalk Ends,4.239724
8,31,The Help,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",4.181489


## get_simply_place_recommendation

In [19]:
def get_simply_place_recommendation(place, k):
    global b_r_u_df
    
    b_r_u_place_df = b_r_u_df[b_r_u_df['location'] == place]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_place_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [20]:
place_recommendation_df = get_simply_place_recommendation('Ohio', 10)
place_recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
310,126,Dune (Dune Chronicles #1),4.367963
602,143,All the Light We Cannot See,4.317786
356,144,"Unbroken: A World War II Story of Survival, Re...",4.266087
190,24,Harry Potter and the Goblet of Fire (Harry Pot...,4.249728
424,102,Where the Wild Things Are,4.226877
889,490,Maus I: A Survivor's Tale: My Father Bleeds Hi...,4.213664
1191,1462,The Orphan Master's Son,4.213664
1554,983,Between the World and Me,4.213664
884,119,The Handmaid's Tale,4.199565
800,89,The Princess Bride,4.190062


## get_simply_age_recommendation

In [21]:
def get_simply_age_recommendation(age, k):
    global b_r_u_df
    
    lower_bound, upper_bound = get_age_range(age)
    b_r_u_age_df = b_r_u_df[(b_r_u_df['age'] >= lower_bound) & (b_r_u_df['age'] <= upper_bound)]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_age_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])

In [22]:
age_recommendation_df = get_simply_age_recommendation(28, 10)
age_recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
502,25,Harry Potter and the Deathly Hallows (Harry Po...,4.326251
1,4,To Kill a Mockingbird,4.294203
368,85,The Giving Tree,4.289614
948,89,The Princess Bride,4.244702
659,133,"Anne of Green Gables (Anne of Green Gables, #1)",4.224914
362,50,Where the Sidewalk Ends,4.216411
504,102,Where the Wild Things Are,4.20468
418,70,"Ender's Game (Ender's Saga, #1)",4.204095
8,31,The Help,4.202891
209,21,Harry Potter and the Order of the Phoenix (Har...,4.196385


# Collaborative Filtering

In [23]:
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')

items_df = b_r_u_df[['book_id', 'title']].drop_duplicates(subset='book_id')

ratings = b_r_u_df[['user_id', 'book_id', 'rating']]

In [24]:
def keep_top_k(arr, k):
    smallest = heapq.nlargest(k, arr)[-1]
    arr[arr < smallest] = 0  # replace anything lower than the cut off with 0
    return arr


# Do we need this function?
def get_top_rated(data_matrix_row, items_df, k=20):
    srt_idx = np.argsort(-data_matrix_row)
    srt_idx_not_nan = srt_idx[~np.isnan(data_matrix_row[srt_idx])]
    return items_df['title'].iloc[srt_idx_not_nan][:k]


def build_CF_prediction_matrix(sim):
    return 1-pairwise_distances(ratings_diff, metric=sim)


def get_recommendations(predicted_ratings_row, data_matrix_row, items_df, k=5):
    predicted_ratings_unrated = predicted_ratings_row[np.isnan(data_matrix_row)]

    idx = np.argsort(-predicted_ratings_unrated)
    sim_scores = idx[0:k]
    fixed_sim_scores = sim_scores + 1

    values = []
    for book_id in fixed_sim_scores:
        book_df_index = items_df[items_df['book_id'] == book_id]
        values.append(book_df_index.values[0][1])

    return pd.DataFrame(data={'book_id': sim_scores, 'title': values})


def get_CF_recommendation(user_id, k):
    user_id = user_id - 1
    predicted_ratings_row = pred[user_id]
    data_matrix_row = data_matrix[user_id]

    # print("Top rated books by test user:")
    # print(get_top_rated(data_matrix_row, items_df))

    # print('****** test user - user_prediction ******')
    return get_recommendations(predicted_ratings_row, data_matrix_row, items_df, k=k)

In [25]:
n_users = ratings.user_id.max()
n_items = ratings.book_id.max()

data_matrix = np.empty((n_users, n_items))
data_matrix[:] = np.nan
for line in ratings.itertuples():
    user = line[1]-1
    book = line[2]-1
    rating = line[3]
    data_matrix[user, book] = rating

mean_user_rating = np.nanmean(data_matrix, axis=1).reshape(-1, 1)
ratings_diff = (data_matrix - mean_user_rating)
ratings_diff[np.isnan(ratings_diff)] = 0

In [26]:
user_similarity = build_CF_prediction_matrix('cosine')

# For each user (i.e., for each row) keep only k most similar users, set the rest to 0.
# Note that the user has the highest similarity to themselves.
k = 10
user_similarity = np.array([keep_top_k(np.array(arr), k) for arr in user_similarity])

# since n-k users have similarity=0, for each user only k most similar users contribute to the predicted ratings
pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

recommendations_by_user = get_CF_recommendation(user_id=511, k=10)
recommendations_by_user.head(10)

Unnamed: 0,book_id,title
0,2,"Twilight (Twilight, #1)"
1,21,The Lovely Bones
2,72,"The Host (The Host, #1)"
3,76,"Holes (Holes, #1)"
4,0,"The Hunger Games (The Hunger Games, #1)"
5,1,Harry Potter and the Sorcerer's Stone (Harry P...
6,23,Harry Potter and the Goblet of Fire (Harry Pot...
7,20,Harry Potter and the Order of the Phoenix (Har...
8,14,The Diary of a Young Girl
9,13,Animal Farm


In [27]:
recommendations_by_user.iloc[0]

book_id                          2
title      Twilight (Twilight, #1)
Name: 0, dtype: object

In [4]:
#Changes in metrics

### Contact Based Filtering

In [5]:
tags_df.head(2)

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-


In [6]:
books_tags_df.head(2)

Unnamed: 0,goodreads_book_id,tag_id
0,1,30574
1,1,11305


In [7]:
bookreads_tags_df = pd.merge(books_tags_df, tags_df, on='tag_id', how='inner')
bookreads_tags_df.head()

Unnamed: 0,goodreads_book_id,tag_id,tag_name
0,1,30574,to-read
1,2,30574,to-read
2,3,30574,to-read
3,5,30574,to-read
4,6,30574,to-read


In [8]:
groupped_data = bookreads_tags_df.groupby('goodreads_book_id', as_index=False)['tag_name'].transform(lambda x: ','.join(x))
books_tags_row_df = pd.DataFrame(data={'goodreads_book_id': groupped_data.index.tolist(), 'tag_name': groupped_data['tag_name'].values.tolist()})
books_tags_row_df.head(2)

Unnamed: 0,goodreads_book_id,tag_name
0,0,"to-read,fantasy,favorites,currently-reading,yo..."
1,1,"to-read,fantasy,favorites,currently-reading,yo..."


In [9]:
books_df.head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780440000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPrÃÂÃÂ©",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


In [10]:
b_t_df = pd.merge(tags_df, books_tags_df, on='tag_id', how='inner')
# b_t_df = pd.merge(b_t_df, books_df[['book_id','goodreads_book_id','title']], on='goodreads_book_id', how='inner')
b_t_df = pd.merge(books_df[['book_id','goodreads_book_id','title']], books_tags_row_df, on='goodreads_book_id', how='outer')
b_t_df.head()

Unnamed: 0,book_id,goodreads_book_id,title,tag_name
0,1.0,2767052,"The Hunger Games (The Hunger Games, #1)",
1,2.0,3,Harry Potter and the Sorcerer's Stone (Harry P...,"to-read,fantasy,favorites,currently-reading,yo..."
2,3.0,41865,"Twilight (Twilight, #1)",
3,4.0,2657,To Kill a Mockingbird,"to-read,favorites,currently-reading,fiction,bo..."
4,5.0,4671,The Great Gatsby,"to-read,favorites,currently-reading,fiction,bo..."


In [11]:
b_t_df['tag_name'] = b_t_df['tag_name'].fillna('')

tfidf = TfidfVectorizer()

tfidf_matrix = tfidf.fit_transform(b_t_df['tag_name'])

print(tfidf_matrix.shape)

(42535, 2766)


In [12]:
from sklearn.metrics.pairwise import linear_kernel
 # Compute the cosine similarity matrix

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(b_t_df.index, index=b_t_df['title']).drop_duplicates()
print(indices[:10])

title
The Hunger Games (The Hunger Games, #1)                     0
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)    1
Twilight (Twilight, #1)                                     2
To Kill a Mockingbird                                       3
The Great Gatsby                                            4
The Fault in Our Stars                                      5
The Hobbit                                                  6
The Catcher in the Rye                                      7
Angels & Demons  (Robert Langdon, #1)                       8
Pride and Prejudice                                         9
dtype: int64


In [13]:
def get_content_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies (the first is the movie we asked)
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    book_indices = [i[0] for i in sim_scores]
    print(book_indices)


    # Return the top 10 most similar movies
    return b_t_df['title'].iloc[book_indices]

In [14]:
print(get_content_recommendations('Twilight (Twilight, #1)'))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
1     Harry Potter and the Sorcerer's Stone (Harry P...
2                               Twilight (Twilight, #1)
3                                 To Kill a Mockingbird
4                                      The Great Gatsby
5                                The Fault in Our Stars
6                                            The Hobbit
7                                The Catcher in the Rye
8                 Angels & Demons  (Robert Langdon, #1)
9                                   Pride and Prejudice
10                                      The Kite Runner
Name: title, dtype: object
