In [1]:
import pandas as pd
import seaborn as sns

In [2]:
def read_file_csv(file_name):
    """Read a CSV file into a DataFrame.

    The file is decoded as ISO-8859-1 and parsed with ``low_memory=False``.

    Args:
        file_name: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    read_options = {'low_memory': False, 'encoding': "ISO-8859-1"}
    return pd.read_csv(file_name, **read_options)

def weighted_average_rating(counts_df, m, rating_mean_df, rating_all_mean):
    """Blend each item's mean rating with a global mean, weighted by vote count.

    The result is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the vote count,
    ``R`` the item's mean rating and ``C`` the global mean: the more votes an
    item has relative to ``m``, the closer the result is to its own mean.

    Args:
        counts_df: Vote count(s) ``v`` (scalar or pandas Series).
        m: Weight given to the global mean, expressed as a vote count.
        rating_mean_df: Mean rating(s) ``R`` (scalar or pandas Series).
        rating_all_mean: Global mean rating ``C``.

    Returns:
        The weighted rating(s), of the same shape as the inputs.
    """
    total_weight = counts_df + m
    own_share = counts_df / total_weight
    global_share = m / total_weight
    return own_share * rating_mean_df + global_share * rating_all_mean

def get_top_k_recommendations(df, counts_df, k, columns, m):
    """Return the ``k`` highest-ranked rows among those with more than ``m`` votes.

    Args:
        df: DataFrame of candidate rows.
        counts_df: Vote counts aligned with ``df``; only rows whose count is
            strictly greater than ``m`` are kept.
        k: Number of rows to return.
        columns: Column label(s) to rank by, largest first.
        m: Vote-count threshold.

    Returns:
        A DataFrame with at most ``k`` rows, ordered by ``columns`` descending.
    """
    has_enough_votes = counts_df > m
    eligible = df[has_enough_votes]
    return eligible.nlargest(k, columns)

def get_age_range(age):
    """Return the inclusive ten-year bracket ``(lower, upper)`` containing ``age``.

    Brackets run 1-10, 11-20, 21-30, ...; an age that is an exact multiple of
    ten is the upper end of its bracket (30 -> (21, 30)).

    Args:
        age: Age to classify.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    # A multiple of ten closes its bracket, so step inside the bracket first.
    anchor = age - 1 if age % 10 == 0 else age
    units = anchor % 10

    return anchor - (units - 1), anchor + (10 - units)

In [3]:
# Load every CSV the notebook works with from ./data (decoded as ISO-8859-1
# by read_file_csv). Each table is bound to its own module-level name.
books_df = read_file_csv('./data/books.csv')
books_tags_df = read_file_csv('./data/books_tags.csv')
users_df = read_file_csv('./data/users.csv')
ratings_df = read_file_csv('./data/ratings.csv')
tags_df = read_file_csv('./data/tags.csv')
# NOTE(review): test_df is not referenced by any of the cells shown here.
test_df = read_file_csv('./data/test.csv')

In [4]:
# b_r_u_df = book_rating_user_df
# Attach each rating to its book title and to the rating user's attributes.
# Inner joins drop ratings whose book_id or user_id has no match.
b_r_u_df = (
    books_df[['book_id', 'title']]
    .merge(ratings_df, on='book_id', how='inner')
    .merge(users_df, on='user_id', how='inner')
)
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [5]:
# One (book_id, title) row per book: keep the first row seen for each book_id.
title_df = (
    b_r_u_df
    .drop_duplicates(subset='book_id')
    .loc[:, ['book_id', 'title']]
)
title_df.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [6]:
# Number of ratings each book received, most-rated first.
nb_voters_data = b_r_u_df['book_id'].value_counts()
# Same data as a two-column frame (book_id, counts) so it can be merged on book_id.
nb_voters_df = nb_voters_data.rename_axis('book_id').reset_index(name='counts')
nb_voters_df.head()

Unnamed: 0,book_id,counts
0,1,1764
1,4,1747
2,5,1655
3,2,1634
4,26,1597


In [7]:
# Mean rating of each book, indexed by book_id.
rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
# Same data as a two-column frame (book_id, rating_mean) so it can be merged on book_id.
rating_mean_df = rating_mean_data.rename('rating_mean').reset_index()
rating_mean_df.head()

Unnamed: 0,book_id,rating_mean
0,1,4.238662
1,2,4.05814
2,3,3.376528
3,4,4.364625
4,5,3.748036


In [8]:
# Vote-count threshold: the 90th percentile of the per-book rating counts.
m = nb_voters_data.quantile(q=0.90)
print(m)

202.0


In [9]:
# TODO (ask Dubi): should the global mean be the mean of all individual ratings,
# or the mean of the per-book rating means? The two differ because, in the mean
# of means, a book with few ratings weighs as much as a heavily rated one.

# Alternative (mean of all individual ratings):
# rating_all_mean = b_r_u_df['rating'].mean()
rating_all_mean = rating_mean_df['rating_mean'].mean() # Currently used: mean of the per-book means.
print(rating_all_mean)

3.7395773123055873


In [10]:
# Per-book summary table: title, number of ratings and mean rating.
df = (
    title_df
    .merge(nb_voters_df, on='book_id', how='inner')
    .merge(rating_mean_df, on='book_id', how='inner')
)
df.head()

Unnamed: 0,book_id,title,counts,rating_mean
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662
1,4,To Kill a Mockingbird,1747,4.364625
2,5,The Great Gatsby,1655,3.748036
3,8,The Catcher in the Rye,1535,3.76873
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193


In [11]:
# Score every book by blending its own mean rating with the global mean.
df['weighted_average_rating'] = weighted_average_rating(
    counts_df=df['counts'],
    m=m,
    rating_mean_df=df['rating_mean'],
    rating_all_mean=rating_all_mean,
)
df.head()

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
2,5,The Great Gatsby,1655,3.748036,3.747116
3,8,The Catcher in the Rye,1535,3.76873,3.765339
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193,4.011436


In [12]:
# Ten best-scored books among those with more than m ratings.
result = get_top_k_recommendations(
    df=df,
    counts_df=df['counts'],
    k=10,
    columns=['weighted_average_rating'],
    m=m,
)
result.head()

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.338028
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
506,102,Where the Wild Things Are,612,4.449346,4.273212
370,85,The Giving Tree,815,4.364417,4.240309
364,50,Where the Sidewalk Ends,976,4.343238,4.239724


# Non-Personalized

## get_simply_recommendation

In [13]:
def get_simply_recommendation(books_df, title_df, ratings_df, users_df, k):
    """Return the ``k`` best books overall by weighted average rating.

    Ratings are joined to books and users (inner joins), then each book's mean
    rating is blended with the mean of all per-book means. Only books with more
    ratings than the 90th percentile of per-book rating counts are ranked.

    Args:
        books_df: Books table; must contain ``book_id`` and ``title``.
        title_df: One ``(book_id, title)`` row per book, used for the output.
        ratings_df: Ratings table with ``book_id``, ``user_id`` and ``rating``.
        users_df: Users table with ``user_id``.
        k: Number of books to return.

    Returns:
        DataFrame with the columns of ``title_df`` plus ``counts``,
        ``rating_mean`` and ``weighted_average_rating``, best book first.
    """
    rated_books = (
        books_df[['book_id', 'title']]
        .merge(ratings_df, on='book_id', how='inner')
        .merge(users_df, on='user_id', how='inner')
    )

    # Per-book number of ratings and per-book mean rating.
    votes_per_book = rated_books['book_id'].value_counts()
    mean_per_book = rated_books.groupby(['book_id'])['rating'].mean()

    # Vote threshold and the global mean the scores are pulled towards.
    vote_threshold = votes_per_book.quantile(0.90)
    global_mean = mean_per_book.mean()

    votes_table = votes_per_book.rename_axis('book_id').reset_index(name='counts')
    means_table = mean_per_book.rename('rating_mean').reset_index()

    scored = (
        title_df
        .merge(votes_table, on='book_id', how='inner')
        .merge(means_table, on='book_id', how='inner')
    )
    scored['weighted_average_rating'] = weighted_average_rating(
        scored['counts'], vote_threshold, scored['rating_mean'], global_mean
    )
    return get_top_k_recommendations(
        scored, scored['counts'], k, ['weighted_average_rating'], vote_threshold
    )

In [14]:
# Top 10 books across all users.
recommendation_df = get_simply_recommendation(
    books_df=books_df,
    title_df=title_df,
    ratings_df=ratings_df,
    users_df=users_df,
    k=10,
)
recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.338028
1,4,To Kill a Mockingbird,1747,4.364625,4.299843
506,102,Where the Wild Things Are,612,4.449346,4.273212
370,85,The Giving Tree,815,4.364417,4.240309
364,50,Where the Sidewalk Ends,976,4.343238,4.239724
8,31,The Help,1265,4.318577,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",557,4.396768,4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,1394,4.28264,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",533,4.348968,4.181489


## get_simply_place_recommendation

In [15]:
def get_simply_place_recommendation(books_df, title_df, ratings_df, users_df, place, k):
    """Return the ``k`` best books by weighted average rating for one location.

    Works like the overall ranking, but every statistic (rating counts, mean
    ratings, the 90th-percentile vote threshold and the global mean) is computed
    only from ratings given by users whose ``location`` equals ``place``.

    Args:
        books_df: Books table; must contain ``book_id`` and ``title``.
        title_df: One ``(book_id, title)`` row per book, used for the output.
        ratings_df: Ratings table with ``book_id``, ``user_id`` and ``rating``.
        users_df: Users table with ``user_id`` and ``location``.
        place: Location value to match exactly.
        k: Number of books to return.

    Returns:
        DataFrame with the columns of ``title_df`` plus ``counts``,
        ``rating_mean`` and ``weighted_average_rating``, best book first.
    """
    rated_books = (
        books_df[['book_id', 'title']]
        .merge(ratings_df, on='book_id', how='inner')
        .merge(users_df, on='user_id', how='inner')
    )

    # Restrict to ratings from users in the requested location.
    in_place = rated_books['location'] == place
    local_ratings = rated_books[in_place]

    # Per-book number of ratings and per-book mean rating within the location.
    votes_per_book = local_ratings['book_id'].value_counts()
    mean_per_book = local_ratings.groupby(['book_id'])['rating'].mean()

    # Vote threshold and the global mean the scores are pulled towards.
    vote_threshold = votes_per_book.quantile(0.90)
    global_mean = mean_per_book.mean()

    votes_table = votes_per_book.rename_axis('book_id').reset_index(name='counts')
    means_table = mean_per_book.rename('rating_mean').reset_index()

    scored = (
        title_df
        .merge(votes_table, on='book_id', how='inner')
        .merge(means_table, on='book_id', how='inner')
    )
    scored['weighted_average_rating'] = weighted_average_rating(
        scored['counts'], vote_threshold, scored['rating_mean'], global_mean
    )
    return get_top_k_recommendations(
        scored, scored['counts'], k, ['weighted_average_rating'], vote_threshold
    )

In [16]:
# Top 10 books among users located in Ohio.
place_recommendation_df = get_simply_place_recommendation(
    books_df=books_df,
    title_df=title_df,
    ratings_df=ratings_df,
    users_df=users_df,
    place='Ohio',
    k=10,
)
place_recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
310,126,Dune (Dune Chronicles #1),12,4.75,4.367963
602,143,All the Light We Cannot See,15,4.6,4.317786
356,144,"Unbroken: A World War II Story of Survival, Re...",8,4.75,4.266087
190,24,Harry Potter and the Goblet of Fire (Harry Pot...,25,4.4,4.249728
424,102,Where the Wild Things Are,15,4.466667,4.226877
884,119,The Handmaid's Tale,13,4.461538,4.199565
800,89,The Princess Bride,14,4.428571,4.190062
319,191,Watchmen,9,4.555556,4.186956
888,321,Where the Red Fern Grows,9,4.555556,4.186956
306,85,The Giving Tree,21,4.333333,4.178261


## get_simply_age_recommendation

In [17]:
def get_simply_age_recommendation(books_df, title_df, ratings_df, users_df, age, k):
    """Return the ``k`` best books by weighted average rating for an age bracket.

    Works like the overall ranking, but every statistic (rating counts, mean
    ratings, the 90th-percentile vote threshold and the global mean) is computed
    only from ratings given by users whose age falls in the same bracket as
    ``age``, as returned by ``get_age_range`` (both bounds inclusive).

    Args:
        books_df: Books table; must contain ``book_id`` and ``title``.
        title_df: One ``(book_id, title)`` row per book, used for the output.
        ratings_df: Ratings table with ``book_id``, ``user_id`` and ``rating``.
        users_df: Users table with ``user_id`` and ``age``.
        age: Age whose bracket selects the users to consider.
        k: Number of books to return.

    Returns:
        DataFrame with the columns of ``title_df`` plus ``counts``,
        ``rating_mean`` and ``weighted_average_rating``, best book first,
        containing at most ``k`` rows.
    """
    # b_r_u_df = book_rating_user_df
    b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
    b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')

    # Keep only ratings from users inside the age bracket.
    lower_bound, upper_bound = get_age_range(age)
    b_r_u_age_df = b_r_u_df[(b_r_u_df['age'] >= lower_bound) & (b_r_u_df['age'] <= upper_bound)]

    # Dataframe that contains distribution of votes by book ID.
    nb_voters_data = b_r_u_age_df['book_id'].value_counts()
    nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})

    # Dataframe that contains distribution of rate averages by book ID.
    rating_mean_data = b_r_u_age_df.groupby(['book_id'])['rating'].mean()
    rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})

    # Vote threshold and the global mean the scores are pulled towards.
    m = nb_voters_data.quantile(0.90)
    rating_all_mean = rating_mean_df['rating_mean'].mean()

    df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
    # Honour the caller's k; this used to be a hard-coded 10, so k was ignored.
    return get_top_k_recommendations(df, df['counts'], k, ['weighted_average_rating'], m)

In [18]:
# Top 10 books among users in the same age bracket as a 28-year-old.
age_recommendation_df = get_simply_age_recommendation(
    books_df=books_df,
    title_df=title_df,
    ratings_df=ratings_df,
    users_df=users_df,
    age=28,
    k=10,
)
age_recommendation_df.head(10)

Unnamed: 0,book_id,title,counts,rating_mean,weighted_average_rating
502,25,Harry Potter and the Deathly Hallows (Harry Po...,186,4.413978,4.326251
1,4,To Kill a Mockingbird,216,4.365741,4.294203
368,85,The Giving Tree,99,4.444444,4.289614
948,89,The Princess Bride,69,4.449275,4.244702
659,133,"Anne of Green Gables (Anne of Green Gables, #1)",73,4.410959,4.224914
362,50,Where the Sidewalk Ends,137,4.313869,4.216411
504,102,Where the Wild Things Are,78,4.371795,4.20468
418,70,"Ender's Game (Ender's Saga, #1)",93,4.344086,4.204095
8,31,The Help,143,4.293706,4.202891
209,21,Harry Potter and the Order of the Phoenix (Har...,169,4.272189,4.196385


# Collaborative Filtering

In [52]:
# One row per goodreads_book_id holding all of its tag names joined by spaces.
df2 = (
    books_tags_df
    .merge(tags_df, on='tag_id', how='inner')
    .sort_values(by='goodreads_book_id')
    .groupby(['goodreads_book_id'], as_index=False)
    .agg({'tag_name': ' '.join})
)
df2.head()

Unnamed: 0,goodreads_book_id,tag_name
0,1,to-read to-re-read read-in-2014 to-buy favouri...
1,2,british fantasia female-author childrens-lit c...
2,3,re-read i-own bookshelf all-time-favorites own...
3,5,classics supernatural kids re-reads children-s...
4,6,read-in-2014 fiction rory-gilmore-reading-chal...


# Content-Based Filtering