In [1]:
import pandas as pd
import seaborn as sns

In [2]:
def read_file_csv(file_name):
    """Load a CSV file into a pandas DataFrame.

    The file is decoded as ISO-8859-1 and parsed with ``low_memory=False``,
    i.e. pandas processes the file in one pass instead of in internal chunks.

    Parameters
    ----------
    file_name : path or file-like accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
    """
    csv_options = {"low_memory": False, "encoding": "ISO-8859-1"}
    return pd.read_csv(file_name, **csv_options)

def weighted_average_rating(counts, m, mean_rating, rating_all_mean):
    """Blend an item's own mean rating with a global mean, weighted by vote count.

    Computes ``counts/(counts+m) * mean_rating + m/(counts+m) * rating_all_mean``.
    The more votes an item has relative to ``m``, the closer the result is to
    ``mean_rating``; with few votes it is pulled towards ``rating_all_mean``.

    Parameters
    ----------
    counts : number of votes (scalar or element-wise container such as a Series).
    m : vote-count weight given to the global mean.
    mean_rating : the item's own mean rating (same shape as ``counts``).
    rating_all_mean : mean rating used as the prior.

    Returns
    -------
    The weighted rating, with the same shape as the inputs.
    """
    total_weight = counts + m
    own_share = counts / total_weight
    prior_share = m / total_weight
    return own_share * mean_rating + prior_share * rating_all_mean

def get_top_k_recommendations(df, counts_df, k, columns, m):
    """Return the ``k`` highest-ranked rows among those with more than ``m`` votes.

    Parameters
    ----------
    df : DataFrame to rank.
    counts_df : per-row vote counts, aligned with ``df``.
    k : maximum number of rows to return.
    columns : column name(s) passed to ``nlargest`` to order the rows.
    m : vote threshold; only rows with ``counts_df > m`` (strictly) are kept.

    Returns
    -------
    pandas.DataFrame with at most ``k`` rows, largest ``columns`` values first.
    """
    has_enough_votes = counts_df > m
    eligible = df[has_enough_votes]
    return eligible.nlargest(k, columns)

In [3]:
# Load every input table from ./data through the shared CSV reader.
books_df = read_file_csv(file_name='./data/books.csv')
books_tags_df = read_file_csv(file_name='./data/books_tags.csv')
users_df = read_file_csv(file_name='./data/users.csv')
ratings_df = read_file_csv(file_name='./data/ratings.csv')
tags_df = read_file_csv(file_name='./data/tags.csv')
test_df = read_file_csv(file_name='./data/test.csv')

In [4]:
# b_r_u_df = book_rating_user_df
# Attach each rating to its book title, then to the row of the user who gave it.
b_r_u_df = (
    books_df[['book_id', 'title']]
    .merge(ratings_df, on='book_id', how='inner')
    .merge(users_df, on='user_id', how='inner')
)
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [5]:
# One (book_id, title) row per rated book.
title_df = b_r_u_df.loc[:, ['book_id', 'title']].drop_duplicates(subset=['book_id'])
title_df.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [6]:
# Number of ratings received by each book, as a Series and as a two-column frame.
nb_voters_data = b_r_u_df['book_id'].value_counts()
nb_voters_df = pd.DataFrame(
    data={
        'book_id': nb_voters_data.index.tolist(),
        'counts': nb_voters_data.values.tolist(),
    }
)
nb_voters_df.head()

Unnamed: 0,book_id,counts
0,1,1764
1,4,1747
2,5,1655
3,2,1634
4,26,1597


In [7]:
# Mean rating of each book, as a Series and as a two-column frame.
rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
rating_mean_df = pd.DataFrame(
    data={
        'book_id': rating_mean_data.index.tolist(),
        'mean_rating': rating_mean_data.values.tolist(),
    }
)
rating_mean_df.head()

Unnamed: 0,book_id,mean_rating
0,1,4.238662
1,2,4.05814
2,3,3.376528
3,4,4.364625
4,5,3.748036


In [8]:
# m: minimum number of ratings a book needs to enter the list,
# set to the 90th percentile of the per-book rating counts.
m = nb_voters_data.quantile(q=0.90)
print(m)

202.0


In [9]:
# C: mean of every individual rating, all books together.
rating_all_mean = b_r_u_df.loc[:, 'rating'].mean()
print(rating_all_mean)

3.779077570255898


In [10]:
# One row per book: title, number of ratings, mean rating.
df = (
    title_df
    .merge(nb_voters_df, on='book_id', how='inner')
    .merge(rating_mean_df, on='book_id', how='inner')
)
df.head()

Unnamed: 0,book_id,title,counts,mean_rating
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662
1,4,To Kill a Mockingbird,1747,4.364625
2,5,The Great Gatsby,1655,3.748036
3,8,The Catcher in the Rye,1535,3.76873
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193


In [11]:
# Score every book with the vote-weighted blend of its own mean and the global mean.
df['weighted_average_rating'] = weighted_average_rating(
    counts=df['counts'],
    m=m,
    mean_rating=df['mean_rating'],
    rating_all_mean=rating_all_mean,
)
df.head()

Unnamed: 0,book_id,title,counts,mean_rating,weighted_average_rating
0,1,"The Hunger Games (The Hunger Games, #1)",1764,4.238662,4.191441
1,4,To Kill a Mockingbird,1747,4.364625,4.303937
2,5,The Great Gatsby,1655,3.748036,3.751413
3,8,The Catcher in the Rye,1535,3.76873,3.769933
4,17,"Catching Fire (The Hunger Games, #2)",1494,4.048193,4.01614


In [12]:
# Ten best-scored books among those with more than m ratings.
result = get_top_k_recommendations(
    df=df,
    counts_df=df['counts'],
    k=10,
    columns=['weighted_average_rating'],
    m=m,
)
result.head()

Unnamed: 0,book_id,title,counts,mean_rating,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,1444,4.421745,4.342876
1,4,To Kill a Mockingbird,1747,4.364625,4.303937
506,102,Where the Wild Things Are,612,4.449346,4.283014
370,85,The Giving Tree,815,4.364417,4.248155
364,50,Where the Sidewalk Ends,976,4.343238,4.246497


# get_simply_recommendation

In [13]:
def get_simply_recommendation(k):
    """Return the ``k`` best books over all users, ranked by weighted average rating.

    Non-personalised counterpart of the place/age recommenders: same ranking
    as the top-10 ``result`` cell, with the list size exposed as ``k``.

    NOTE(review): the signature only takes ``k``, so this reads the
    notebook-level ``df`` (per-book table with ``counts`` and
    ``weighted_average_rating`` columns) and the vote threshold ``m``.
    The cells that build them must have been run first, and neither name
    may have been rebound since — confirm this is the intended data source.

    Parameters
    ----------
    k : maximum number of books to return.

    Returns
    -------
    pandas.DataFrame with at most ``k`` rows of ``df``, best score first,
    restricted to books whose ``counts`` is strictly greater than ``m``.
    """
    return get_top_k_recommendations(df, df['counts'], k, ['weighted_average_rating'], m)

# get_simply_place_recommendation

In [14]:
def get_simply_place_recommendation(books_df, title_df, ratings_df, users_df, place, k):
    """Return the ``k`` best books according to users located in ``place``.

    Only ratings whose user has ``location == place`` are considered. Each
    book gets a vote count and a mean rating from those ratings; the vote
    threshold is the 90th percentile of the per-book counts and the prior is
    the mean of all ratings from that place.

    Parameters
    ----------
    books_df : frame with at least ``book_id`` and ``title``.
    title_df : frame keyed by ``book_id`` that supplies the output's leading columns.
    ratings_df : frame with ``book_id``, ``user_id`` and ``rating``.
    users_df : frame with ``user_id`` and ``location``.
    place : value compared against the ``location`` column.
    k : maximum number of books to return.

    Returns
    -------
    pandas.DataFrame: ``title_df`` columns plus ``counts``, ``mean_rating``
    and ``weighted_average_rating``, best score first.
    """
    # Ratings joined to their book title, then to the rating user's profile.
    rated_books = books_df[['book_id', 'title']].merge(ratings_df, on='book_id', how='inner')
    rated_books = rated_books.merge(users_df, on='user_id', how='inner')

    place_ratings = rated_books[rated_books['location'] == place]

    votes_per_book = place_ratings['book_id'].value_counts()
    votes_df = pd.DataFrame(data={
        'book_id': votes_per_book.index.tolist(),
        'counts': votes_per_book.values.tolist(),
    })

    mean_per_book = place_ratings.groupby(['book_id'])['rating'].mean()
    means_df = pd.DataFrame(data={
        'book_id': mean_per_book.index.tolist(),
        'mean_rating': mean_per_book.values.tolist(),
    })

    min_votes = votes_per_book.quantile(0.90)
    place_mean = place_ratings['rating'].mean()

    scored = pd.merge(title_df, votes_df, on='book_id', how='inner')
    scored = scored.merge(means_df, on='book_id', how='inner')
    scored['weighted_average_rating'] = weighted_average_rating(
        scored['counts'], min_votes, scored['mean_rating'], place_mean
    )

    return get_top_k_recommendations(
        scored, scored['counts'], k, ['weighted_average_rating'], min_votes
    )

In [15]:
# Top 10 books according to users located in Ohio.
place_recommendation_df = get_simply_place_recommendation(
    books_df=books_df,
    title_df=title_df,
    ratings_df=ratings_df,
    users_df=users_df,
    place='Ohio',
    k=10,
)
place_recommendation_df.head()

Unnamed: 0,book_id,title,counts,mean_rating,weighted_average_rating
310,126,Dune (Dune Chronicles #1),12,4.75,4.379659
602,143,All the Light We Cannot See,15,4.6,4.327888
356,144,"Unbroken: A World War II Story of Survival, Re...",8,4.75,4.280902
190,24,Harry Potter and the Goblet of Fire (Harry Pot...,25,4.4,4.256673
424,102,Where the Wild Things Are,15,4.466667,4.236978


# get_simply_age_recommendation

In [16]:
def get_simply_age_recommendation(books_df, title_df, ratings_df, users_df, age, k):
    """Return the ``k`` best books according to users whose age equals ``age``.

    Only ratings whose user has ``age == age`` are considered. Each book gets
    a vote count and a mean rating from those ratings; the vote threshold is
    the 90th percentile of the per-book counts and the prior is the mean of
    all ratings from that age.

    Parameters
    ----------
    books_df : frame with at least ``book_id`` and ``title``.
    title_df : frame keyed by ``book_id`` that supplies the output's leading columns.
    ratings_df : frame with ``book_id``, ``user_id`` and ``rating``.
    users_df : frame with ``user_id`` and ``age``.
    age : value compared (by equality) against the ``age`` column.
    k : maximum number of books to return.

    Returns
    -------
    pandas.DataFrame: ``title_df`` columns plus ``counts``, ``mean_rating``
    and ``weighted_average_rating``, best score first, at most ``k`` rows.
    """
    # b_r_u_df = book_rating_user_df
    b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
    b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')

    ratings_age_df = b_r_u_df[b_r_u_df['age'] == age]

    # Number of ratings per book within this age group.
    nb_voters_data = ratings_age_df['book_id'].value_counts()
    nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})

    # Mean rating per book within this age group.
    rating_mean_data = ratings_age_df.groupby(['book_id'])['rating'].mean()
    rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'mean_rating': rating_mean_data.values.tolist()})

    # Minimum number of ratings required to be listed.
    m = nb_voters_data.quantile(0.90)

    rating_all_mean = ratings_age_df['rating'].mean()

    df = pd.merge(title_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['mean_rating'], rating_all_mean)
    # Honour the caller's k; the list size used to be hard-coded to 10 here.
    return get_top_k_recommendations(df, df['counts'], k, ['weighted_average_rating'], m)

In [17]:
# Top 10 books according to users aged 20.
age_recommendation_df = get_simply_age_recommendation(
    books_df=books_df,
    title_df=title_df,
    ratings_df=ratings_df,
    users_df=users_df,
    age=20,
    k=10,
)
age_recommendation_df.head()

Unnamed: 0,book_id,title,counts,mean_rating,weighted_average_rating
327,70,"Ender's Game (Ender's Saga, #1)",6,5.0,4.42801
1,4,To Kill a Mockingbird,14,4.642857,4.40569
85,113,Catch-22,8,4.75,4.362162
339,250,Wonder,7,4.714286,4.309009
8,31,The Help,13,4.461538,4.261562
