In [49]:
import math
import heapq

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

In [50]:
def read_file_csv(file_name):
    return pd.read_csv(file_name, encoding="ISO-8859-1")


def weighted_average_rating(counts_df, m, rating_mean_df, rating_all_mean):
    return (counts_df / (counts_df + m)) * rating_mean_df + (m / (counts_df + m)) * rating_all_mean


def get_top_k_recommendations(df, counts_df, k, columns, m):
    return df[counts_df >= m].nlargest(k, columns)


def get_age_range(age):
    if age % 10 == 0:
        age -= 1

    lower_bound = age - ((age % 10) - 1)
    upper_bound = age + (10 - (age % 10))

    return lower_bound, upper_bound

In [51]:
books_df = read_file_csv('./data/books.csv')
books_tags_df = read_file_csv('./data/books_tags.csv')

users_df = read_file_csv('./data/users.csv')
ratings_df = read_file_csv('./data/ratings.csv')
tags_df = read_file_csv('./data/tags.csv')

test_df = read_file_csv('./data/test.csv')

In [52]:
global b_r_u_df
b_r_u_df = pd.merge(books_df[['book_id', 'title']], ratings_df, on='book_id', how='inner')
b_r_u_df = pd.merge(b_r_u_df, users_df, on='user_id', how='inner')
b_r_u_df.head()

Unnamed: 0,book_id,title,user_id,rating,location,age
0,1,"The Hunger Games (The Hunger Games, #1)",3634,4,District of Columbia,92
1,4,To Kill a Mockingbird,3634,5,District of Columbia,92
2,5,The Great Gatsby,3634,3,District of Columbia,92
3,8,The Catcher in the Rye,3634,5,District of Columbia,92
4,17,"Catching Fire (The Hunger Games, #2)",3634,4,District of Columbia,92


In [53]:
global items_df
items_df = b_r_u_df[['book_id', 'title']].drop_duplicates(subset='book_id')
items_df.head()

Unnamed: 0,book_id,title
0,1,"The Hunger Games (The Hunger Games, #1)"
1,4,To Kill a Mockingbird
2,5,The Great Gatsby
3,8,The Catcher in the Rye
4,17,"Catching Fire (The Hunger Games, #2)"


In [54]:
global ratings_df
ratings_df = b_r_u_df[['user_id', 'book_id', 'rating']]
ratings_df.head()

Unnamed: 0,user_id,book_id,rating
0,3634,1,4
1,3634,4,5
2,3634,5,3
3,3634,8,5
4,3634,17,4


# Non-Personalized

In [55]:
def merge_tables(b_r_u_df):
    # Dataframe that contains distribution of votes by book ID.
    nb_voters_data = b_r_u_df['book_id'].value_counts()
    nb_voters_df = pd.DataFrame(data={'book_id': nb_voters_data.index.tolist(), 'counts': nb_voters_data.values.tolist()})

    # Dataframe that contains distribution of rate averages by book ID.
    rating_mean_data = b_r_u_df.groupby(['book_id'])['rating'].mean()
    rating_mean_df = pd.DataFrame(data={'book_id': rating_mean_data.index.tolist(), 'rating_mean': rating_mean_data.values.tolist()})
    
    return nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data


def get_voters_and_means(nb_voters_data, rating_mean_df):
    m = nb_voters_data.quantile(0.90)
    rating_all_mean = rating_mean_df['rating_mean'].mean()
    return m, rating_all_mean


def calculate_WAR_and_top_k(df, m, rating_all_mean, k, columns):
    df['weighted_average_rating'] = weighted_average_rating(df['counts'], m, df['rating_mean'], rating_all_mean)
    return get_top_k_recommendations(df, df['counts'], k, columns, m)

## get_simply_recommendation

In [56]:
def get_simply_recommendation(k):
    global b_r_u_df
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])


recommendation_df = get_simply_recommendation(10)
recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
504,25,Harry Potter and the Deathly Hallows (Harry Po...,4.338028
1,4,To Kill a Mockingbird,4.299843
506,102,Where the Wild Things Are,4.273212
370,85,The Giving Tree,4.240309
364,50,Where the Sidewalk Ends,4.239724
8,31,The Help,4.238851
425,144,"Unbroken: A World War II Story of Survival, Re...",4.221864
212,27,Harry Potter and the Half-Blood Prince (Harry ...,4.213906
0,1,"The Hunger Games (The Hunger Games, #1)",4.187383
663,133,"Anne of Green Gables (Anne of Green Gables, #1)",4.181489


## get_simply_place_recommendation

In [57]:
def get_simply_place_recommendation(place, k):
    global b_r_u_df
    
    b_r_u_place_df = b_r_u_df[b_r_u_df['location'] == place]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_place_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])


place_recommendation_df = get_simply_place_recommendation('Ohio', 10)
place_recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
310,126,Dune (Dune Chronicles #1),4.367963
602,143,All the Light We Cannot See,4.317786
356,144,"Unbroken: A World War II Story of Survival, Re...",4.266087
190,24,Harry Potter and the Goblet of Fire (Harry Pot...,4.249728
424,102,Where the Wild Things Are,4.226877
889,490,Maus I: A Survivor's Tale: My Father Bleeds Hi...,4.213664
1191,1462,The Orphan Master's Son,4.213664
1554,983,Between the World and Me,4.213664
884,119,The Handmaid's Tale,4.199565
800,89,The Princess Bride,4.190062


## get_simply_age_recommendation

In [58]:
def get_simply_age_recommendation(age, k):
    global b_r_u_df
    
    lower_bound, upper_bound = get_age_range(age)
    b_r_u_age_df = b_r_u_df[(b_r_u_df['age'] >= lower_bound) & (b_r_u_df['age'] <= upper_bound)]
    
    nb_voters_df, nb_voters_data, rating_mean_df, rating_mean_data = merge_tables(b_r_u_age_df)
    m, rating_all_mean = get_voters_and_means(nb_voters_data, rating_mean_df)
    
    df = pd.merge(items_df, nb_voters_df, on='book_id', how='inner')
    df = pd.merge(df, rating_mean_df, on='book_id', how='inner')

    return calculate_WAR_and_top_k(df, m, rating_all_mean, k, ['weighted_average_rating'])


age_recommendation_df = get_simply_age_recommendation(28, 10)
age_recommendation_df[['book_id','title','weighted_average_rating']].head(10)

Unnamed: 0,book_id,title,weighted_average_rating
502,25,Harry Potter and the Deathly Hallows (Harry Po...,4.326251
1,4,To Kill a Mockingbird,4.294203
368,85,The Giving Tree,4.289614
948,89,The Princess Bride,4.244702
659,133,"Anne of Green Gables (Anne of Green Gables, #1)",4.224914
362,50,Where the Sidewalk Ends,4.216411
504,102,Where the Wild Things Are,4.20468
418,70,"Ender's Game (Ender's Saga, #1)",4.204095
8,31,The Help,4.202891
209,21,Harry Potter and the Order of the Phoenix (Har...,4.196385


# Collaborative Filtering

In [14]:
unique_users = ratings_df['user_id'].unique()
unique_items = ratings_df['book_id'].unique()
n_users = unique_users.shape[0]
n_items = unique_items.shape[0]

# Working on user data.
unique_users.sort()
users_original_to_new = {original_index: new_index for original_index, new_index in zip(unique_users, range(n_users))}
users_new_to_original = {value: key for key, value in users_original_to_new.items()}

# Working on items data.
unique_items.sort()
items_original_to_new = {original_index: new_index for original_index, new_index in zip(unique_items, range(n_items))}
items_new_to_original = {value: key for key, value in items_original_to_new.items()}

In [15]:
data_matrix = np.empty((n_users, n_items))
data_matrix[:] = np.nan
for line in ratings_df.itertuples():
    user = users_original_to_new[line[1]]
    book = items_original_to_new[line[2]]
    rating = line[3]
    data_matrix[user, book] = rating


mean_user_rating = np.nanmean(data_matrix, axis=1).reshape(-1, 1)

ratings_diff = (data_matrix - mean_user_rating)
ratings_diff[np.isnan(ratings_diff)] = 0

In [16]:
def keep_top_k(array, k):
    smallest = heapq.nlargest(k, array)[-1]
    array[array < smallest] = 0
    return array


user_similarity = 1-pairwise_distances(ratings_diff, metric='cosine')

k = 10
user_similarity = np.array([keep_top_k(np.array(arr), k) for arr in user_similarity])
pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

In [17]:
def get_recommendations(predicted_ratings_row, data_matrix_row, items, k=5):
    predicted_ratings_unrated = predicted_ratings_row.copy()
    predicted_ratings_unrated[~np.isnan(data_matrix_row)] = 0

    idx = np.argsort(-predicted_ratings_unrated)
    sim_scores = idx[0:k]

    books_original_indexes_df = pd.DataFrame(data={'book_id': [items_new_to_original[index] for index in sim_scores]})
    return pd.merge(books_original_indexes_df, items, on='book_id', how='inner')


def get_CF_recommendation(user_id, k):
    user_id = user_id - 1
    predicted_ratings_row = pred[user_id]
    data_matrix_row = data_matrix[user_id]

    return get_recommendations(predicted_ratings_row, data_matrix_row, items_df, k=k)

In [18]:
recommendations_by_user = get_CF_recommendation(user_id=1, k=10)
recommendations_by_user.head(10)

Unnamed: 0,book_id,title
0,101,Me Talk Pretty One Day
1,775,Just Kids
2,264,The Sun Also Rises
3,289,"Watership Down (Watership Down, #1)"
4,335,James and the Giant Peach
5,1084,To the Lighthouse
6,468,Their Eyes Were Watching God
7,184,Matilda
8,83,A Tale of Two Cities
9,344,Naked


# Contact Based Filtering

In [44]:
bookreads_tags_df = pd.merge(books_tags_df, tags_df, on='tag_id', how='inner')

groupped_data = bookreads_tags_df.groupby('goodreads_book_id', as_index=False)['tag_name'].transform(lambda x: ' '.join(x))
books_tags_row_df = pd.DataFrame(data={'book_id': groupped_data.index.tolist(), 'tag_name': groupped_data['tag_name'].values.tolist()})

b_t_df = pd.merge(books_df[['book_id', 'title', 'language_code', 'authors', 'original_title']], books_tags_row_df, on='book_id', how='outer')
b_t_df['tag_name'] = b_t_df['tag_name'].fillna('')

b_t_df.head()

Unnamed: 0,book_id,title,language_code,authors,original_title,tag_name
0,1,"The Hunger Games (The Hunger Games, #1)",eng,Suzanne Collins,The Hunger Games,to-read fantasy favorites currently-reading yo...
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,eng,"J.K. Rowling, Mary GrandPrÃÂÃÂ©",Harry Potter and the Philosopher's Stone,to-read fantasy favorites currently-reading yo...
2,3,"Twilight (Twilight, #1)",en-US,Stephenie Meyer,Twilight,to-read fantasy favorites currently-reading yo...
3,4,To Kill a Mockingbird,eng,Harper Lee,To Kill a Mockingbird,to-read fantasy young-adult fiction harry-pott...
4,5,The Great Gatsby,eng,F. Scott Fitzgerald,The Great Gatsby,to-read fantasy favorites currently-reading yo...


In [45]:
def clean_data(x):
    x = str.lower(str(x))
    return x.replace('-', '')


features = ['language_code', 'tag_name']
for feature in features:
    b_t_df[feature] = b_t_df[feature].apply(clean_data)

b_t_df.head()

Unnamed: 0,book_id,title,language_code,authors,original_title,tag_name
0,1,"The Hunger Games (The Hunger Games, #1)",eng,Suzanne Collins,The Hunger Games,toread fantasy favorites currentlyreading youn...
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,eng,"J.K. Rowling, Mary GrandPrÃÂÃÂ©",Harry Potter and the Philosopher's Stone,toread fantasy favorites currentlyreading youn...
2,3,"Twilight (Twilight, #1)",enus,Stephenie Meyer,Twilight,toread fantasy favorites currentlyreading youn...
3,4,To Kill a Mockingbird,eng,Harper Lee,To Kill a Mockingbird,toread fantasy youngadult fiction harrypotter ...
4,5,The Great Gatsby,eng,F. Scott Fitzgerald,The Great Gatsby,toread fantasy favorites currentlyreading youn...


In [46]:
def create_soup(x):
    return x['language_code'] + ' ' + x['tag_name']

b_t_df['soup'] = b_t_df.apply(create_soup, axis=1)
b_t_df.drop(['language_code', 'tag_name'], axis=1, inplace=True)

b_t_df.head()

Unnamed: 0,book_id,title,authors,original_title,soup
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,The Hunger Games,eng toread fantasy favorites currentlyreading ...
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPrÃÂÃÂ©",Harry Potter and the Philosopher's Stone,eng toread fantasy favorites currentlyreading ...
2,3,"Twilight (Twilight, #1)",Stephenie Meyer,Twilight,enus toread fantasy favorites currentlyreading...
3,4,To Kill a Mockingbird,Harper Lee,To Kill a Mockingbird,eng toread fantasy youngadult fiction harrypot...
4,5,The Great Gatsby,F. Scott Fitzgerald,The Great Gatsby,eng toread fantasy favorites currentlyreading ...


In [47]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(b_t_df['soup'])
print(count_matrix.shape)

(39000, 4383)


In [48]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

b_t_df = b_t_df.reset_index()
indices = pd.Series(b_t_df.index, index=b_t_df['title'])
print(indices[:10])

title
The Hunger Games (The Hunger Games, #1)                     0
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)    1
Twilight (Twilight, #1)                                     2
To Kill a Mockingbird                                       3
The Great Gatsby                                            4
The Fault in Our Stars                                      5
The Hobbit                                                  6
The Catcher in the Rye                                      7
Angels & Demons  (Robert Langdon, #1)                       8
Pride and Prejudice                                         9
dtype: int64


In [37]:
def get_CBF_recommendations(book_name, cosine_sim=cosine_sim):
    idx = indices[book_name]

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:k+1]

    book_indices = [i[0] for i in sim_scores]
    return b_t_df['title'].iloc[book_indices]

In [38]:
print(get_CBF_recommendations('The Hunger Games (The Hunger Games, #1)'))
print('\n\n')
print(get_CBF_recommendations('Twilight (Twilight, #1)'))

390                                             The Lorax
895                       Rendezvous with Rama (Rama, #1)
1364    The Once and Future King (The Once and Future ...
2086                                            Identical
2464                 His Majesty's Dragon (Temeraire, #1)
2872                                 Harvesting the Heart
3784     Alice's Adventures in Wonderland & Other Stories
3939                                          Deliverance
512     The Hiding Place: The Triumphant True Story of...
1274                            Unearthly (Unearthly, #1)
Name: title, dtype: object



1276                 Knife of Dreams (Wheel of Time, #11)
2089                      The Sneetches and Other Stories
3295                        Birthmarked (Birthmarked, #1)
392                 Throne of Glass (Throne of Glass, #1)
514                                   The Prince of Tides
897                                  The Age of Innocence
1366                                      

# Section 4 - Questions

In [26]:
high_rate_test_df = test_df[test_df['rating'] >= 4]
user_value_counts = high_rate_test_df['user_id'].value_counts()
user_value_counts_df = pd.DataFrame(data={'user_id': user_value_counts.index.tolist(), 'appearances': user_value_counts.values.tolist()})
user_value_counts_df = user_value_counts_df[user_value_counts_df['appearances'] >= 10]

user_value_counts_df.head()

Unnamed: 0,user_id,appearances
0,3017,13
1,2384,13
2,20,12
3,1169,12
4,290,12


In [27]:
def get_CF_recommendation_container(user_id, k, pred, data_matrix):
    user_id = user_id - 1
    predicted_ratings_row = pred[user_id]
    data_matrix_row = data_matrix[user_id]

    return get_recommendations(predicted_ratings_row, data_matrix_row, items_df, k=k)

In [28]:
def precision_k(k):
    for sim in ['cosine', 'euclidean', 'jaccard']:
        calculations_list = []

        user_similarity = 1-pairwise_distances(ratings_diff, metric=sim)
        user_similarity = np.array([keep_top_k(np.array(arr), k) for arr in user_similarity])
        pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

        for user_idx in user_value_counts_df['user_id'].values:
            user_recommendations = get_CF_recommendation_container(user_idx, k, pred, data_matrix)

            counter = 0
            for book_idx in user_recommendations['book_id'].values:
                chosen_book = high_rate_test_df[(high_rate_test_df['user_id'] == user_idx) & (high_rate_test_df['book_id'] == book_idx)]
                if chosen_book.shape[0] == 1:
                    counter += 1

            calculations_list.append(counter / k)

        result = sum(calculations_list) / user_value_counts_df.shape[0]
        print(f'Precision with {sim} similarly is {result}.')


precision_k(10)

Precision with cosine similarly is 0.08.
Precision with euclidean similarly is 0.008.




Precision with jaccard similarly is 0.08.


In [29]:
def ARHA(k):
    for sim in ['cosine', 'euclidean', 'jaccard']:
        calculations_list = []

        user_similarity = 1-pairwise_distances(ratings_diff, metric=sim)
        user_similarity = np.array([keep_top_k(np.array(arr), k) for arr in user_similarity])
        pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

        for user_idx in user_value_counts_df['user_id'].values:
            user_recommendations = get_CF_recommendation_container(user_idx, k, pred, data_matrix)

            user_high_rate_df = high_rate_test_df[high_rate_test_df['user_id'] == user_idx]
            user_rec_merged_df = pd.merge(user_recommendations, user_high_rate_df, on='book_id', how='inner')

            for position in user_rec_merged_df.index + 1:
                calculations_list.append(1 / position)

        result = sum(calculations_list) / user_value_counts_df.shape[0]
        print(f'ARHA with {sim} similarly is {result}.')


ARHA(10)

ARHA with cosine similarly is 0.6466666666666666.
ARHA with euclidean similarly is 0.08.




ARHA with jaccard similarly is 0.6266666666666666.


In [30]:
def get_recommendations_RMSE(user_id, pred, data_matrix):
    user_id = user_id - 1
    predicted_ratings_row = pred[user_id]
    data_matrix_row = data_matrix[user_id]

    predicted_ratings_unrated = predicted_ratings_row.copy()
    predicted_ratings_unrated[~np.isnan(data_matrix_row)] = 0

    book_ids = np.argsort(-predicted_ratings_unrated)
    books_rating = np.sort(predicted_ratings_unrated)[::-1]

    return {idx: rating for idx, rating in zip(book_ids, books_rating)}

def RMSE():
    for sim in ['cosine', 'euclidean', 'jaccard']:
        sum_error = 0
        count_lines = 0

        user_similarity = 1-pairwise_distances(ratings_diff, metric=sim)
        #user_similarity = np.array([keep_top_k(np.array(arr), k) for arr in user_similarity])
        pred = mean_user_rating + user_similarity.dot(ratings_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

        for user_id, test_user_data in test_df.groupby('user_id'):
            user_recommendations = get_recommendations_RMSE(user_id, pred, data_matrix)
            for row in test_user_data.itertuples(index=False):
                _, test_book_id, rating = tuple(row)
                prediction = user_recommendations[test_book_id] if test_book_id in user_recommendations else 0
                sum_error += (prediction - rating)**2
                count_lines += 1

        result = math.sqrt(sum_error/count_lines)
        print(f'RMSE with {sim} similarly is {result}.')
    
    
RMSE()

RMSE with cosine similarly is 1.4417771848804117.
RMSE with euclidean similarly is 1.441315728104633.




RMSE with jaccard similarly is 1.4423866936810168.
