## Flow of the code

> **Part-1:** Define all the functions required to build a recommendation system

> **Part-2:** Call (i.e. execute) those functions

1. Load amazon-data and get unique book-names from entire file
2. Load goodreads data and get unique book-names
3. Find the book-names that exist in both the sets of book-names
4. Do necessary cleaning and prepare data for exploration
5. Build Item-by-Item CF and Matrix-Factorization-based recommendation system

In [1]:
%load_ext autotime
# %unload_ext autotime

In [2]:
## Import required modules:
import pandas as pd
import numpy as np    
import matplotlib.pyplot as plt
import re
import random

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from collections import Counter

time: 26.1 s


## Part-1
### Data Post-processing Functions

In [3]:
## Load several rows of data and see what we've got
## Import reviews file (i.e. pre-processed amazon-book-reviews created by `reviews_preprocessing.ipynb` ):
def load_data(path, cols, nrows = 10):
    """
    Load the first few rows of the amazon-dataset for a quick look at its contents.

    Parameters
    ----------
    path : str
        Location of the pre-processed amazon-book-reviews file.
    cols : list of str
        Column names to assign, in file order. They are passed as `names`, so the first line of the file is read 
        as data rather than as a header.
    nrows : int, optional
        Number of rows to read from the top of the file. Defaults to 10 (the value that used to be hard-coded).

    Returns
    -------
    pandas.DataFrame
        At most `nrows` rows, with the given column names.
    """

    books_amazon = pd.read_csv(path, nrows = nrows, encoding = 'utf-8', names = cols)
    
    return books_amazon

    
## Get all the book-names from Amazon-data (later this list will be used to find crime-mystery books):
def get_unique_books(path, encoding = None):
    """
    Get all the unique book names from the amazon-data-file. As the file is huge (~11 GB), the file is read line-by-line 
    and only the (lower-cased, stripped) book-names are kept in memory.

    Parameters
    ----------
    path : str
        Location of the amazon-data-file. The book-name is taken to be the second comma-separated field of each line.
    encoding : str, optional
        Text encoding handed to `open`. The default (None) keeps the platform default, as before.

    Returns
    -------
    dict_keys
        Unique book-names, in order of first appearance in the file.
    """
    
    ## A dict (rather than a list) gives O(1) de-duplication while keeping first-seen order
    book_names = {}
    with open (path, encoding = encoding) as books:
        for line in books:
            fields = line.split(',')
            if len(fields) < 2:
                ## Blank or malformed line without a book-name column: skip it instead of aborting the whole scan
                continue
            book_names[fields[1].strip().lower()] = True
    
    return book_names.keys()


def load_goodreads_data(path):
    """
    Load the data collected by scraping goodreads.com and clean the book-names.

    Only the 'Books' column is read. Every book-name is stripped of surrounding whitespace and lower-cased. Rows whose 
    book-name is missing are kept and stay missing (NaN); previously a single missing name made the whole load fail 
    with an AttributeError.

    Parameters
    ----------
    path : str
        Location of the goodreads CSV file (read as latin1).

    Returns
    -------
    pandas.DataFrame
        One column, 'Books', holding the cleaned book-names.
    """

    books_goodreads = pd.read_csv(path, usecols = ['Books'], encoding = 'latin1')

    ## Vectorised string methods skip missing values instead of raising on them
    books_goodreads['Books'] = books_goodreads['Books'].str.strip().str.lower()
    
    ## Disabled cleaning steps for columns that are not loaded at the moment:
    # books_goodreads['Total_Num_Ratings'] = [int(rating.replace(',','')) for rating in books_goodreads['Total_Num_Ratings']]
    # books_goodreads['Total_Num_Votes'] = [int(vote.replace(',','')) for vote in books_goodreads['Total_Num_Votes']]
    # books_goodreads['Avg_Rating'] = [float(rating.strip("['']")) for rating in books_goodreads['Avg_Rating']]
    
    return books_goodreads


def get_matching_books(amazon_books, goodreads_books):
    """
    Return the book-names that occur in both collections.

    Both inputs are turned into sets, so duplicates are irrelevant. The result is a dictionary whose keys are the 
    shared book-names and whose values are all True (handy for fast membership tests later on).
    """
    
    shared_titles = set(amazon_books) & set(goodreads_books)
    
    return {title: True for title in shared_titles}

time: 5 ms


### Start building a recommendation system:

In [22]:
def subset_amazon_data(path, n = 1000000, books_of_interest = None):
    """
    Amazon-data have ~14 million books ratings but here I am only interested in crime-mystery book-ratings. So, I'll subset 
    amazon-data for those books only. Moreover, I only use first n (here 1,000,000) rows to avoid memory-error issues.

    Parameters
    ----------
    path : str
        Location of the amazon-data-file.
    n : int, optional
        Maximum number of lines to scan from the top of the file. Scanning also stops at the end of the file.
    books_of_interest : container of str, optional
        Lower-cased book-names to keep (anything supporting `in`, e.g. a dict or a set). When omitted, the 
        module-level `common_book_dict` is used, which is what this function always relied on.

    Returns
    -------
    list of str
        The raw lines (trailing newline included) whose book-name, i.e. the second ', '-separated field, is one of 
        the books of interest. Lines without such a field are skipped.
    """
    
    if books_of_interest is None:
        ## Backward-compatible default: fall back to the global prepared in Part-2
        books_of_interest = common_book_dict

    list_df = []

    with open (path) as books:
        for rows, line in enumerate(books):
            if rows >= n:
                break

            fields = line.split(', ')
            if len(fields) < 2:
                ## Problematic line (no book-name field): ignore it
                continue

            if fields[1].lower().strip() in books_of_interest:
                list_df.append(line)
        
    return list_df

time: 321 ms


In [8]:
def extract_info(data):
    """
    Turn the raw review lines into a dataframe with the columns BookID, BookTitle, UserID and Score.

    For every line:
      * Score     - first occurrence of ' <digit 0-5>.0' anywhere in the line, converted to float
      * BookID    - the first 11 characters of the line
      * BookTitle - second ', '-separated field
      * UserID    - first 14 characters of the fourth ', '-separated field

    Rows whose UserID is 'unknown' or contains a decimal number (digits.digits) are dropped.

    NOTE(review): the score is located by pattern, not by column position, so an earlier field that happens to 
    contain e.g. ' 3.0' would be picked up instead - confirm against the file layout.
    """

    rating_regex = re.compile(' [0-5][.]0')

    ## Ratings
    scores = []
    for row in data:
        scores.append(float(rating_regex.search(row).group()))

    ## Book IDs
    ids = [row[:11] for row in data]

    ## Split every line once; titles and user-ids are read from the pieces
    split_rows = [row.split(', ') for row in data]
    titles = [parts[1] for parts in split_rows]
    users = [parts[3][:14] for parts in split_rows]

    frame = pd.DataFrame({'BookID': ids, 'BookTitle': titles, 'UserID': users, 'Score': scores})

    ## Drop reviews without a usable user-id
    frame = frame.loc[frame['UserID'].ne('unknown')]
    frame = frame.loc[~frame['UserID'].str.contains('[0-9]+[.][0-9]+')]

    return frame

time: 28 ms


In [9]:
def create_user_item_df(ratings_df):
    """
    Pivot the long ratings dataframe (one row per review) into a wide user-item dataframe: one row per UserID, 
    one column per BookTitle, cell = Score. Several scores for the same user/book pair are combined by 
    pivot_table's default aggregation. User/book pairs without a rating are filled with 0.

    The shape of the resulting dataframe is printed before it is returned.
    """
    
    pivoted = ratings_df.pivot_table(index = 'UserID', columns = 'BookTitle', values = 'Score')

    ## Unrated pairs come out of the pivot as NaN; represent them as 0
    user_item_df = pivoted.fillna(0)

    print ('Shape: ', user_item_df.shape)
    
    return user_item_df

time: 19 ms


##### Item-by-item Collaborative Filtering

In [10]:
## Train-test split
## Get #books rated by each user

def generate_train_test_data(books_ratings, user_item_mat):
    """
    Generate train/test set. Only users who rated more than 2 books take part in the test-set: for each of them one 
    of their reviews is chosen at random, its rating is written to the test-matrix and the corresponding user-book 
    cell is set to 0 in the train-set, so that it can later be predicted and used to calculate the error.

    The book-title and the rating are always taken from the same randomly chosen review. (Drawing them 
    independently per column pairs a title with a rating that belongs to a different book.)

    Parameters
    ----------
    books_ratings : pandas.DataFrame
        One row per review, with at least the columns 'UserID', 'BookTitle' and 'Score'.
    user_item_mat : pandas.DataFrame
        User-item ratings (index: user-ids, columns: book-titles). It is modified in place.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The (modified) user_item_mat to train on, and the test-matrix with one row per user who rated more than 
        2 books (rows follow the order in which those users first appear in books_ratings) and one column per 
        column of user_item_mat.
    """
    
    count_ratings_per_user = Counter(books_ratings['UserID'])

    ## list of users who rated more than 2 books
    more_than_2 = [user for user, n_rated in count_ratings_per_user.items() if n_rated > 2]

    test = np.zeros((len(more_than_2), user_item_mat.shape[1]))

    ## Row positions of every user's reviews, computed once instead of re-filtering the dataframe per user
    rows_per_user = books_ratings.groupby(by = 'UserID', sort = False).indices
    titles = books_ratings['BookTitle']
    scores = books_ratings['Score']

    for i, user in enumerate(more_than_2):
        ## Pick ONE review of this user; title and rating both come from it
        picked_row = np.random.choice(rows_per_user[user])

        ## Populate test-matrix
        col_id = user_item_mat.columns.get_loc(titles.iloc[picked_row])
        test[i, col_id] = scores.iloc[picked_row]

        ## Make this particular pair of user-book rating 0 in train-data
        user_item_mat.iloc[(user_item_mat.index == user), col_id] = 0
        
    return user_item_mat, test

def calculate_similarity(user_item_ratings, e = 1e-5):
    """
    Build the item-by-item similarity-matrix from the user-item ratings.

    The item-by-item product of the ratings (ratings.T . ratings) gets the small constant `e` added to every 
    entry, and each entry (i, j) is then divided by the square roots of the diagonal entries i and j. The diagonal 
    of the result is therefore 1.
    """
    
    co_rating = user_item_ratings.T.dot(user_item_ratings)
    co_rating = co_rating + e

    ## Row-vector of per-item norms; its transpose is the matching column-vector
    item_norms = np.sqrt(np.diagonal(co_rating)).reshape(1, -1)

    scaled_by_column = co_rating / item_norms
        
    return scaled_by_column / item_norms.T


def predict_ratings_train(user_item_ratings, similarity_mat):
    """
    Predict ratings on training-data.

    Each prediction is the similarity-weighted sum of a user's ratings (ratings . similarity), divided by the sum 
    of absolute similarities taken along axis 1 of the similarity-matrix.
    """
    
    weighted_sum = user_item_ratings.dot(similarity_mat)
    total_similarity = np.abs(similarity_mat).sum(axis = 1)
    
    return weighted_sum / np.reshape(total_similarity, (-1))


## Testing on test data
def predict_ratings_test(test_data, similarity_mat):
    """
    Predict ratings on testing-data.

    Same computation as for the training-data: the similarity-weighted sum of the given ratings 
    (test_data . similarity), divided by the sum of absolute similarities taken along axis 1 of the 
    similarity-matrix.
    """
    
    weighted_sum = test_data.dot(similarity_mat)
    total_similarity = np.abs(similarity_mat).sum(axis = 1)
    
    return weighted_sum / np.reshape(total_similarity, (-1))


def calculate_error(test_data, pred_data):
    """
    Returns the average per-user Root-Mean-Squared-Error (RMSE) for ratings predicted on the test-set.

    For every row (user) of test_data the non-zero entries are the held-out ratings. The RMSE between those 
    ratings and the predictions at the same positions is computed per user, and the per-user values are averaged.
    Rows without any held-out rating are left out of the average instead of counting as a perfect prediction.

    With exactly one held-out rating per user (what generate_train_test_data produces) the result is the same as 
    with the earlier root-of-sum formula; with several held-out ratings per user it is now a true RMSE.

    Parameters
    ----------
    test_data : 2-D array-like
        Held-out ratings, 0 where nothing was held out.
    pred_data : 2-D array-like (ndarray or np.matrix)
        Predicted ratings, same shape as test_data.

    Returns
    -------
    float
        Average per-user RMSE rounded to 2 decimals; nan when no row has a held-out rating.
    """

    ## Plain 2-D arrays so that ndarray and np.matrix inputs index the same way
    actual = np.asarray(test_data, dtype = float)
    predicted = np.asarray(pred_data, dtype = float)

    rmse = []
    for actual_row, predicted_row in zip(actual, predicted):
        held_out = np.nonzero(actual_row)[0]
        if held_out.size == 0:
            continue

        residuals = actual_row[held_out] - predicted_row[held_out]
        rmse.append(np.sqrt(np.mean(residuals ** 2)))

    print ('Average RMSE: ')

    if not rmse:
        return float('nan')
    
    return round(np.mean(rmse), 2)


## Make recommendations: can access actual user-ids from 'more_than_2' list using index from 'test'
# print (user_item_df.columns[np.argsort(test[user_number,:])[::-1][0:3]])
def get_recommendations(user_index, pred_matrix, books, n_recs = 10):
    """
    Generate recommendations: the n_recs books with the highest predicted rating for one user.

    Parameters
    ----------
    user_index : int
        Row of pred_matrix that belongs to the user.
    pred_matrix : 2-D array-like
        Predicted ratings (users x books). Both np.matrix and plain ndarray are accepted; previously a plain 
        ndarray silently produced a single, meaningless recommendation.
    books : sequence
        Book-names, positionally aligned with the columns of pred_matrix.
    n_recs : int, optional
        Number of recommendations to return.

    Returns
    -------
    list
        Book-names ordered from highest to lowest predicted rating.
    """
    
    ## Flat 1-D vector of this user's predictions, whatever the container type
    user_scores = np.asarray(pred_matrix)[user_index, :].ravel()

    recs_id = np.argsort(user_scores)[::-1][0:n_recs]
    recs = [books[i] for i in recs_id]    
    print ('For user-{}, recommendations are: \n'.format(user_index))
    
    return recs

time: 24 ms


##### Matrix-Factorization Based Recommendation System

In [None]:
def fit_svd(train, n_compo = 10, random_state = 11):
    """
    Fit a TruncatedSVD with n_compo components (seeded with random_state) to the user-item-matrix and return 
    the fitted object.
    """
    
    return TruncatedSVD(n_components = n_compo, random_state = random_state).fit(train)

def predict_train(tsvd_obj, train):
    """
    Predict ratings on training-data: project the data with the fitted object's transform() and map the result 
    back to the original columns by multiplying with its components_.
    """
    
    latent = tsvd_obj.transform(train)
    
    return np.dot(latent, tsvd_obj.components_)

def predict_test(tsvd_obj, test):
    """
    Predict ratings on test-data: project the data with the fitted object's transform() and map the result 
    back to the original columns by multiplying with its components_.
    """
    
    latent = tsvd_obj.transform(test)
        
    return np.dot(latent, tsvd_obj.components_)

In [None]:
## Testing new user
# new_user_1 = np.zeros((1, user_item_df.shape[1]))
# new_user_1[0, 1] = 5
# pred_user_1 = new_user_1.dot(sim_mat)/np.reshape(np.abs(sim_mat).sum(axis = 1), (-1))

## Part-2 

## Execute Functions

#### Data Preparation

In [16]:
## Location of the amazon-book-reviews file
amazon_file_loc = 'E://MRP//0102/Books.csv'

## Column names handed to load_data (applied to the rows it reads)
col_names = ['productID', 'title', 'price', 'userID', 'profileName', 'helpfulness', 'score', 'time', 'summary', 'text']
## Only the first few rows, to look at the data
books_amazon = load_data(path = amazon_file_loc, cols = col_names)

## Unique (lower-cased) book-names found in the whole amazon file
book_names = get_unique_books(path = amazon_file_loc)

## Location of the scraped goodreads data
goodreads_file_loc = 'E://MRP//0331//Book-Recommender-System//data//books_df.csv'
books_goodreads = load_goodreads_data(path = goodreads_file_loc)    

## Book-names that exist in both sources.
## NOTE: subset_amazon_data reads the module-level name `common_book_dict`, so it has to be defined before the next call.
common_book_dict = get_matching_books(amazon_books = book_names, goodreads_books = books_goodreads['Books'])

## Raw amazon lines that belong to the common books
list_df = subset_amazon_data(path = amazon_file_loc)        

## One row per review: BookID, BookTitle, UserID, Score
books_amazon_whole = extract_info(data = list_df)
books_amazon_whole.head()

## Wide user-by-book ratings (0 = not rated)
user_item_df = create_user_item_df(ratings_df = books_amazon_whole)

# List of books
new_book_names = user_item_df.columns

Shape:  (21532, 258)
time: 2min 42s


#### Item-by-Item - Recommendation System

In [17]:
## Hold out one rating per eligible user. generate_train_test_data zeroes those cells in the dataframe it is given
## and returns that same dataframe, so from here on user_item_df is the train-set.
user_item_df, test = generate_train_test_data(books_ratings = books_amazon_whole, user_item_mat = user_item_df)
## Item-by-item similarities computed from the train-set (passed as np.matrix)
sim_mat = calculate_similarity(user_item_ratings = np.matrix(user_item_df))
## Predicted ratings for the train-set and for the held-out test-matrix
pred_ratings = predict_ratings_train(user_item_ratings = np.matrix(user_item_df), similarity_mat = sim_mat)
pred_test = predict_ratings_test(test_data = test, similarity_mat = sim_mat)
## Error on the held-out ratings
calculate_error(test_data = test, pred_data = pred_test)

Average RMSE: 


2.4500000000000002

time: 58.4 s


#### Matrix-Factorization - Recommendation System

In [None]:
tsvd = fit_svd(train = user_item_df, n_compo = 25)
predict_ratings_train = predict_train(tsvd_obj = tsvd, train = user_item_df)
predict_ratings_test = predict_test(tsvd_obj = tsvd, test = test)

#### Compare results

In [48]:
user_number = 11

print ('Item-by-item CF: ', 
       get_recommendations(user_index = user_number, pred_matrix = pred_test, books = new_book_names, n_recs = 10))

print ('\n')

print ('Matrix-Factorization: ', 
       get_recommendations(user_index = user_number, pred_matrix = np.matrix(predict_ratings_test), 
                           books = new_book_names, n_recs = 10))

For user-11, recommendations are: 

Item-by-item CF:  ['Mr. Timothy ', 'Exposure', 'Lioness', 'Doohickey ', 'Not the End of the World', 'Thicker Than Water', 'Sacrifice', 'Puppet', 'Nobody True', 'Blindsided']


For user-11, recommendations are: 

Matrix-Factorization:  ['The Jester', 'Pompeii', 'Motherless Brooklyn', 'The Last Juror', 'Carrie', 'Just One Look', 'Rebecca', 'Tell No One', 'Me Talk Pretty One Day', 'Night Watch']
time: 27 ms


##### Old-code (kept here just for my reference)

In [None]:
## For EDA
# amazon_rating = pd.DataFrame(books_amazon_whole.groupby(by = 'BookTitle').mean()).reset_index(drop = False)

# fig, (ax1, ax2) = plt.subplots(figsize= (14,7), ncols = 2, sharey = True)
# ax1.hist(amazon_rating['Score'], bins = [1,2,3,4,5], normed = True)
# ax1.set_xlabel('Average Rating', fontsize = 15)
# ax1.set_ylabel('Frequency (Normalized)', fontsize = 15)
# ax1.set_title('Distribution of Average Rating \n (Amazon)', fontsize = 20)

# ax2.hist(books_goodreads['Avg_Rating'], bins = [1,2,3,4,5], normed = True)
# ax2.set_xlabel('Average Rating', fontsize = 15)
# ax2.set_title('Distribution of Average Rating \n (Goodreads)', fontsize = 20)

# plt.show()


# user_data = [np.count_nonzero(user_item_df.iloc[i,:]) for i in range(user_item_df.shape[0])]
# book_data = [np.count_nonzero(user_item_df.iloc[:,i]) for i in range(user_item_df.shape[1])]

# fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (14,6))
# ax1.hist(user_data, bins = range(0,15,1))
# ax2.hist(book_data, bins = range(0, 3000,50))

# ax1.set_title('Distribution of User-Ratings', fontsize = 15)
# ax2.set_title('Distribution of Book-Ratings', fontsize = 15)

# ax1.set_xlabel('#Books rated by user', fontsize = 15)
# ax2.set_xlabel('#Ratings each book received', fontsize = 15)

# ax1.set_ylabel('#Users', fontsize = 15)
# ax2.set_ylabel('#Books', fontsize = 15)

# plt.show()

In [None]:
# def plot_rmse(rmse_list, n_users = 500):
    
#     fig, ax = plt.subplots(figsize = (14,8))
#     ax.plot(rmse_list[0:n_users])
#     ax.axhline(y = np.mean(rmse_list), label = 'Avg. RMSE: {}'.format(round(np.mean(rmse_list), 3)), 
#                color = 'r', linestyle = 'dashed')
#     ax.set_ylabel('RMSE', fontsize = 15)
#     ax.set_xlabel('UserId', fontsize = 15)
#     ax.set_title('RMSE for each user', fontsize = 20)
#     ax.legend()
#     plt.show()    
    
#     return None

# rmse_train = np.sqrt(np.mean(np.array((np.matrix(user_item_df) - predict_ratings_train))**2, axis = 1))
# plot_rmse(rmse_train)

In [49]:
# def get_recommended_books(user_id, books_list, latent_ratings, ui_mat, top_n = 15):
    
#     ## Get recommendations for a given user:
#     ind_top_rated_books = np.argsort(latent_ratings.iloc[user_id])[::-1][0:top_n]
#     recommended_books = [books_list[ind] for ind in ind_top_rated_books]    
#     recommendation_df = pd.DataFrame({'UserID': user_id, 'BookID': ind_top_rated_books, 
#                                      'Recommended_Books': recommended_books})
    
#     ## Get actual books that the user rated:
#     user_rated_books = ui_mat[user_id,:].toarray()
#     rated_books_ind = np.argwhere(user_rated_books != 0)[:,1]
#     rated_books = [books_list[ind] for ind in rated_books_ind]
#     user_rated_books_df = pd.DataFrame({'BookID': rated_books_ind, 'RatedBooks': rated_books, 'UserID': user_id})
    
#     return user_rated_books_df, recommendation_df

# ## Try: 211
# user_rated_books, recommended_books = get_recommended_books(user_id = 211, books_list = new_book_names, 
#                                       latent_ratings = predict_ratings, ui_mat = user_item_mat)

# rated_books, recommended_books = get_recommended_books(user_id = 100, books_list = new_book_names, 
#                                       latent_ratings = pd.DataFrame(predict_ratings_train), ui_mat = train_data)

time: 19 ms
