In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from numpy import int64

import requests
import IPython.display as Disp

import sklearn
from sklearn.decomposition import TruncatedSVD

# ----------------------------------------------------------------- #

# Book catalogue, reduced to the id and title columns.
books_df = pd.read_csv("books.csv").loc[:, ['book_id', 'title']]

# Ratings file: the three columns below are parsed as integers and are the
# only ones kept.
ratings_df = pd.read_csv(
    "ratings-books.csv",
    encoding='UTF-8',
    dtype={'user_id': int, 'book_id': int, 'rating': int},
).loc[:, ['user_id', 'book_id', 'rating']]

# ----------------------------------------------------------------- #

# Preview of the book catalogue: five random rows plus the total row count.
print(
    "\n------------------------------------------------------------\n",
    "\nSample of 'Books-Title' DataFrame\n",
    books_df.sample(5),
    f"\nLength of 'Books-Title' DataFrame = {len(books_df)}\n",
    "------------------------------------------------------------\n",
    sep="\n",
)

# Same preview for the ratings table.
print(
    "Sample of 'User-Book-Rating' DataFrame\n",
    ratings_df.sample(5),
    f"\nLength of 'User-Book-Rating' DataFrame = {len(ratings_df)}",
    "\n------------------------------------------------------------\n",
    sep="\n",
)

# ----------------------------------------------------------------- #

# Books: keep the ids that occur in more than `min_book_ratings` rating rows.
min_book_ratings = 600
filter_books = (
    ratings_df['book_id']
    .value_counts()
    .loc[lambda counts: counts > min_book_ratings]
    .index
    .tolist()
)

# Users: keep the ids that occur in more than `min_user_ratings` rating rows.
min_user_ratings = 135
filter_users = (
    ratings_df['user_id']
    .value_counts()
    .loc[lambda counts: counts > min_user_ratings]
    .index
    .tolist()
)

# A rating row survives only when both its user and its book passed above.
df_filtered = ratings_df.loc[
    ratings_df['user_id'].isin(filter_users)
    & ratings_df['book_id'].isin(filter_books)
]
# The thresholds and id lists are temporaries; remove them from the namespace.
del min_user_ratings, min_book_ratings, filter_users, filter_books
print(f'Shape User-Ratings unfiltered:\t{ratings_df.shape}')
print(f'Shape User-Ratings filtered:\t{df_filtered.shape}')

print(
    "\n------------------------------------------------------------\n",
    "\ndf_filtered sample follows:\n",
    df_filtered.sample(5),
    sep="\n",
)

# ----------------------------------------------------------------- #

# Report how many distinct users and books remain after filtering.
print("\n------------------------------------------------------------\n")
print("Number of Users in Filtered DF: ", df_filtered['user_id'].nunique())
print("Number of Books in Filtered DF: ", df_filtered['book_id'].nunique())

# ----------------------------------------------------------------- #

# Attach a title to every surviving rating through the shared book_id key.
combined_books_df = df_filtered.merge(books_df, on='book_id')
print(
    "\n------------------------------------------------------------\n",
    "\nSample of 'Combined Books follows:\n",
    combined_books_df.sample(5),
    sep="\n",
)

# ----------------------------------------------------------------- #

# One row per user, one column per title, cells hold the rating; user/title
# pairs with no rating are filled with 0.
ct_df = pd.pivot_table(
    combined_books_df,
    index='user_id',
    columns='title',
    values='rating',
    fill_value=0,
)


# ----------------------------------------------------------------- #

# Transpose the pivot so each row of X is a title and each column a user.
X = ct_df.to_numpy().T
print("\n------------------------------------------------------------\n")
print("Shape of the sparse matrix contains 'book titles' versus 'user id' = ", X.shape)

# ----------------------------------------------------------------- #

# Reduce every row of X to 20 components; the fixed random_state keeps the
# decomposition repeatable between runs.
SVD = TruncatedSVD(random_state=17, n_components=20)
result_matrix = SVD.fit_transform(X)
print("\n------------------------------------------------------------\n")
print("Matrix shape after TruncatedSVD = ", result_matrix.shape)

# ----------------------------------------------------------------- #

# Pearson correlation between every pair of rows of the reduced matrix.
# Rows of result_matrix correspond to book titles (X is the transposed
# user-by-title pivot), so corr_mat[i, j] compares book i with book j.
# This is a correlation coefficient, not a cosine similarity, and it is
# between books, not users - the printed label below says so.
corr_mat = np.corrcoef(result_matrix)
print("\n------------------------------------------------------------\n")
print("Book-to-book correlation matrix shape = ",corr_mat.shape)

# ----------------------------------------------------------------- #

# Titles in pivot-column order, so position i lines up with row i of X and
# therefore with row/column i of corr_mat.
book_names = ct_df.columns
book_list = book_names.tolist()
num_books_in_list = len(book_names)

# ----------------------------------------------------------------- #

def GetMainMenuOption():
    """Print the main menu and return the option the user selects.

    Returns:
        int: 1, 2 or 3 for the corresponding menu action, or 4 to quit.
        A reply outside 1-4 is reported as 4.  A reply that is not a whole
        number is handled the same way instead of raising ValueError.
    """
    print("\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n")
    print("1. Enter '1' to enter a book index and get similar books to it")
    print("2. Enter '2' to get a list of books between two indices")
    print("3. Enter '3' to see if a book is in the list")
    print("4. Enter '4' to quit")
    reply = input("Enter 1, 2, 3, or 4: ")
    try:
        answer = int(reply)
    except ValueError:
        # Non-numeric text (including an empty reply) is just another
        # invalid choice; fall back to "quit" like the range check below.
        return 4
    if answer < 1 or answer > 4:
        answer = 4
    return answer

    

def _read_index(prompt):
    """Prompt the user and return the reply as an int, or None if it is not a whole number."""
    try:
        return int(input(prompt))
    except ValueError:
        return None


# Interactive session: keep showing the menu until the user picks 4 (quit).
# Book indices are positions in book_list and run from 0 to last_index,
# matching the indices that the title search (option 3) reports.
num_to_display = 10                    # similar titles shown before offering the rest
last_index = num_books_in_list - 1

answer = GetMainMenuOption()
while answer != 4:
    if answer == 1:
        # Option 1: list the books most correlated with the chosen one.
        book_index = _read_index(f"\nEnter a book index (0 thru {last_index}): ")
        if book_index is None or not 0 <= book_index <= last_index:
            print(f"\nInvalid index: expected a whole number from 0 thru {last_index}")
        else:
            book_title = book_list[book_index]
            print(f"\nBooks that are similar to '{book_title}' follows:\n")
            corr_bk = corr_mat[book_index]

            # Similar means correlation above 0.8.  The chosen book is
            # removed by position rather than by a "<= 0.99" cut-off, which
            # would also hide other books that correlate above 0.99.
            is_similar = corr_bk > 0.8
            is_similar[book_index] = False

            # Rank by descending correlation so the first titles printed are
            # the closest matches; the stable sort keeps ties in list order.
            ranked = np.argsort(-corr_bk, kind="stable")
            top_books = [book_list[j] for j in ranked if is_similar[j]]

            if not top_books:
                print("(no books exceed the similarity threshold)")
            for similar_title in top_books[:num_to_display]:
                print(similar_title)

            if num_to_display < len(top_books):
                answer_char = input("\nSee all the books that are similar ('y' or 'n'): ")
                if answer_char.strip().lower() == 'y':
                    print("\n")
                    for similar_title in top_books[num_to_display:]:
                        print(similar_title)

    elif answer == 2:
        # Option 2: show the titles from low_index to high_index, inclusive,
        # so the last book in the list can be reached.
        low_index = _read_index(f"\nenter lower-bound index (between '0' and '{last_index}'): ")
        high_index = _read_index(f"\nenter high-bound index(between '0' and '{last_index}'): ")
        if (low_index is None or high_index is None
                or not 0 <= low_index <= high_index <= last_index):
            print(f"\nInvalid range: expected 0 <= lower <= higher <= {last_index}")
        else:
            for i in range(low_index, high_index + 1):
                book_title = book_list[i]
                print(f"\nIndex {i}, Title = '{book_title}'")

    elif answer == 3:
        # Option 3: case-sensitive substring search using at most the first
        # 10 characters of what the user typed.
        title = input("\nEnter book title to search for: ")
        subtitle = title[0:10]
        subtitle_len = len(subtitle)
        if not subtitle:
            # An empty string is a substring of every title; do not dump the
            # whole catalogue as "matches".
            print("\nNo title entered")
        else:
            found_title = False
            for i, candidate in enumerate(book_list):
                if subtitle in candidate:
                    print(f"\nThe first {subtitle_len} characters, '{subtitle}', is in the book list, '{candidate}' has index {i}")
                    found_title = True

            if not found_title:
                print(f"\n'{title}' was not found")

    answer = GetMainMenuOption()


print("\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n")
print("DONE!")

import pickle
# Persist the correlation matrix and the title lookups so they can be reloaded
# later.  Each file is opened in a `with` block so its handle is flushed and
# closed even if pickling fails part-way.
with open("save_corr_mat.p", "wb") as pickle_file:
    pickle.dump(corr_mat, pickle_file)
with open("save_book_list.p", "wb") as pickle_file:
    pickle.dump(book_list, pickle_file)
with open("save_book_names.p", "wb") as pickle_file:
    pickle.dump(book_names, pickle_file)




------------------------------------------------------------


Sample of 'Books-Title' DataFrame

      book_id                                              title
3679     3680                      Blow Fly (Kay Scarpetta, #12)
4207     4208  The Penderwicks: A Summer Tale of Four Sisters...
244       245                               Bridge to Terabithia
8410     8411  Diagnostic and Statistical Manual of Mental Di...
6165     6166                     The I Ching or Book of Changes

Length of 'Books-Title' DataFrame = 10000

------------------------------------------------------------

Sample of 'User-Book-Rating' DataFrame

         user_id  book_id  rating
2326623    30882      117       4
414209      8334     6389       3
5572621    24433     4808       3
3977310    45260      280       3
2354071    31171       50       4

Length of 'User-Book-Rating' DataFrame = 5976479

------------------------------------------------------------

Shape User-Ratings unfiltered:	(5976479, 3)
Shap