In [5]:
%matplotlib inline
import pandas as pd
import numpy as np
from numpy import int64

import requests
import IPython.display as Disp

import sklearn
from sklearn.decomposition import TruncatedSVD

# ----------------------------------------------------------------- #

books_df = pd.read_csv("dataset\\books.csv")[['book_id','title']]
ratings_df = pd.read_csv("dataset\\ratings-books.csv", encoding='UTF-8',  dtype={'user_id': int,'book_id':int, 'rating':int} )
ratings_df = ratings_df[['user_id','book_id','rating']]

# ----------------------------------------------------------------- #

# Save the relationship between 'book_id' and 'title'
di_book_title = {}  # book_id ==> title
for i in range(len(books_df)):
    bk_id = books_df.loc[i,'book_id']
    title = books_df.loc[i,'title']
    di_book_title[bk_id] = title
rev_di_book_title = {value : key for (key, value) in di_book_title.items()} # title ==> book_id

# ----------------------------------------------------------------- #


print("\n------------------------------------------------------------\n")
print("\nSample of 'Books-Title' DataFrame\n")
print(books_df.sample(5))
print(f"\nLength of 'Books-Title' DataFrame = {len(books_df)}\n")
print("------------------------------------------------------------\n")

print("Sample of 'User-Book-Rating' DataFrame\n")
print(ratings_df.sample(5))
print(f"\nLength of 'User-Book-Rating' DataFrame = {len(ratings_df)}")
print("\n------------------------------------------------------------\n")

# ----------------------------------------------------------------- #

# Filter sparse books
min_book_ratings = 600
filter_books = (ratings_df['book_id'].value_counts()>min_book_ratings)
filter_books = filter_books[filter_books].index.tolist()

# Filter sparse users
min_user_ratings = 135
filter_users = (ratings_df['user_id'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filtered = ratings_df[(ratings_df['book_id'].isin(filter_books)) & (ratings_df['user_id'].isin(filter_users))]
del filter_books, filter_users, min_book_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(ratings_df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filtered.shape))

print("\n------------------------------------------------------------\n")
print("\ndf_filtered sample follows:\n")
print(df_filtered.sample(5))

# ----------------------------------------------------------------- #

print("\n------------------------------------------------------------\n")
print("Number of Users in Filtered DF: ",len(df_filtered.groupby('user_id')))
print("Number of Books in Filtered DF: ",len(df_filtered.groupby('book_id')))

# ----------------------------------------------------------------- #

combined_books_df = pd.merge(df_filtered, books_df, on='book_id')
print("\n------------------------------------------------------------\n")
print("\nSample of 'Combined Books follows:\n")
print(combined_books_df.sample(5))

# ----------------------------------------------------------------- #

ct_df = combined_books_df.pivot_table(values='rating', index='user_id', columns='title', fill_value=0)


# ----------------------------------------------------------------- #

X = ct_df.values.T
print("\n------------------------------------------------------------\n")
print("Shape of the sparse matrix contains 'book titles' versus 'user id' = ",X.shape)

# ----------------------------------------------------------------- #

SVD  = TruncatedSVD(n_components=20, random_state=17)
result_matrix = SVD.fit_transform(X)
print("\n------------------------------------------------------------\n")
print("Matrix shape after TruncatedSVD = ",result_matrix.shape)

# ----------------------------------------------------------------- #

corr_mat = np.corrcoef(result_matrix)
print("\n------------------------------------------------------------\n")
print("USER Cosine Similarity matrix shape = ",corr_mat.shape)

# ----------------------------------------------------------------- #




# ----------------------------------------------------------------- #




------------------------------------------------------------


Sample of 'Books-Title' DataFrame

      book_id                                              title
3795     3796                                Tricks (Tricks, #1)
213       214  The Titan's Curse (Percy Jackson and the Olymp...
2897     2898  Nine Princes in Amber (The Chronicles of Amber...
5529     5530                        From Ashes (From Ashes, #1)
8830     8831                           Assata: An Autobiography

Length of 'Books-Title' DataFrame = 10000

------------------------------------------------------------

Sample of 'User-Book-Rating' DataFrame

         user_id  book_id  rating
2009461     7026      495       2
4610695    49387     2554       2
3902299    43915     3458       4
2274603    30309      795       4
3261886    35336      574       4

Length of 'User-Book-Rating' DataFrame = 5976479

------------------------------------------------------------

Shape User-Ratings unfiltered:	(5976479, 3)
Shap

In [10]:
# ----------------------------------------------------------------- #
def Sort_Tuple(tup): 
  
    # reverse = None (Sorts in Ascending order) 
    # key is set to sort using second element of 
    # sublist lambda has been used 
    tup.sort(key = lambda x: x[1],reverse = False) 
    #tup.sort(key=sortSecond,reverse = False)
    return tup 

book_names = list(ct_df.columns)
print("\nbook_names follows:\n")
print(book_names[0:20])
print("\nlength book_names = ",len(book_names))

filtered_di_book_title = {rev_di_book_title[bk_name] : bk_name for bk_name in ct_df.columns}
book_title_list = list(filtered_di_book_title.items())

book_title_list = Sort_Tuple(book_title_list)
print("\nbook_title_list follows:\n")
print(book_title_list[0:20])
print("\nlength book_title_list = ",len(book_title_list))

num_books_in_list = len(book_title_list)

max_books_to_display = 20

print(di_book_title[1292])



book_names follows:

["'Salem's Lot", "'Tis (Frank McCourt, #2)", '11/22/63', '1776', '1984', '1Q84', "1st to Die (Women's Murder Club, #1)", '2001: A Space Odyssey (Space Odyssey, #1)', "2nd Chance (Women's Murder Club, #2)", "3rd Degree (Women's Murder Club, #3)", "4th of July (Women's Murder Club, #4)", '61 Hours (Jack Reacher, #14)', 'A Bear Called Paddington (Paddington, #1)', 'A Beautiful Mind', 'A Bend in the Road', 'A Breath of Snow and Ashes (Outlander, #6)', 'A Brief History of Time', 'A Canticle for Leibowitz', 'A Child Called "It" (Dave Pelzer #1)', 'A Christmas Carol']

length book_names =  2100

book_title_list follows:

[(1292, "'Salem's Lot"), (2252, "'Tis (Frank McCourt, #2)"), (295, '11/22/63'), (687, '1776'), (13, '1984'), (649, '1Q84'), (336, "1st to Die (Women's Murder Club, #1)"), (503, '2001: A Space Odyssey (Space Odyssey, #1)'), (1379, "2nd Chance (Women's Murder Club, #2)"), (1858, "3rd Degree (Women's Murder Club, #3)"), (1763, "4th of July (Women's Murder C

In [14]:
# ----------------------------------------------------------------- #

def GetMainMenuOption():
    print("\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n")
    print("1. Enter '1' to enter a book index and get similar books to it")
    print("2. Enter '2' to get a list of books between two indices")
    print("3. Enter '3' to see if a book is in the list")
    print("4. Enter '4' to quit")
    answer = int(input("Enter 1, 2, 3, or 4: "))
    if answer < 1 or answer > 4:
        answer = 4
    return answer

    

answer = GetMainMenuOption()
while answer != 4:
    if answer == 1:
        book_index = int(input(f"\nEnter a book index (0 thru {num_books_in_list-1})"));
        if book_index >= 0 and book_index < num_books_in_list:
            book_title = book_names[book_index]
            print(f"\nBooks that are similar to '{book_title}' follows:\n")
            corr_bk = corr_mat[book_index]
            arr = [((corr_bk[i] <= 0.99) & (corr_bk[i]>0.8)) for i in range(num_books_in_list)]
            print("length arr = ",len(arr))
            top_books = [book_names[i] for i in range(len(book_names)) if arr[i]]
            #top_books = list(book_names[(corr_bk <= 0.99) & (corr_bk>0.8)])
            

            num_to_display = 10
            for i in range(0,num_to_display):
                if i < len(top_books):
                    print(top_books[i])
            
            if num_to_display < len(top_books):
                answer_char = input("\nSee all the books that are similar ('y' or 'n'): ")
                if answer_char == 'y':
                    print("\n")
                    for i in range(num_to_display,len(top_books)):
                        print(top_books[i])
                    

    elif answer == 2:
        # show them the books in the database
        low_index = int(input(f"\nenter lower-bound index (between '1' and '{num_books_in_list-1}'): "))
        high_index = int(input(f"\nenter high-bound index(between '1' and '{num_books_in_list-1}'): "))
        if (low_index < high_index and low_index > 0 and high_index < num_books_in_list):
            for i in range(low_index,high_index):
                book_title = book_list[i]
                print(f"\nIndex {i}, Title = '{book_title}'")
                
    elif answer == 3:
        title = input("\nEnter book title to search for: ")
        subtitle = title[0:10]  # search for the first 10 characters
        subtitle_len = len(subtitle)
        found_title = False
        for i in range(num_books_in_list):
            if book_list[i].find(subtitle,0) >= 0:
                print(f"\nThe first {subtitle_len} characters, '{subtitle}', is in the book list, '{book_list[i]}' has index {i}")
                found_title = True
        
        if found_title == False:
            print(f"\n'{title}' was not found")
    
    answer = GetMainMenuOption()           
    

print("\n- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n")
print("DONE!")


- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

1. Enter '1' to enter a book index and get similar books to it
2. Enter '2' to get a list of books between two indices
3. Enter '3' to see if a book is in the list
4. Enter '4' to quit
Enter 1, 2, 3, or 4: 1

Enter a book index (0 thru 2099)0

Books that are similar to ''Salem's Lot' follows:

length arr =  2100
Anne Rice's The Vampire Lestat: A Graphic Novel
Bag of Bones
Black House (The Talisman, #2)
Brother Odd (Odd Thomas, #3)
Carrie
Cell
Christine
Cujo
Cycle of the Werewolf
Desperation

See all the books that are similar ('y' or 'n'): y


Different Seasons
Doctor Sleep (The Shining, #2)
Dolores Claiborne
Dreamcatcher
Duma Key
Everything's Eventual: 14 Dark Tales
Firestarter
Forever Odd (Odd Thomas, #2)
Four Past Midnight 
From a Buick 8
Full Dark, No Stars
Gerald's Game
Ghost Story
Heart-Shaped Box
Hearts in Atlantis
Horns
I Am Legend and Other Stories
Insomnia
Intensity
It
Joyland
Just A

In [15]:
import pickle
pickle.dump( corr_mat, open( "save_corr_mat.p", "wb" ) )
pickle.dump( book_title_list, open( "save_book_title_list.p", "wb" ) )
pickle.dump( book_names, open( "save_book_names_list.p", "wb" ) )
pickle.dump( di_book_title, open( "save_orig_book_title_di.p", "wb" ))