In [50]:
import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD


In [51]:
# Load the raw book metadata and the user ratings from CSV files in the
# working directory.
books = pd.read_csv("books.csv")
# Column names kept for reference only: this list is not passed to read_csv.
# Presumably it mirrors the header of books.csv -- TODO confirm.
columns = [
    'id', 'book_id', 'best_book_id', 'work_id', 'books_count',
    'isbn', 'isbn13', 'authors',
    'original_publication_year', 'original_title', 'title', 'language_code',
    'average_rating', 'ratings_count', 'work_ratings_count',
    'work_text_reviews_count',
    'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
    'image_url', 'small_image_url',
]

ratings = pd.read_csv("ratings.csv")
# Rebinds `columns`: after this cell it holds only the ratings column names.
columns = ['book_id', 'user_id', 'rating']

In [52]:
# Combine the ratings and books datasets on their shared column name, "book_id".
# pd.merge defaults to an inner join, so ratings without a matching book row
# (and books without ratings) are dropped.
# NOTE(review): books.csv appears to carry both an `id` and a `book_id` column.
# Confirm that ratings.book_id really refers to books.book_id and not books.id;
# if it refers to books.id, this join pairs ratings with the wrong books and
# silently discards most of the catalogue.
combinedDataset = pd.merge(ratings, books, on="book_id")

# Preview the book metadata. A notebook cell only renders the value of its
# last expression, so the preview has to be the final statement of the cell.
books.head(10)


In [53]:
# Build the user x title utility matrix: one row per user_id, one column per
# book title, each cell holding that user's rating of the title (pivot_table
# averages duplicates by default). Pairs with no rating are filled with 0.
combined_utilityMatrix = pd.pivot_table(
    combinedDataset,
    values='rating',
    index="user_id",
    columns='title',
    fill_value=0,
)
# Show the first five users.
combined_utilityMatrix.head()



title,'Salem's Lot,"'Tis (Frank McCourt, #2)",1421: The Year China Discovered America,1776,1984,A Bend in the River,A Bend in the Road,A Brief History of Time,A Briefer History of Time,A Case of Need,...,"Women in Love (Brangwen Family, #2)",World War Z: An Oral History of the Zombie War,"World Without End (The Kingsbridge Series, #2)",Wuthering Heights,"Xenocide (Ender's Saga, #3)",Year of Wonders,You Shall Know Our Velocity!,Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,Zodiac,number9dream
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0


In [54]:
# Flip the utility matrix so that each row is a book title and each column is
# a user_id.
X = combined_utilityMatrix.transpose()
X

user_id,2,3,4,7,9,10,11,14,15,19,...,53404,53406,53408,53409,53416,53419,53420,53422,53423,53424
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Salem's Lot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"'Tis (Frank McCourt, #2)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1421: The Year China Discovered America,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Shall Know Our Velocity!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry Into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zodiac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Reduce the transposed matrix with truncated Singular Value Decomposition
# (SVD), compressing the user columns of X into 20 components per title.
# random_state is pinned because TruncatedSVD's default solver is randomized;
# without a seed the components (and every correlation derived from them)
# change from one run to the next.
SVD = TruncatedSVD(n_components=20, random_state=42)
# Fit the decomposition on X and return the reduced representation
# (one row per row of X, 20 columns).
transformed_matrix = SVD.fit_transform(X)
transformed_matrix.shape



(812, 20)

In [56]:
# Pearson correlation between every pair of rows of the reduced matrix.
# Each row of transformed_matrix is one book title described by its SVD
# components, so corr_matrix[i, j] measures how similar titles i and j are in
# that reduced space.
corr_matrix = np.corrcoef(transformed_matrix)

# Titles in the same order as the rows of transformed_matrix (X is the
# transpose of combined_utilityMatrix, so its rows are these columns).
# NOTE: this rebinds `books`, which previously held the DataFrame read from
# books.csv. Cells that expect that DataFrame must not be re-run after this
# one without reloading the data first.
books = combined_utilityMatrix.columns
books_list = list(books)

# Position of the reference book in books_list / corr_matrix.
# list.index raises ValueError if the title is not present.
target_title = "A History of God: The 4,000-Year Quest of Judaism, Christianity, and Islam"
current_book = books_list.index(target_title)


In [57]:
# Pick the books whose correlation with the reference book is above 0.8.
corr = corr_matrix[current_book]
is_similar = corr > 0.8
# Drop the reference book itself by position instead of testing `corr < 1.0`:
# the self-correlation is a floating-point value that is not guaranteed to be
# exactly 1.0, and a strict `< 1.0` test would also discard any other book
# whose coefficient happens to be exactly 1.0.
is_similar[current_book] = False
list(books[is_similar])

['All the Names',
 'Crow Lake',
 'Fall on Your Knees',
 'Homage to Catalonia',
 'The Giraffe and the Pelly and Me',
 'The Path Between the Seas: The Creation of the Panama Canal, 1870-1914',
 'Timequake']