In [1]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features

In [2]:
# Directory where all data is saved to and loaded from.
# NOTE: per the original author's note, this path is RELATIVE TO INSIDE THE UTIL FOLDER,
# not relative to this notebook.
data_path = '../goodbooks-10k/'

In [3]:
# Title of the book to get recommendations for.
# It is used as the lookup key into `indices`, whose keys come from books['title'],
# so it has to match a title in that column exactly.
title = 'The Fellowship of the Ring (The Lord of the Rings, #1)'

In [17]:
# Function that takes in book feature similarity matrices as input and outputs the most similar books
def get_recommendations(df, indices, title, similarities, weights, n=10, exclude_self=False):
    """Return the titles of the books most similar to ``title``.

    Every book is scored by the weighted *sum* (it is not averaged) of its
    similarity to the query book across the given similarity matrices, and
    the ``n`` best-scoring books are returned, best first. Books with equal
    scores keep their original row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with a ``'title'`` column. Row position ``i`` must line up
        with row/column ``i`` of every similarity matrix.
    indices : pandas.Series
        Lookup from title to book id. Ids are treated as 1-based: the book
        with id ``k`` is read from row ``k - 1`` of the similarity matrices.
    title : str
        Title of the query book; must be a key of ``indices``. If the title
        occurs more than once in ``indices``, the first match is used.
    similarities : sequence of 2-D array-like
        Square book-by-book similarity matrices.
    weights : sequence of numbers
        ``weights[j]`` multiplies ``similarities[j]``. Matrices beyond
        ``len(weights)`` are ignored.
    n : int, default 10
        Number of books to return.
    exclude_self : bool, default False
        When True, the query book itself (normally the top hit, since a book
        is maximally similar to itself) is left out of the result.

    Returns
    -------
    pandas.Series
        The ``'title'`` values of the selected rows of ``df``, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not a key of ``indices``.
    IndexError
        If ``similarities`` is empty or there are more weights than matrices.
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # Get the id of the book that matches the title
    idx = indices[title]
    if np.ndim(idx) > 0:
        # Duplicate titles make the lookup return several ids; use the first one
        idx = np.asarray(idx).ravel()[0]
    # Ids are 1-based while matrix rows are 0-based
    idx = int(idx) - 1

    # Get the total number of books
    num_books = len(similarities[0])

    # Weighted sum of the query book's pairwise similarity row from each matrix
    scores = np.zeros(num_books)
    for j, weight in enumerate(weights):
        row = np.asarray(similarities[j][idx])[:num_books]
        scores = scores + row * weight

    # Highest score first; a stable sort keeps tied books in their original order
    order = np.argsort(-scores, kind='stable')
    if exclude_self:
        order = order[order != idx]

    # Keep the n most similar books and return their titles
    book_indices = order[:n]
    return df['title'].iloc[book_indices]

In [7]:
# Load the books DataFrame through the project's `loader` module (imported from ../Util),
# reading from `data_path`
books = get_book_dataframe(data_path)

found books_dataframe in file...


In [8]:
# Construct a reverse map from book title to the book's index label.
# De-duplicate on the titles (the Series index), keeping the first occurrence:
# Series.drop_duplicates() compares the *values* (the book index labels), so it would
# leave repeated titles in place and a title lookup could return several rows.
indices = pd.Series(books.index, index=books['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [9]:
# Inspect the title -> book index lookup
print(indices)

title
The Hunger Games (The Hunger Games, #1)                                                          1
Harry Potter and the Sorcerer's Stone (Harry Potter, #1)                                         2
Twilight (Twilight, #1)                                                                          3
To Kill a Mockingbird                                                                            4
The Great Gatsby                                                                                 5
                                                                                             ...  
Bayou Moon (The Edge, #2)                                                                     9996
Means of Ascent (The Years of Lyndon Johnson, #2)                                             9997
The Mauritius Command                                                                         9998
Cinderella Ate My Daughter: Dispatches from the Frontlines of the New Girlie-Girl Culture     9999
The 

In [10]:
# Produce the book feature matrix with the project's `loader` module and
# display its shape (rows, columns)
feature_matrix = get_book_features(books)
feature_matrix.shape

feature_matrix exists in file...


(10000, 82203)

In [11]:
# Compute the pairwise cosine similarity between all rows of the feature matrix.
# NOTE: the result has one row and one column per input row, so its size grows
# quadratically with the number of books.
cosine_sim_features = cosine_similarity(feature_matrix)

In [18]:
# Get recs using the combined feature matrix only: a single similarity matrix with
# weight 1, so the ranking is driven by feature similarity alone.
similarities_features = [cosine_sim_features]
weights_features = [1]
recs_features = get_recommendations(books, indices, title, similarities_features, weights_features)
# Save the recommended titles (without the index column) to recs_features.csv
recs_features.to_csv('recs_features.csv', index=False)

In [None]:
'''

Load in Item Matrix

'''

In [19]:
# Load in the saved item_matrix (concepts and features) from disk and display its shape.
# The .npy file is expected to exist already; it is not created in the cells shown here.
filename = '../.tmp/item_matrix.npy'
item_matrix = np.load(filename)
item_matrix.shape

(10000, 2000)

In [20]:
# Compute the pairwise cosine similarity between all rows of the item matrix
cosine_sim_item_matrix = cosine_similarity(item_matrix)

In [21]:
# Get recs using the item matrix (concepts and features) only: a single similarity
# matrix with weight 1.
similarities_item_matrix = [cosine_sim_item_matrix]
weights_item_matrix = [1]
recs_item_matrix = get_recommendations(books, indices, title, similarities_item_matrix, weights_item_matrix)
# Save the recommended titles (without the index column) to recs_item_matrix.csv
recs_item_matrix.to_csv('recs_item_matrix.csv', index=False)

In [13]:
# Test with only a narrow slice of the item matrix: all rows, columns 5 through 9
# (5 columns), then display the slice's shape
item_matrix_test = item_matrix[:,5:10]
item_matrix_test.shape

(10000, 5)

In [14]:
# Compute the cosine similarity matrix for the sliced test matrix (item_matrix_test)
cosine_sim_test = cosine_similarity(item_matrix_test)

In [15]:
# Get recs using only the sliced test matrix (single similarity matrix, weight 1)
# and display them.
# NOTE(review): the saved output of this cell appears to predate the current
# get_recommendations definition (it lists more than 10 rows) - re-run to confirm.
similarities_test = [cosine_sim_test]
weights_test = [1]
recs_test = get_recommendations(books, indices, title, similarities_test, weights_test)
recs_test

id
19      The Fellowship of the Ring (The Lord of the Ri...
6527                              Saga, Vol. 6 (Saga, #6)
3846            Between the Lines (Between the Lines, #1)
536                             Red Queen (Red Queen, #1)
1018                 Royal Assassin (Farseer Trilogy, #2)
9512                                           شكلها باظت
3663                  Squire (Protector of the Small, #3)
4632                     The Indwelling (Left Behind, #7)
9119    When Christ and His Saints Slept  (Henry II & ...
5335                              The Palace of Illusions
1333                  Benjamin Franklin: An American Life
2096            Polar Bear, Polar Bear, What Do You Hear?
8352                               The Devil All the Time
7212                                  Captains Courageous
3343                              Someday, Someday, Maybe
3012                                                Maude
763                                        The Bluest Eye
1799       