In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re

# English stop-word list from NLTK's "stopwords" corpus; normalize_document()
# drops every token found in this list.
# NOTE: NLTK raises LookupError here unless the corpus has been fetched once
# (e.g. with nltk.download('stopwords')).
stop_words = nltk.corpus.stopwords.words('english')

In [2]:
# Load the book catalogue from the working directory. A plain relative path is
# used because '.\booklist.csv' only resolves on Windows; on macOS/Linux the
# backslash is part of the file name and the read fails.
df = pd.read_csv('booklist.csv', index_col=False)
print(df['title'].head(10))
print(df.shape)

# Cap the number of books: the similarity matrix computed further down is
# dense and square in the number of rows, so memory grows quadratically.
# (Alternative for a random subset instead of the first rows: df.sample(n=20000).)
MAX_BOOKS = 10000
df = df.head(MAX_BOOKS)

col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

# These three columns are parsed with literal_eval in the next cell; printing
# the dtypes shows how pandas loaded them.
print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
14717                                               Eloise
8771           The Death and Life of Great American Cities
25073                                         Rubbernecker
9841                             Out of Sight, Out of Mind
21190    The Rhino Crash: A Memoir of Conservation, Unl...
45860                                               Bolero
4091                                       Bad Moon Risin

In [3]:
def _parse_list(value):
    """Return a list-valued cell as a Python list.

    Strings are parsed with literal_eval; values that are already lists (the
    cell was executed before) are returned unchanged; anything else, such as a
    missing value, becomes an empty list.
    """
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, list):
        return value
    return []


def _split_authors(value):
    """Split a comma-separated author string into a list of names.

    Lists are returned unchanged and non-string values become an empty list,
    so the cell can be re-run without raising.
    """
    if isinstance(value, str):
        return value.split(", ")
    if isinstance(value, list):
        return value
    return []


# Characters, author and genre
for _list_col in ('genres', 'characters', 'setting'):
    df[_list_col] = df[_list_col].apply(_parse_list)
df['author'] = df['author'].apply(_split_authors)
print(df['author'].dtypes)
print(df['description'].dtypes)

# "Soup": every metadata token of a book concatenated into one
# space-separated string, used later as the text to vectorize.
df['soup'] = (df['author'] + df['genres'] + df['characters'] + df['setting']).apply(' '.join)


object
object


In [4]:
def normalize_document(doc):
    """Clean a text document for vectorization.

    Steps: replace every character that is not an ASCII letter, digit or
    whitespace with a space, lower-case and strip the text, tokenize it with
    NLTK, and drop tokens found in the module-level ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A positionally silently limited
    # the substitution to the first 258 matches and applied no flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # Set membership keeps the stop-word filter O(1) per token.
    stop_set = set(stop_words)
    filtered_tokens = [token for token in tokens if token not in stop_set]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [5]:
# Normalise the free-text description. A missing description must become an
# empty string: str(NaN) is the literal text "nan", which would otherwise be
# added to the soup of every such book and make them look alike.
df['description'] = df['description'].apply(
    lambda text: normalize_document(str(text)) if pd.notna(text) else ''
)

# Append the cleaned description to the metadata soup.
# NOTE: this concatenation is not idempotent; re-running this cell alone
# appends the description a second time, so rebuild df['soup'] first.
df['soup'] = df['soup'] + " " + df['description']
print(df['description'].iloc[0])
print(df['soup'].iloc[0])

meet eloise precocious darling plaza hotel eloise little girl lives plaza hotel new york yet pretty already person henry james would want study queen victoria would recognize equal new york jets would want side lewis carroll would love got initial shock knows everything plaza interested people boring inner resources take home always glad
Kay Thompson Hilary Knight Picture Books Childrens Fiction Classics Humor New York Juvenile Kids Adventure Realistic Fiction Mr. Salomone Weenie Skipperdee Thomas Bill Nanny René Johanna Joe Vincent Philip Lily Eloise meet eloise precocious darling plaza hotel eloise little girl lives plaza hotel new york yet pretty already person henry james would want study queen victoria would recognize equal new york jets would want side lewis carroll would love got initial shock knows everything plaza interested people boring inner resources take home always glad


In [6]:
# Both vectorizers read the same unicode array of soup strings.
soup_texts = df['soup'].values.astype('U')

# Unigram + bigram TF-IDF. min_df=1 keeps every term that occurs in at least
# one document, which is the same vocabulary min_df=0 selected, while staying
# inside the integer range scikit-learn documents for this parameter.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(soup_texts)

# Raw-count alternative, kept for the cosine_similarity variant below.
count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(soup_texts)

# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [7]:
print(cosine_sim)
print(cosine_sim.shape)

# Lookup from book title to row position in df / cosine_sim.
# NOTE: titles are not guaranteed to be unique; for a duplicated title
# indices[title] returns a Series of positions instead of a single integer.
indices = pd.Series(range(len(df)), index=df['title'])
print(indices.head(10))
print(indices.shape)

[[1.         0.00276782 0.00505917 ... 0.00456839 0.00259938 0.00687278]
 [0.00276782 1.         0.00438346 ... 0.00352644 0.00418344 0.00274461]
 [0.00505917 0.00438346 1.         ... 0.00653436 0.00810144 0.00180712]
 ...
 [0.00456839 0.00352644 0.00653436 ... 1.         0.17711616 0.00691302]
 [0.00259938 0.00418344 0.00810144 ... 0.17711616 1.         0.0097533 ]
 [0.00687278 0.00274461 0.00180712 ... 0.00691302 0.0097533  1.        ]]
(10000, 10000)
title
The Hunger Games                                                         0
Harry Potter and the Order of the Phoenix                                1
To Kill a Mockingbird                                                    2
Pride and Prejudice                                                      3
Twilight                                                                 4
The Book Thief                                                           5
Animal Farm                                                              6
The Chroni

In [8]:
def get_recommendations(title, no_of_recommendation):
    """Return the books most similar to ``title``, ordered by a popularity score.

    The ``no_of_recommendation`` rows with the highest cosine similarity to
    the queried book are selected (the queried row itself is excluded) and
    then sorted by ``weightedRating``, the mean of three values: the rating
    rescaled from 1-5 to 0-1, ``likedPercent`` as a fraction, and the min-max
    normalised ``numRatings``.

    Reads the module-level ``indices``, ``cosine_sim`` and ``df``. As a side
    effect the columns ``'similarity score'`` and ``'weightedRating'`` of
    ``df`` are (re)written on every call.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``. If several rows share the title, the
        first one is used.
    no_of_recommendation : int
        Maximum number of rows to return; values below 1 give an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``similarity score`` and ``weightedRating``, sorted
        by ``weightedRating`` in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title shared by several rows yields a Series of positions; use the
    # first match so the similarity lookup below stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    similarity = np.asarray(cosine_sim[idx])
    df['similarity score'] = similarity

    num_ratings = df['numRatings']
    ratings_span = num_ratings.max() - num_ratings.min()
    if ratings_span > 0:
        popularity = (num_ratings - num_ratings.min()) / ratings_span
    else:
        # All books have the same number of ratings: avoid a 0/0 division.
        popularity = 0.0
    df['weightedRating'] = (((df["rating"] - 1) / 4) + (df["likedPercent"] * 0.01) + popularity) / 3

    # Stable descending order keeps the original row order among ties. The
    # queried row is removed explicitly instead of assuming it sorts first,
    # which fails when another row ties with it at a lower position.
    ranked = np.argsort(-similarity, kind='stable')
    ranked = ranked[ranked != idx]
    book_indices = ranked[:max(int(no_of_recommendation), 0)]

    book_cossim = df[["title", "similarity score", "weightedRating"]].iloc[book_indices]
    return book_cossim.sort_values("weightedRating", ascending=False)

In [9]:
# Example query: up to 20 books most similar to the given title, returned
# sorted by weightedRating.
get_recommendations("Harry Potter and the Order of the Phoenix", 20)

Unnamed: 0,title,similarity score,weightedRating
46343,The Connection,0.054956,0.600926
9741,Too Consumed,0.054684,0.592836
3423,Mine,0.053351,0.592157
47255,Sacrifice,0.072951,0.590361
45442,Seductive Nights Trilogy Bundle (Seductive Nig...,0.057863,0.585887
41270,Bad Judgment,0.058771,0.564635
42006,The Bad Boy in the Glasses,0.053061,0.562513
20570,Sweetness,0.054299,0.547539
48836,Since Drew,0.053994,0.545907
28361,Wildcard: Volume One,0.054957,0.532547
