In [44]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re

stop_words = nltk.corpus.stopwords.words('english')

In [45]:
df = pd.read_csv(r'.\booklist.csv', index_col=False)
print(df['title'].head(10))
print(df.shape)
df = df.sample(n=20000)
print(df['title'].head(10))
print(df.shape)

col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
39378                     The Midnight Ride of Paul Revere
40460    Guests of the Sheik: An Ethnography of an Iraq...
21118                                  Capturing the Devil
3398                                             Betrayals
5521                                           Angels Fall
29306                                Thank You, Mr. Falker
49048    Haunted Kent (Images Of England) (Images Of En..

In [46]:
# Characters, author and genre
df['genres'] = df['genres'].apply(literal_eval)
df['characters'] = df['characters'].apply(literal_eval)
df['setting'] = df['setting'].apply(literal_eval)

df['author'] = df['author'].apply(lambda x : x.split(", "))
print(df['author'].dtypes)
print(df['description'].dtypes)

df['soup'] = df['author'] + df['genres'] + df['characters'] + df['setting'] 
df['soup'] = df['soup'].apply(lambda x: ' '.join(x))


object
object


In [47]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

In [71]:
df['description'] = df['description'].apply(lambda x : normalize_document(str(x)))


df['soup'] = df['soup'] + " " + df['description']
print(df['description'].iloc[0])
print(df['soup'].iloc[0])

longfellow tribute famous revolutionary hero begins stirring cadence american school children committed memory century illustrator christoper bing adds luminous paintings historically rich engravings enrichments longfellow poem tying fiction fact really happened april night
Henry Wadsworth Longfellow Christopher H. Bing (Illustrator) Poetry Picture Books History Classics Childrens Nonfiction Historical American Revolution American History School longfellow tribute famous revolutionary hero begins stirring cadence american school children committed memory century illustrator christoper bing adds luminous paintings historically rich engravings enrichments longfellow poem tying fiction fact really happened april nightlongfellow tribute famous revolutionary hero begins stirring cadence american school children committed memory century illustrator christoper bing adds luminous paintings historically rich engravings enrichments longfellow poem tying fiction fact really happened april night l

In [72]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df['soup'].values.astype('U'))

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [73]:
print(cosine_sim.shape)
indices = pd.Series([i for i in range(len(df))] ,index=df['title'])
print(indices.head(10))
print(indices.shape)
print(cosine_sim)

(20000, 20000)
title
The Midnight Ride of Paul Revere                           0
Guests of the Sheik: An Ethnography of an Iraqi Village    1
Capturing the Devil                                        2
Betrayals                                                  3
Angels Fall                                                4
Thank You, Mr. Falker                                      5
Haunted Kent (Images Of England) (Images Of England)       6
The Spook’s Revenge                                        7
City of Glass                                              8
Awaken                                                     9
dtype: int64
(20000,)
[[1.00000000e+00 1.36994798e-03 2.52124685e-03 ... 2.62798961e-03
  1.58860487e-03 2.41027730e-04]
 [1.36994798e-03 1.00000000e+00 3.14064176e-03 ... 1.17648101e-03
  0.00000000e+00 1.51333044e-03]
 [2.52124685e-03 3.14064176e-03 1.00000000e+00 ... 1.14347358e-02
  1.12962958e-02 8.35699488e-03]
 ...
 [2.62798961e-03 1.17648101e-03 1.14347358e-0

In [74]:
def get_recommendations(title, no_of_recommendation):
    idx = indices[title]
    df['similarity score'] = np.array(cosine_sim[idx])
    df['weightedRating'] = (((df["rating"] -  1)/4) + (df["likedPercent"] * 0.01) + ((df['numRatings'] - df['numRatings'].min()) / (df['numRatings'].max() - df['numRatings'].min())))/3
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:no_of_recommendation+1]
    book_indices = [i[0] for i in sim_scores]
    book_cossim = df[["title", "similarity score", "weightedRating"]].iloc[book_indices]
    book_cossim = book_cossim.sort_values("weightedRating", ascending=False)
    return book_cossim

In [75]:
print(df['title'].iloc[0])
get_recommendations(df['title'].iloc[0], 10)

The Midnight Ride of Paul Revere


Unnamed: 0,title,similarity score,weightedRating
32319,Dante's Divine Comedy: Inferno,0.028903,0.615001
1413,The Raven,0.034137,0.604298
18767,As Long as We Both Shall Live,0.04902,0.595914
20647,The Love Letter,0.038597,0.569302
8104,An Absolutely Remarkable Thing,0.033788,0.565808
39984,The Organ Grinders,0.03004,0.550872
31959,Best Kept Secret,0.045759,0.542834
47471,"Five Smooth Stones : Hope's Diary, Philadelphi...",0.029414,0.538377
7610,In Our Time,0.050948,0.536288
17698,Tender Buttons,0.031665,0.504538
