In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re

# English stop-word list consulted by normalize_document when filtering tokens.
# NOTE(review): nltk reads this from its 'stopwords' corpus — presumably that
# corpus must already be installed (nltk.download('stopwords')); TODO confirm
# on a fresh environment.
stop_words = nltk.corpus.stopwords.words('english')

In [2]:
# Load the book catalogue from the kernel's working directory.
# A bare relative file name works on every OS; the previous r'.\booklist.csv'
# only resolved on Windows because '\' is not a path separator elsewhere.
df = pd.read_csv('booklist.csv')

# Column names of the catalogue (presumably the full header of booklist.csv —
# not used by the cells visible here; verify before relying on it).
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
# Subset of columns intended for the recommender (see TODO below).
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
# filtered_df = df[wanted_cols].head(5)

In [3]:
# Preview the first raw description, followed by two blank lines that separate
# it from the normalised version printed further down in this cell.
print(df['description'].iloc[0], end="\n\n\n")

def normalize_document(doc):
    """Clean a single text document before vectorisation.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space; the text is lower-cased and stripped; it is then
    tokenized with ``nltk.word_tokenize``; tokens present in the module-level
    ``stop_words`` list are dropped and the rest are joined with single spaces.

    Args:
        doc (str): Raw text to clean.

    Returns:
        str: The remaining lower-cased tokens separated by single spaces.
    """
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so passing re.I|re.A (== 258) positionally silently
    # limited the clean-up to the first 258 matches and applied no flags.
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document (comparison happens after lower-casing)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

# read_csv represents empty description fields as NaN. Converting those with
# str() produced the literal word "nan", which then became a shared TF-IDF
# term and made all books without a description look alike. Replace missing
# values with an empty string before normalising instead.
df['description'] = df['description'].fillna('').astype(str).apply(normalize_document)
# Preview the first description after normalisation.
print(df['description'].iloc[0])

WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.


winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve

In [4]:
# TF-IDF features over unigrams and bigrams of the cleaned descriptions.
# min_df must be an int >= 1 or a float in [0.0, 1.0]; min_df=0 is rejected by
# the parameter validation of newer scikit-learn releases. min_df=1 keeps every
# term that occurs in at least one document, i.e. the whole vocabulary.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['description'].values.astype('U'))
# To inspect the vocabulary use tf.get_feature_names_out(); get_feature_names()
# was removed in scikit-learn 1.2. Avoid tfidf_matrix.toarray() on the full
# catalogue: it materialises a dense documents x terms array.


In [5]:
# Pairwise similarity between every pair of descriptions. TfidfVectorizer
# L2-normalises each row by default, so the plain dot product computed by
# linear_kernel equals the cosine similarity. Row i holds the scores of book i
# against all books; by default the result is a dense square array.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)


In [6]:
# Lookup table: book title -> row label of that book in df (df keeps the
# default integer index from read_csv, so the label is also the row position
# used to address cosine_sim).
indices = pd.Series(df.index, index=df['title'])
# Several rows can share one title; without this, indices[title] returns a
# Series instead of a single integer and the similarity lookup breaks. Keep
# the first row for each title.
indices = indices[~indices.index.duplicated(keep='first')]
print(indices.head(10))
idx = indices["Harry Potter and the Order of the Phoenix"]


title
The Hunger Games                                                         0
Harry Potter and the Order of the Phoenix                                1
To Kill a Mockingbird                                                    2
Pride and Prejudice                                                      3
Twilight                                                                 4
The Book Thief                                                           5
Animal Farm                                                              6
The Chronicles of Narnia                                                 7
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings    8
Gone with the Wind                                                       9
dtype: int64


In [7]:
def get_recommendations(title, no_of_recommendation):
    """Return the books whose descriptions are most similar to ``title``.

    The title is looked up in the module-level ``indices`` Series, the matching
    row of the module-level ``cosine_sim`` matrix is read, and all *other*
    books are ranked by that score, highest first (ties keep row order).

    Args:
        title: Book title; must be a label of ``indices``. If several rows
            share the title, the first one is used.
        no_of_recommendation (int): Maximum number of books to return; values
            below 1 yield an empty result.

    Returns:
        pandas.DataFrame: Columns ``title`` and ``similarity score``, one row
        per recommended book, ordered from most to least similar.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.

    Side effects:
        Overwrites the ``similarity score`` column of the module-level ``df``
        with the scores for ``title`` (kept for backward compatibility) and
        prints the selected ``(row position, score)`` pairs.
    """
    idx = indices[title]
    # A title shared by several rows makes the lookup return a Series rather
    # than a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.array(cosine_sim[idx])
    df['similarity score'] = scores

    # Exclude the queried book by position instead of assuming it is ranked
    # first: with tied scores (e.g. identical descriptions) a lower-indexed
    # book sorts ahead of it, and slicing off element 0 would then drop a real
    # match and return the queried book itself.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(scores) if pos != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    sim_scores = ranked[:max(no_of_recommendation, 0)]
    print("sim score :", sim_scores)
    book_indices = [pos for pos, _ in sim_scores]
    return df[["title", "similarity score"]].iloc[book_indices]

In [8]:
# Ten closest matches for one sample title; the returned frame is the cell output.
get_recommendations(
    title="Harry Potter and the Order of the Phoenix",
    no_of_recommendation=10,
)

sim score : [(93, 0.13539402534536957), (126, 0.12636874612268817), (32, 0.08802578022596587), (11030, 0.08352963181239673), (7008, 0.067901759092262), (103, 0.06733821023824844), (51722, 0.06688608003832092), (21783, 0.06139585317488493), (1600, 0.06040568457632132), (34781, 0.05919730355113858)]


Unnamed: 0,title,similarity score
93,Harry Potter and the Prisoner of Azkaban,0.135394
126,Harry Potter and the Chamber of Secrets,0.126369
32,Harry Potter and the Sorcerer's Stone,0.088026
11030,Harry Potter Schoolbooks Box Set: Two Classic ...,0.08353
7008,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...",0.067902
103,Harry Potter and the Goblet of Fire,0.067338
51722,Hogwarts: An Incomplete and Unreliable Guide,0.066886
21783,Pronto,0.061396
1600,The Harry Potter Collection 1-4,0.060406
34781,The Unofficial Harry Potter Cookbook: From Cau...,0.059197
