In [175]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re
from evaluation import mean_average_precision,ndcg_at_k

# English stop-word list consulted by normalize_document when filtering tokens.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally (e.g. via nltk.download('stopwords')) -- confirm for fresh environments.
stop_words = nltk.corpus.stopwords.words('english')

In [176]:
# Load the book catalogue from the working directory. A plain relative file
# name resolves on every OS, whereas a '.\'-prefixed path is only a valid
# relative path on Windows.
df = pd.read_csv('booklist.csv', index_col=False, encoding="latin-1")
print(df['title'].head(10))
print(df.shape)

# Work on the first 5000 rows only; every later cell operates on this subset.
df = df.head(5000)

# NOTE(review): col_names is not referenced by the visible cells and lists
# 'numRating', while get_recommendations reads 'numRatings' -- confirm against
# the real CSV header before relying on this list.
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

# The list-like columns are read as plain strings (dtype object) at this point.
print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
object
object
object


In [177]:
# Build the metadata "soup": author, genre, character and setting tokens of
# each book joined into one space-separated string.
def _as_list(value):
    """Parse a stringified list; pass parsed lists through; missing values -> []."""
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, (list, tuple)):
        # Already parsed (the cell was re-run): keep the value instead of crashing.
        return list(value)
    return []  # NaN / None


def _split_authors(value):
    """Split a ', '-separated author string; tolerate parsed lists and missing values."""
    if isinstance(value, str):
        return value.split(", ")
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


for list_col in ('genres', 'characters', 'setting'):
    df[list_col] = df[list_col].apply(_as_list)
df['author'] = df['author'].apply(_split_authors)
print(df['author'].dtypes)
print(df['description'].dtypes)

# Adding Series of lists concatenates the lists row by row.
df['soup'] = df['author'] + df['genres'] + df['characters'] + df['setting']
df['soup'] = df['soup'].apply(' '.join)


object
object


In [178]:
def normalize_document(doc):
    """Normalise a text for vectorisation.

    Replaces every character that is not an ASCII letter, digit or whitespace
    with a space, lower-cases the text, tokenises it with
    ``nltk.word_tokenize`` and removes the tokens found in the module-level
    ``stop_words`` list.

    Parameters
    ----------
    doc : str
        Raw text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # ``flags`` must be passed by keyword: the fourth positional parameter of
    # ``re.sub`` is ``count``, so passing ``re.I | re.A`` positionally capped
    # the number of replacements at 258 and never applied the flags.
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # A set gives constant-time membership tests while filtering stop words.
    stop_set = set(stop_words)
    return ' '.join(token for token in tokens if token not in stop_set)

In [179]:
# Normalise the descriptions. Missing descriptions are blanked first, because
# str(NaN) would otherwise inject the literal token "nan" into the soup.
df['description'] = df['description'].fillna('').astype(str).apply(normalize_document)

# NOTE: this appends to the existing soup, so re-running the cell appends the
# description a second time; rebuild 'soup' first when re-executing.
df['soup'] = df['soup'] + " " + df['description']
print(df['description'].iloc[0])
print(df['soup'].iloc[0])

winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win start making choices weight survival humanity life love
Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) 

In [180]:
# Vectorise the soup with unigrams and bigrams.
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary integer min_df=0 was meant to give; current scikit-learn
# versions reject an integer min_df below 1 during parameter validation.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['soup'].values.astype('U'))

count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [181]:
# Show the similarity matrix and its shape, then build the lookup that maps a
# book title to its row position in `df` / `cosine_sim`.
print(cosine_sim, cosine_sim.shape, sep='\n')
indices = pd.Series(range(len(df)), index=df['title'])
print(indices.head(10), indices.shape, sep='\n')
print(cosine_sim)

[[1.00000000e+00 2.91851539e-03 5.26098958e-03 ... 3.31765390e-03
  3.84680504e-03 2.79715626e-04]
 [2.91851539e-03 1.00000000e+00 4.61624965e-03 ... 3.68303052e-03
  7.85337040e-03 2.51782139e-03]
 [5.26098958e-03 4.61624965e-03 1.00000000e+00 ... 8.90615832e-03
  2.92957087e-03 2.61495065e-03]
 ...
 [3.31765390e-03 3.68303052e-03 8.90615832e-03 ... 1.00000000e+00
  6.59909649e-03 5.32069424e-04]
 [3.84680504e-03 7.85337040e-03 2.92957087e-03 ... 6.59909649e-03
  1.00000000e+00 6.08153068e-04]
 [2.79715626e-04 2.51782139e-03 2.61495065e-03 ... 5.32069424e-04
  6.08153068e-04 1.00000000e+00]]
(5000, 5000)
title
The Hunger Games                                                         0
Harry Potter and the Order of the Phoenix                                1
To Kill a Mockingbird                                                    2
Pride and Prejudice                                                      3
Twilight                                                                 4
The Bo

In [182]:
from distutils import core


def get_recommendations(title, no_of_recommendation):
    """Return the books most similar to ``title``, ordered by weighted rating.

    Reads the module-level ``indices`` (title -> row position), ``cosine_sim``
    (pairwise similarity matrix) and ``df``. As a side effect, the columns
    'similarity score' and 'weightedRating' of ``df`` are (re)written on
    every call.

    Parameters
    ----------
    title : str
        Title of the query book; when several rows share the title, the first
        one is used.
    no_of_recommendation : int
        Maximum number of books to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'similarity score' and 'weightedRating' for the
        ``no_of_recommendation`` most similar books (never the query book
        itself), sorted by 'weightedRating' in descending order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    match = indices[title]
    # Duplicate titles make the lookup return a Series; use the first occurrence.
    idx = match.iloc[0] if isinstance(match, pd.Series) else match

    scores = np.array(cosine_sim[idx])
    df['similarity score'] = scores

    # Mean of three terms: rating mapped from [1, 5] to [0, 1], liked
    # percentage as a fraction, and min-max normalised number of ratings.
    num_ratings = df['numRatings']
    ratings_span = num_ratings.max() - num_ratings.min()
    # Guard the min-max normalisation: a zero span would turn every weighted
    # rating into NaN.
    popularity = (num_ratings - num_ratings.min()) / ratings_span if ratings_span else 0.0
    df['weightedRating'] = (((df["rating"] - 1) / 4) + (df["likedPercent"] * 0.01) + popularity) / 3

    # Rank by descending similarity (stable, so ties keep row order) and drop
    # the query row explicitly. Skipping "the first result" is not safe: a
    # different row with an identical similarity of 1.0 can sort ahead of the
    # query book, which would then be recommended to itself.
    order = np.argsort(-scores, kind='stable')
    wanted = max(no_of_recommendation, 0)
    book_indices = [int(i) for i in order if i != idx][:wanted]

    book_cossim = df[["title", "similarity score", "weightedRating"]].iloc[book_indices]
    return book_cossim.sort_values("weightedRating", ascending=False)

In [183]:
# Demo: the 20 nearest neighbours of one well-known title.
get_recommendations(title="Harry Potter and the Order of the Phoenix", no_of_recommendation=20)

Unnamed: 0,title,similarity score,weightedRating
32,Harry Potter and the Sorcerer's Stone,0.398687,0.9425
71,Harry Potter and the Deathly Hallows,0.57037,0.7613
93,Harry Potter and the Prisoner of Azkaban,0.495859,0.760222
103,Harry Potter and the Goblet of Fire,0.506937,0.74937
126,Harry Potter and the Chamber of Secrets,0.496368,0.74141
105,Harry Potter and the Half-Blood Prince,0.421241,0.739447
409,Harry Potter Series Box Set,0.069753,0.649406
2286,Harry Potter Collection,0.107498,0.638962
1600,The Harry Potter Collection 1-4,0.10512,0.635928
3576,Harry Potter and the Order of the Phoenix (Har...,0.155798,0.632952


In [184]:
# Load the evaluation set from the working directory. A plain relative file
# name resolves on every OS; a '.\'-prefixed path only works on Windows.
evaldf = pd.read_csv('small_eval.csv', index_col=False, encoding="utf-8")
# 'recommended' is stored as a stringified Python list; parse it back.
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    """Build the binary relevance vector for the top-``k`` recommendations.

    Calls ``get_recommendations(booktitle, k)`` and marks each returned title
    with 1 when it appears in the 'recommended' entry of the first ``evaldf``
    row whose 'title' equals ``booktitle``, else 0.

    Parameters
    ----------
    booktitle : str
        Title to evaluate.
    k : int, default 20
        Number of recommendations to request and length of the result.

    Returns
    -------
    list of int
        ``k`` zeros/ones in the order the recommendations were returned;
        positions for which no recommendation exists are filled with 0.

    Raises
    ------
    IndexError
        If ``booktitle`` has no row in ``evaldf``.
    """
    predictions = get_recommendations(booktitle, k)

    matches = evaldf.loc[evaldf['title'] == booktitle, 'recommended']
    if matches.empty:
        raise IndexError(f"no evaluation entry for title {booktitle!r}")
    # Positional access on the filtered rows: feeding the index *label* back
    # into .iloc only works while evaldf carries a default RangeIndex.
    actual = matches.iloc[0]

    size = max(k, 0)
    predicted_titles = list(predictions['title'])[:size]
    r = [1 if candidate in actual else 0 for candidate in predicted_titles]
    # Fewer than k recommendations can come back; pad so the vector always
    # has length k instead of raising on the missing positions.
    r.extend([0] * (size - len(r)))
    return r

In [187]:
# Evaluate the recommender on the first 30 catalogue titles.
eval_k = 20  # one cut-off shared by getr and ndcg_at_k so they cannot drift apart
rs=[]
ndcg_at_k_list = []
showcase = df['title'].values[0:30]
# Iterate over the showcase itself rather than a second hard-coded range(30),
# so the loop and the 'books' column of the results always have the same length.
for book_title in showcase:
    r = getr(book_title, eval_k)
    rs.append(r)
    ndcg_at_k_list.append(ndcg_at_k(r, eval_k))

results = pd.DataFrame({'books': showcase, 'ndcg': ndcg_at_k_list})

print(results.head(20))

print("average ndcg: ", np.mean(ndcg_at_k_list))
print("mean average precision: ",mean_average_precision(rs))

                                                books      ndcg
0                                    The Hunger Games  1.000000
1           Harry Potter and the Order of the Phoenix  0.270413
2                               To Kill a Mockingbird  1.000000
3                                 Pride and Prejudice  1.000000
4                                            Twilight  0.000000
5                                      The Book Thief  0.750000
6                                         Animal Farm  0.639471
7                            The Chronicles of Narnia  0.000000
8   J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...  0.322636
9                                  Gone with the Wind  0.615689
10                             The Fault in Our Stars  1.000000
11               The Hitchhiker's Guide to the Galaxy  0.333333
12                                    The Giving Tree  0.678104
13                                  Wuthering Heights  0.836447
14                                  The 