In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re
from evaluation import mean_average_precision,ndcg_at_k

# English stop-word list from NLTK; normalize_document drops every token found in it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
stop_words = nltk.corpus.stopwords.words('english')

In [2]:
# Load the book catalogue. A forward-slash relative path is used because a
# '.\' prefix only resolves on Windows; './' works on Windows, Linux and macOS.
df = pd.read_csv('./booklist.csv', index_col=False, encoding="latin-1")
print(df['title'].head(10))
print(df.shape)
# Restrict to the first 100 rows; every later cell operates on this subset.
# .copy() makes it an independent frame so the column assignments in later
# cells write to this frame rather than to a view of the full catalogue.
df = df.head(100).copy()
# df = df.sample(n=20000)
# print(df['title'].head(10))
# print(df.shape)

# NOTE(review): this list spells 'numRating' while get_recommendations reads
# df['numRatings'] — confirm which spelling matches the CSV header.
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

# Show the dtypes of the columns that the next cell parses with literal_eval.
print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
object
object
object


In [3]:
# Characters, author and genre
def _parse_list_cell(value):
    """Return one raw cell as a list.

    Strings are parsed with literal_eval, values that are already lists are
    passed through (so re-running this cell is safe), and anything else
    (e.g. NaN for a missing cell) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return literal_eval(value)
    return []


def _split_authors(value):
    """Return the authors of one cell as a list, splitting strings on ", ".

    Lists are passed through unchanged and missing values become an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return value.split(", ")
    return []


df['genres'] = df['genres'].apply(_parse_list_cell)
df['characters'] = df['characters'].apply(_parse_list_cell)
df['setting'] = df['setting'].apply(_parse_list_cell)
df['author'] = df['author'].apply(_split_authors)
print(df['author'].dtypes)
print(df['description'].dtypes)

# Concatenate the per-book lists, then flatten each into one space-separated string.
df['soup'] = df['author'] + df['genres'] + df['characters'] + df['setting']
df['soup'] = df['soup'].apply(lambda x: ' '.join(x))


object
object


In [4]:
def normalize_document(doc):
    """Normalize a text document before vectorization.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space; the text is then lower-cased, stripped, tokenized
    with nltk.word_tokenize, and tokens present in the module-level
    ``stop_words`` are removed.

    Args:
        doc: the text to normalize (str).

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional flag value would silently cap the
    # number of substitutions instead of setting flags.
    doc = re.sub(r'[^a-zA-Z0-9\s]', ' ', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document (set gives constant-time membership checks)
    stop_set = set(stop_words)
    filtered_tokens = [token for token in tokens if token not in stop_set]
    # re-create document from filtered tokens
    return ' '.join(filtered_tokens)

In [5]:
# Blank out missing descriptions first: calling str() on NaN would otherwise
# inject the literal token "nan" into the description and the soup.
df['description'] = df['description'].fillna('').astype(str).apply(normalize_document)


# NOTE: this appends to the existing 'soup' column, so re-running this cell
# without first re-running the cell that builds 'soup' appends the description again.
df['soup'] = df['soup'] + " " + df['description']
print(df['description'].iloc[0])
print(df['soup'].iloc[0])

winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win start making choices weight survival humanity life love
Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) 

In [6]:
# min_df=1 keeps every term that occurs in at least one document. An integer
# min_df of 0 selects the same vocabulary but is rejected by the parameter
# validation of recent scikit-learn releases (integer min_df must be >= 1).
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df['soup'].values.astype('U'))

count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
count_matrix = count.fit_transform(df['soup'])

# Pairwise similarity between all books, computed as dot products of the TF-IDF rows.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [7]:
# Inspect the similarity matrix and its dimensions.
print(cosine_sim)
print(cosine_sim.shape)

# Lookup from book title to row position; get_recommendations uses it to
# find the similarity row of the queried book.
row_positions = list(range(len(df)))
indices = pd.Series(row_positions, index=df['title'])
print(indices.head(10))
print(indices.shape)
print(cosine_sim)

[[1.         0.00598356 0.00947881 ... 0.00675359 0.0070943  0.0093164 ]
 [0.00598356 1.         0.0070761  ... 0.00373122 0.00321951 0.00806997]
 [0.00947881 0.0070761  1.         ... 0.01320548 0.01130528 0.01470862]
 ...
 [0.00675359 0.00373122 0.01320548 ... 1.         0.01824504 0.01649287]
 [0.0070943  0.00321951 0.01130528 ... 0.01824504 1.         0.01277869]
 [0.0093164  0.00806997 0.01470862 ... 0.01649287 0.01277869 1.        ]]
(100, 100)
title
The Hunger Games                                                         0
Harry Potter and the Order of the Phoenix                                1
To Kill a Mockingbird                                                    2
Pride and Prejudice                                                      3
Twilight                                                                 4
The Book Thief                                                           5
Animal Farm                                                              6
The Chronicles

In [8]:
def get_recommendations(title, no_of_recommendation):
    """Return the books most similar to ``title``, ordered by weighted rating.

    Reads the module-level ``indices``, ``cosine_sim`` and ``df``. As a side
    effect the 'similarity score' and 'weightedRating' columns of ``df`` are
    (re)written on every call.

    Args:
        title: title of the query book; must be a label of ``indices``. If the
            title occurs more than once, the first occurrence is used.
        no_of_recommendation: how many of the most similar books to return.

    Returns:
        DataFrame with the columns 'title', 'similarity score' and
        'weightedRating' for the most similar books (the query book itself is
        never included), sorted by 'weightedRating' in descending order.

    Raises:
        KeyError: if ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    # A duplicated title makes the lookup return a Series of positions.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    df['similarity score'] = np.array(cosine_sim[idx])
    # Mean of three terms: rating mapped from 1..5 to 0..1, likedPercent as a
    # fraction, and min-max scaled numRatings.
    df['weightedRating'] = (((df["rating"] -  1)/4) + (df["likedPercent"] * 0.01) + ((df['numRatings'] - df['numRatings'].min()) / (df['numRatings'].max() - df['numRatings'].min())))/3

    # Drop the query book by position rather than assuming it sorts first:
    # another book with an equal top score (e.g. a duplicate entry) may
    # precede it in the ranking.
    candidates = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top = candidates[:max(no_of_recommendation, 0)]
    book_indices = [i for i, _ in top]

    book_cossim = df[["title", "similarity score", "weightedRating"]].iloc[book_indices]
    book_cossim = book_cossim.sort_values("weightedRating", ascending=False)
    return book_cossim

In [9]:
# Example query: request 20 recommendations for one title; the returned frame
# (sorted by weightedRating) is displayed as the cell's output.
# print(df['title'].iloc[0])
get_recommendations("Harry Potter and the Order of the Phoenix", 20)

Unnamed: 0,title,similarity score,weightedRating
32,Harry Potter and the Sorcerer's Stone,0.38065,0.9425
71,Harry Potter and the Deathly Hallows,0.539947,0.758119
93,Harry Potter and the Prisoner of Azkaban,0.480838,0.757038
80,1984,0.026388,0.724749
30,The Help,0.020414,0.714506
77,The Fellowship of the Ring,0.022711,0.704526
46,A Game of Thrones,0.018227,0.698439
42,The Lightning Thief,0.026925,0.678756
28,City of Bones,0.016658,0.637415
8,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,0.019866,0.626667


In [10]:
# Load the evaluation set. A forward-slash relative path is used because a
# '.\' prefix only resolves on Windows; './' works on Windows, Linux and macOS.
evaldf = pd.read_csv('./small_eval.csv', index_col=False, encoding="utf-8")
# The 'recommended' column is stored as a Python literal; parse it into an object.
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    """Build the binary relevance vector of the top-k recommendations for a book.

    Calls get_recommendations(booktitle, k) and marks each returned title with
    1 if it appears in the 'recommended' entry of ``evaldf`` for ``booktitle``,
    else 0.

    Args:
        booktitle: title of the query book; must have a row in ``evaldf``.
        k: number of recommendations to request and length of the result.

    Returns:
        A list of exactly ``k`` ints (0 or 1) in the order the recommendations
        were returned. If fewer than ``k`` recommendations come back, the
        missing positions are filled with 0.

    Raises:
        IndexError: if ``evaldf`` has no row for ``booktitle``.
    """
    predictions = get_recommendations(booktitle,k)

    # Select the ground-truth entry by mask and take the first match by
    # position; this does not depend on how evaldf is indexed.
    matches = evaldf.loc[evaldf['title'] == booktitle, 'recommended']
    if matches.empty:
        raise IndexError(f"no evaluation entry for title {booktitle!r}")
    actual = matches.iloc[0]

    # print(predictions)
    # print(actual)

    r = [1 if predicted in actual else 0 for predicted in predictions['title'].iloc[:k]]
    # Pad so the vector always has length k, even for a short prediction list.
    r.extend([0] * (k - len(r)))

    return r

In [11]:
# Evaluate the recommender on the first 30 books: print NDCG@20 per book,
# then the mean average precision over all collected relevance vectors.
rs = []
for book in range(30):
    query_title = df['title'].iloc[book]
    r = getr(query_title)
    print(ndcg_at_k(r, 20))
    rs.append(r)

print(mean_average_precision(rs))

0.9502344167898356
0.2946456790715744
0.9581977340790593
0.925358623919311
0.9333585904239958
0.9876955095526512
0.8801419222862238
0.6098717386602533
0.6410792916814373
0.8715776549323655
0.8543765286771784
0.4990555036344162
0.4202343573058318
0.8023907673666874
0.9778585125241057
0.7868926468099412
0.908058938832676
0.5252894218316833
0.8560923338990806
0.6421474625954678
0.6173702175535636
1.0
0.9016405666782845
0.9354909279468936
0.570065590152532
0.7113217937622272
0.8251010200361474
0.8817886798142951
0.5228494960089104
0.6661034241472861
0.6134387020647033
