In [228]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re
from evaluation import mean_average_precision,ndcg_at_k
stop_words = nltk.corpus.stopwords.words('english')

In [229]:
df = pd.read_csv(r'.\booklist.csv', index_col=False,encoding="latin-1")
print(df['title'].head(10))
print(df.shape)
df = df.head(5000)
# df = df.sample(n=1000)
# print(df['title'].head(10))
# print(df.shape)

col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
object
object
object


In [230]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    # doc = ' '.join(filtered_tokens)
    return filtered_tokens

In [231]:
import math
from six import iteritems
from six.moves import xrange

PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            self.doc_len.append(len(document))
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


In [232]:
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)

    return weights

In [233]:
df['genres'] = df['genres'].apply(literal_eval)
df['characters'] = df['characters'].apply(literal_eval)
df['setting'] = df['setting'].apply(literal_eval)
df['author'] = df['author'].apply(lambda x : x.split(", "))
df['description'] = df['description'].apply(lambda x : normalize_document(str(x)))
df['soup'] = df['author'] + df['description'] + df['genres'] + df['characters'] + df['setting'] 

df['weightedRating'] = (((df["rating"] -  1)/4) + (df["likedPercent"] * 0.01) + ((df['numRatings'] - df['numRatings'].min()) / (df['numRatings'].max() - df['numRatings'].min())))/3
# df['soup'] = df['soup'].apply(lambda x: ' '.join(x))
# print(df['description'].head())
# test = list(df['description'])
# print(test)

# print(df['description'].iloc[0])
# print(df['soup'].iloc[0])

In [234]:

norm_corpus = df['soup'].to_list()
wts = get_bm25_weights(norm_corpus)
bm25_wts_df = pd.DataFrame(wts)

In [235]:
evaldf = pd.read_csv(r'.\small_eval.csv', index_col=False, encoding="utf-8")
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def get_recommendations(book_title, doc_sims, no_of_recommendation):
    # find movie id
    book_idx = np.where(df['title'].values == book_title)[0][0] # (1000,1000)
    # get movie similarities
    book_similarities = doc_sims.iloc[book_idx].values # (1000,)
    similar_books = np.sort(-book_similarities)[1:no_of_recommendation+1]
    # print(similar_movies)
    # get top 5 similar movie IDs
    book_movies_idxs = np.argsort(-book_similarities)[1:no_of_recommendation+1]
    # print(similar_movies_idxs)
    # get top 5 movies

    similar_book_title = df['title'].values[book_movies_idxs]
    # print(similar_movies.shape)
    similar_book_weightedRating = df['weightedRating'].values[book_movies_idxs]

    actualidx = evaldf[evaldf['title']==book_title]['recommended']
    actual = evaldf['recommended'].iloc[actualidx.index.values[0]]

    # print(predictions)
    # print(actual)

    r = []
    for book in similar_book_title:
        if book in actual:
            r.append(1)
        else:
            r.append(0)
    similar_books = pd.DataFrame({'title': similar_book_title, 'similarity score': -similar_books, 'weightedRating': similar_book_weightedRating})
    similar_books= similar_books.sort_values("weightedRating", ascending=False)
    # return the top 5 movies
    return similar_books

In [236]:
print(bm25_wts_df.head())

         0           1           2           3           4          5     \
0  587.224354    3.183719    7.788555    6.020353   13.383729   8.145223   
1    3.432120  586.268833    3.054199    2.421664    6.613244   3.940377   
2    7.743880    2.643679  473.357174   18.252376   17.382770  17.487915   
3    5.584182    2.083645   18.143365  542.085280    2.768886  15.153366   
4    9.948257    4.288437   12.723608    2.086795  380.041480   5.476670   

        6          7          8          9     ...       4990       4991  \
0  10.875650   9.027843   4.296819  11.181562  ...  13.313969   2.353875   
1   2.740494  13.601803   8.247968   5.413042  ...   6.327226   5.989090   
2  25.911594  21.743780  13.436576  10.015584  ...   7.698664  15.511767   
3  32.089511   9.060647   6.665320  16.143959  ...   2.511954   4.923033   
4   3.208665   7.952828   7.324767   2.746987  ...   2.901245   9.239878   

       4992       4993      4994       4995      4996       4997       4998  \
0  8.07

In [237]:
# movie = df['title'].iloc[0]

get_recommendations(book_title="Harry Potter and the Order of the Phoenix", doc_sims=bm25_wts_df, no_of_recommendation=20)

Unnamed: 0,title,similarity score,weightedRating
4,Harry Potter and the Sorcerer's Stone,179.256872,0.9425
0,Harry Potter and the Deathly Hallows,277.274268,0.7613
3,Harry Potter and the Prisoner of Azkaban,234.448504,0.760222
2,Harry Potter and the Goblet of Fire,238.107427,0.74937
1,Harry Potter and the Chamber of Secrets,245.642557,0.74141
5,Harry Potter and the Half-Blood Prince,176.240911,0.739447
13,Harry Potter Series Box Set,48.402357,0.649406
10,Harry Potter Collection,61.578176,0.638962
11,The Harry Potter Collection 1-4,61.307988,0.635928
17,Lodestar,35.728805,0.635609


In [238]:
x=get_recommendations(book_title="Harry Potter and the Order of the Phoenix", doc_sims=bm25_wts_df, no_of_recommendation=20)

print(x['title'].iloc[0])

Harry Potter and the Sorcerer's Stone


In [239]:
evaldf = pd.read_csv(r'.\small_eval.csv', index_col=False, encoding="utf-8")
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    predictions = get_recommendations(book_title=booktitle, doc_sims=bm25_wts_df, no_of_recommendation=k)
    actualidx = evaldf[evaldf['title']==booktitle]['recommended']
    actual = evaldf['recommended'].iloc[actualidx.index.values[0]]

    # print(predictions)
    # print(actual)

    r = []
    for book in range(k):
        if predictions['title'].iloc[book] in actual:
            r.append(1)
        else:
            r.append(0)

    return r

In [240]:
rs=[]
for book in range(30):
    r=getr(df['title'].iloc[book])
    rs.append(r)
    print(ndcg_at_k(r,20))

print(mean_average_precision(rs))

0.3010299956639812
0.2704130232587376
0
1.0
0
0.5
0.27894294565112987
0
0.2890648263178879
0.6177044566833191
1.0
0.43067655807339306
0.6421838204337547
0.8868854556705132
1.0
0
1.0
0.3562071871080222
0.6505149978319906
0
1.0
0
0
0
0
0.3010299956639812
0.4434264036172708
0
0
0.27023815442731974
0.27571959602145363


In [243]:
showcase = df['title'].values[0:30]
ndcg_at_k_list = []
rs=[]
for book in range(30):
    recommended = get_recommendations(book_title=df['title'].iloc[book], doc_sims=bm25_wts_df, no_of_recommendation=20)
    # r= recommended['relevance'].values
    r=getr(df['title'].iloc[book])
    rs.append(r)
    ndcg_at_k_list.append(ndcg_at_k(r,20))
    
results = pd.DataFrame({'books': showcase, 'ndcg': ndcg_at_k_list})

print(results.head(20))

print("average ndcg: ", np.mean(ndcg_at_k_list))
print("mean average precision: ",mean_average_precision(rs))

                                                books      ndcg
0                                    The Hunger Games  0.301030
1           Harry Potter and the Order of the Phoenix  0.270413
2                               To Kill a Mockingbird  0.000000
3                                 Pride and Prejudice  1.000000
4                                            Twilight  0.000000
5                                      The Book Thief  0.500000
6                                         Animal Farm  0.278943
7                            The Chronicles of Narnia  0.000000
8   J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...  0.289065
9                                  Gone with the Wind  0.617704
10                             The Fault in Our Stars  1.000000
11               The Hitchhiker's Guide to the Galaxy  0.430677
12                                    The Giving Tree  0.642184
13                                  Wuthering Heights  0.886885
14                                  The 