In [19]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re
from evaluation import mean_average_precision,ndcg_at_k
# English stop-word list taken from NLTK's stopwords corpus; normalize_document()
# discards every token that appears in it.
# NOTE(review): presumably the 'stopwords' corpus must already be downloaded
# (nltk.download) for this line to succeed — confirm for fresh environments.
stop_words = nltk.corpus.stopwords.words('english')

In [20]:
# Load the book catalogue. A bare relative filename resolves to the same file
# as the former r'.\booklist.csv' on Windows and, unlike it, also works on
# POSIX systems (where the backslash would be read as part of the file name).
df = pd.read_csv('booklist.csv', index_col=False, encoding="latin-1")
print(df['title'].head(10))
print(df.shape)

# Keep only the first 100 books so the pairwise BM25 matrix stays small.
# .copy() detaches the subset from the full frame: later cells assign new
# columns to `df`, which on a bare head() slice raises SettingWithCopyWarning.
# (df.sample(n=1000) was the alternative subset tried here.)
df = df.head(100).copy()

# NOTE(review): presumably the complete column list of booklist.csv (25 names,
# matching the printed shape) — it is not used by the code in this cell.
col_names = ['bookId', 'title', 'series', 'author', 'rating', 'description', 'lang', 'isbn', 'genres', 'characters', 'bookForm', 'edition', 'pages', 'publisher', 'publishDate', 'firstPublished', 'awards', 'numRating', 'ratingsByStars', 'likedPercent', 'setting', 'coverImg', 'bbeScore', 'bbeVotes', 'price']
# Subset of columns of interest for the recommender.
wanted_cols = ['bookId', 'title', 'series', 'author', 'rating', 'description', 'genres', 'characters', 'setting', 'coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

# The list-like columns are read as plain strings (dtype object), so they have
# to be parsed before they can be used as lists.
print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
object
object
object


In [21]:
def normalize_document(doc):
    """Turn raw text into a list of lower-cased, stop-word-free tokens.

    Parameters
    ----------
    doc : str
        Text to normalize.

    Returns
    -------
    list of str
        The tokens produced by ``nltk.word_tokenize`` on the cleaned text,
        minus every token found in the module-level ``stop_words``. The
        tokens are returned as a list, not re-joined into a string.
    """
    # Remove everything except ASCII letters, digits and whitespace.
    # The flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so the previous call capped the number of removals at
    # int(re.I | re.A) and never applied the flags at all.
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, flags=re.I | re.A)
    doc = doc.lower().strip()
    tokens = nltk.word_tokenize(doc)
    return [token for token in tokens if token not in stop_words]

In [22]:
import math
from six import iteritems
from six.moves import xrange

# Tuning constants read by BM25.get_score().
PARAM_K1 = 2.5   # weight of the term-frequency / document-length term
PARAM_B = 0.85   # share of the length ratio (doc_len / avgdl) in that term
EPSILON = 0.2    # factor applied to average_idf when a word's idf is negative


class BM25(object):
    """BM25 scorer over a tokenized corpus.

    Parameters
    ----------
    corpus : sequence of sequences of hashable tokens
        One token sequence per document. An empty corpus is accepted and
        yields an index with no documents.
    """

    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        total_len = sum(float(len(x)) for x in corpus)
        # Average document length; 0.0 for an empty corpus instead of a
        # ZeroDivisionError.
        self.avgdl = total_len / self.corpus_size if self.corpus_size else 0.0
        self.corpus = corpus
        self.f = []        # per document: {word: occurrences in that document}
        self.df = {}       # word -> number of documents containing it
        self.idf = {}      # word -> inverse document frequency
        self.doc_len = []  # per document: number of tokens
        self.initialize()

    def initialize(self):
        """Fill ``f``, ``df``, ``idf`` and ``doc_len`` from ``self.corpus``."""
        for document in self.corpus:
            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.doc_len.append(len(document))
            self.f.append(frequencies)

            # Each distinct word of this document counts once towards df.
            for word in frequencies:
                self.df[word] = self.df.get(word, 0) + 1

        for word, freq in self.df.items():
            # Negative when the word occurs in more than half of the documents;
            # get_score() substitutes EPSILON * average_idf in that case.
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        """Score the token sequence ``document`` against indexed document ``index``.

        Parameters
        ----------
        document : iterable of tokens
            The query tokens; repeated tokens contribute repeatedly.
        index : int
            Position of the indexed document to score against.
        average_idf : float
            Mean idf of the vocabulary, used as fallback for negative idf.

        Returns
        -------
        int or float
            Sum of the per-token contributions; ``0`` if no token matches.
        """
        doc_freqs = self.f[index]
        score = 0
        for word in document:
            if word not in doc_freqs:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            tf = doc_freqs[word]
            # The length normalisation stays inside the loop on purpose: it is
            # only evaluated for matching words, so avgdl == 0 (all documents
            # empty) can never cause a division by zero here.
            score += (idf * tf * (PARAM_K1 + 1)
                      / (tf + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        """Return ``get_score(document, i, average_idf)`` for every indexed document ``i``."""
        return [self.get_score(document, index, average_idf)
                for index in range(self.corpus_size)]


In [23]:
def get_bm25_weights(corpus):
    """Score every document of ``corpus`` against every document with BM25.

    Parameters
    ----------
    corpus : sequence of token sequences
        One token sequence per document.

    Returns
    -------
    list of list
        ``weights[i][j]`` is the BM25 score of document ``i`` (used as the
        query) against indexed document ``j``. Empty for an empty corpus.
    """
    # Nothing to score; answering here also avoids building an index over zero
    # documents.
    if len(corpus) == 0:
        return []

    bm25 = BM25(corpus)
    # The vocabulary is empty when every document is empty; fall back to 0.0
    # rather than dividing by zero.
    average_idf = (sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
                   if bm25.idf else 0.0)

    return [bm25.get_scores(doc, average_idf) for doc in corpus]

In [24]:
# Build the per-book token "soup" that BM25 is run on.
# Every step leaves values that are already lists untouched, so this cell can
# be re-run on the same `df`: literal_eval raises on a list, and normalizing an
# existing token list again would tokenize its repr.
for list_col in ('genres', 'characters', 'setting'):
    # These columns arrive as strings (dtype object); parse them into Python values.
    df[list_col] = df[list_col].apply(lambda x: x if isinstance(x, list) else literal_eval(x))
# Authors are split on ", " into a list of names.
df['author'] = df['author'].apply(lambda x: x if isinstance(x, list) else x.split(", "))
# str() makes non-string descriptions tokenizable.
df['description'] = df['description'].apply(
    lambda x: x if isinstance(x, list) else normalize_document(str(x)))
# Element-wise list concatenation: authors + description tokens + genres +
# characters + settings.
df['soup'] = df['author'] + df['description'] + df['genres'] + df['characters'] + df['setting']

In [25]:

# Pairwise BM25 scores: row i holds the soup of book i scored against the soup
# of every book, wrapped in a DataFrame for positional lookup.
norm_corpus = df['soup'].tolist()
wts = get_bm25_weights(corpus=norm_corpus)
bm25_wts_df = pd.DataFrame(data=wts)

In [32]:
def get_recommendations(movie_title, doc_sims, no_of_recommendation, titles=None):
    """Return the books most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Title of the query book (the name is kept for existing keyword callers).
    doc_sims : pandas.DataFrame
        Square score matrix; row ``i`` holds the scores of book ``i`` against
        every book, in the same order as ``titles``.
    no_of_recommendation : int
        Maximum number of recommendations to return.
    titles : array-like of str, optional
        Titles aligned with the rows of ``doc_sims``. Defaults to the
        notebook-global ``df['title']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'title'`` and ``'similarity score'``, best match first. The
        query book itself is never part of the result.

    Raises
    ------
    IndexError
        If ``movie_title`` does not occur in ``titles``.
    """
    if titles is None:
        titles = df['title'].values
    titles = np.asarray(titles)

    matches = np.where(titles == movie_title)[0]
    if matches.size == 0:
        raise IndexError("title not found: {!r}".format(movie_title))
    # With duplicate titles the first occurrence is used.
    book_idx = matches[0]

    similarities = doc_sims.iloc[book_idx].values
    # Rank all books by descending score, then drop the query book explicitly
    # instead of assuming it always ranks first.
    ranked = np.argsort(-similarities)
    ranked = ranked[ranked != book_idx][:no_of_recommendation]

    return pd.DataFrame({'title': titles[ranked],
                         'similarity score': similarities[ranked]})

In [27]:
# Plain-text preview of the first rows of the BM25 score matrix.
score_preview = bm25_wts_df.head()
print(score_preview)

           0           1           2           3           4          5   \
0  410.088709    1.749539    5.876535    5.333438   12.380981   7.413062   
1    1.885226  372.058359    1.930180    1.583032    5.563658   2.820266   
2    5.866141    1.672180  326.338859   12.889248   10.913682  12.083029   
3    4.949344    1.363296   12.812761  354.514024    2.548888   9.960987   
4    9.149236    3.618670    8.196905    1.925036  275.944028   3.865783   

          6          7         8          9   ...         90        91  \
0   7.270192   6.232013  2.786062  12.138057  ...  17.069216  5.335407   
1   1.641839  10.127221  6.515663   5.201536  ...  11.831332  5.489673   
2  17.796188  16.075786  8.324483   6.408624  ...   1.361506  9.063883   
3  22.353134   6.560155  5.619648  12.248910  ...   1.940820  7.246074   
4   1.996548   7.190991  4.578263   2.528899  ...   4.334745  2.060329   

          92          93         94         95         96        97  \
0  10.133908    7.279578   

In [28]:
# Top-20 BM25 neighbours of one sample title. The call is left as the cell's
# last expression so that the resulting frame is displayed.
get_recommendations(
    movie_title="Harry Potter and the Order of the Phoenix",
    doc_sims=bm25_wts_df,
    no_of_recommendation=20,
)

[-144.15547367 -129.49458075  -97.06942668  -22.49286441  -19.28801911
  -19.28698884  -18.9449024   -18.35556371  -16.84273283  -13.54911688
  -12.22221743  -11.83133204  -11.77707109  -11.64909501  -11.57671616
  -11.5210787   -10.12722094   -9.72157839   -9.66987022   -9.45491568]
[71 93 32 22 31 28 96 76 45 50 77 90 21 61 26 92  7 30 41 42]
(20,)


Unnamed: 0,title,similarity score
0,Harry Potter and the Deathly Hallows,144.155474
1,Harry Potter and the Prisoner of Azkaban,129.494581
2,Harry Potter and the Sorcerer's Stone,97.069427
3,Lord of the Flies,22.492864
4,Anne of Green Gables,19.288019
5,City of Bones,19.286989
6,Winnie-the-Pooh,18.944902
7,Matilda,18.355564
8,A Wrinkle in Time,16.842733
9,Where the Wild Things Are,13.549117


In [29]:
# Same query as above, this time keeping the frame and printing only the title
# of the best match.
x = get_recommendations(
    movie_title="Harry Potter and the Order of the Phoenix",
    doc_sims=bm25_wts_df,
    no_of_recommendation=20,
)
best_match_title = x['title'].iloc[0]
print(best_match_title)

[-144.15547367 -129.49458075  -97.06942668  -22.49286441  -19.28801911
  -19.28698884  -18.9449024   -18.35556371  -16.84273283  -13.54911688
  -12.22221743  -11.83133204  -11.77707109  -11.64909501  -11.57671616
  -11.5210787   -10.12722094   -9.72157839   -9.66987022   -9.45491568]
[71 93 32 22 31 28 96 76 45 50 77 90 21 61 26 92  7 30 41 42]
(20,)
Harry Potter and the Deathly Hallows


In [30]:
# Ground-truth recommendations used for evaluation. A bare relative filename
# resolves to the same file as the former r'.\small_eval.csv' on Windows and
# also works on POSIX systems.
evaldf = pd.read_csv('small_eval.csv', index_col=False, encoding="utf-8")
# 'recommended' is stored as a string literal in the CSV; parse it into the
# Python value that getr() runs membership tests against.
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    """Build the binary relevance vector of the top-``k`` recommendations.

    Parameters
    ----------
    booktitle : str
        Title to recommend for; it must have a row in ``evaldf``.
    k : int, optional
        Number of recommendations to evaluate (default 20).

    Returns
    -------
    list of int
        Exactly ``k`` entries; entry ``i`` is 1 if the ``i``-th recommended
        title appears in the ``'recommended'`` value of ``booktitle``'s
        ``evaldf`` row, else 0. Ranks for which no recommendation exists
        count as misses (0).

    Raises
    ------
    IndexError
        If ``evaldf`` has no row for ``booktitle``.
    """
    predictions = get_recommendations(movie_title=booktitle, doc_sims=bm25_wts_df, no_of_recommendation=k)

    matching = evaldf.loc[evaldf['title'] == booktitle, 'recommended']
    if matching.empty:
        raise IndexError("no evaluation entry for title: {!r}".format(booktitle))
    # Take the first matching row by position; the previous code passed an
    # index *label* to .iloc, which is only right for a default RangeIndex.
    actual = matching.iloc[0]

    predicted_titles = predictions['title'].tolist()[:k]
    r = [1 if title in actual else 0 for title in predicted_titles]
    # Fewer than k recommendations can come back (small corpus); pad so the
    # vector always has length k instead of failing on a missing row.
    r.extend([0] * (k - len(r)))
    return r

In [33]:
# Evaluate the first 30 books of the catalogue: print NDCG@20 for each one and
# collect the relevance vectors for the overall mean average precision.
rs = []
for row_pos in range(30):
    query_title = df['title'].iloc[row_pos]
    r = getr(query_title)
    print(ndcg_at_k(r, 20))
    rs.append(r)

print(mean_average_precision(rs))

0.6313247675185968
0.23801437763899258
0.5553782207444934
0.8784098137096863
0.5077643968214793
0.38384462508052564
0.5697444455330438
0.6309297535714575
0.4250459329151599
0.6585381834928036
0.39241192488026627
0.8531776795146803
0.7546117266328853
0.8977107666850511
0.4540111928030904
0.40309698369020075
0.7842137690517234
0.46673735257960025
0.7310358129389314
0.7423373896514631
0.5800952514711237
0.5516408353038085
0.5453679230449677
0.8150873052679217
0.717619793917951
0.5629104584393229
0.4053927342610788
0.49628413432295043
0.5876652500336149
0.32140541602217854
0.40833068348615187
