In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re

stop_words = nltk.corpus.stopwords.words('english')

In [2]:
df = pd.read_csv(r'.\booklist.csv', index_col=False)
print(df['title'].head(10))
print(df.shape)
df = df.sample(n=1000)
print(df['title'].head(10))
print(df.shape)

col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
51483          Never Do Anything, Ever
9770         Black Bird of the Gallows
12721                  October Breezes
41193    Prins Valiant (Jaargang 1959)
22279                  Finding Freedom
40072         Amazing Agent Luna Vol 1
6467                      Dark Reunion
35770     The Honorable Imposter: 1620
27604                     Hvid som sne
22385                        Sanctuary
Name: title, dtype: o

In [3]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    # doc = ' '.join(filtered_tokens)
    return filtered_tokens

In [4]:

import math
from six import iteritems
from six.moves import xrange

PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            self.doc_len.append(len(document))
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


In [5]:
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)

    return weights

In [6]:
df['genres'] = df['genres'].apply(literal_eval)
df['characters'] = df['characters'].apply(literal_eval)
df['setting'] = df['setting'].apply(literal_eval)
df['author'] = df['author'].apply(lambda x : x.split(", "))
df['description'] = df['description'].apply(lambda x : normalize_document(str(x)))
df['soup'] = df['author'] + df['description'] + df['genres'] + df['characters'] + df['setting'] 
# df['soup'] = df['soup'].apply(lambda x: ' '.join(x))
# print(df['description'].head())
# test = list(df['description'])
# print(test)

# print(df['description'].iloc[0])
# print(df['soup'].iloc[0])

In [7]:

norm_corpus = df['soup'].to_list()
wts = get_bm25_weights(norm_corpus)
bm25_wts_df = pd.DataFrame(wts)

In [106]:
def movie_recommender(movie_title, doc_sims, no_of_recommendation):
    # find movie id
    movie_idx = np.where(df['title'].values == movie_title)[0][0] # (1000,1000)
    # get movie similarities
    movie_similarities = doc_sims.iloc[movie_idx].values # (1000,)
    similar_movies = np.sort(-movie_similarities)[1:no_of_recommendation+1]
    print(similar_movies)
    # get top 5 similar movie IDs
    similar_movies_idxs = np.argsort(-movie_similarities)[1:no_of_recommendation+1]
    print(similar_movies_idxs)
    # get top 5 movies

    similar_movies_title = df['title'].values[similar_movies_idxs]
    print(similar_movies.shape)
    similar_movies = pd.DataFrame({'title': similar_movies_title, 'similarity score': -similar_movies})
    # return the top 5 movies
    return similar_movies

In [107]:
print(bm25_wts_df.head())

          0           1           2          3           4          5    \
0  403.592947   19.405255   13.366547   0.000000    6.345203  12.708599   
1   17.158991  329.661316   19.512704   0.000000   11.102772   5.486301   
2   15.903773   25.849042  499.548941   0.000000   22.370886  13.896411   
3    0.000000    0.000000    0.000000  83.463297    0.000000   0.000000   
4    6.647641   18.524590   25.729370   0.000000  417.944323   7.455913   

         6          7         8          9    ...        990        991  \
0  12.155116   7.060760  2.168962   5.376490  ...   5.538318  12.727176   
1  27.132536  18.453405  2.168962  16.688440  ...  24.433872   7.314540   
2   9.840908  10.542621  2.168962   7.212738  ...   0.000000  19.718563   
3   0.000000   0.000000  0.000000   0.000000  ...   0.000000   0.000000   
4  20.814080  14.710168  0.000000  12.338991  ...   9.169454  24.905956   

         992       993       994        995       996       997       998  999  
0   7.741337  3.3

In [109]:
movie = df['title'].iloc[0]

movie_recommender(movie_title=movie, doc_sims=bm25_wts_df, no_of_recommendation=10)

[-48.20045531 -39.78119691 -36.52177067 -35.26544003 -35.04585239
 -35.01305968 -34.81191015 -33.82330947 -33.63081642 -32.29609014]
[ 48 724 512 514 787 839 488 629 149  62]
(10,)


Unnamed: 0,title,similarity score
0,Jamie Dornan: Shades of Desire,48.200455
1,Ramona the Brave,39.781197
2,Voyage on the Great Titanic: The Diary of Marg...,36.521771
3,Miss Daisy Is Crazy!,35.26544
4,Into the Wild Nerd Yonder,35.045852
5,Catch Your Death,35.01306
6,A Long Way from Chicago,34.81191
7,Big Red,33.823309
8,Oblivion,33.630816
9,Lunch Walks Among Us,32.29609
