In [1]:
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import nltk
import re

stop_words = nltk.corpus.stopwords.words('english')

In [2]:
df = pd.read_csv(r'.\booklist.csv', index_col=False)
print(df['title'].head(10))
print(df.shape)
df = df.head(10000)
# df = df.sample(n=1000)
# print(df['title'].head(10))
# print(df.shape)

col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

#TODO: filter out our wanted cols, create our index and save it to a file(?)
filtered_df = df[wanted_cols].head(5)

print(df['genres'].dtypes)
print(df['characters'].dtypes)
print(df['setting'].dtypes)

0                                     The Hunger Games
1            Harry Potter and the Order of the Phoenix
2                                To Kill a Mockingbird
3                                  Pride and Prejudice
4                                             Twilight
5                                       The Book Thief
6                                          Animal Farm
7                             The Chronicles of Narnia
8    J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
9                                   Gone with the Wind
Name: title, dtype: object
(52478, 25)
object
object
object


In [3]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = nltk.word_tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    # doc = ' '.join(filtered_tokens)
    return filtered_tokens

In [4]:
import math
from six import iteritems
from six.moves import xrange

PARAM_K1 = 2.5
PARAM_B = 0.85
EPSILON = 0.2

class BM25(object):
    def __init__(self, corpus):
        self.corpus_size = len(corpus)
        self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.doc_len = []
        self.initialize()

    def initialize(self):
        for document in self.corpus:
            frequencies = {}
            self.doc_len.append(len(document))
            for word in document:
                if word not in frequencies:
                    frequencies[word] = 0
                frequencies[word] += 1
            self.f.append(frequencies)

            for word, freq in iteritems(frequencies):
                if word not in self.df:
                    self.df[word] = 0
                self.df[word] += 1

        for word, freq in iteritems(self.df):
            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

    def get_score(self, document, index, average_idf):
        score = 0
        for word in document:
            if word not in self.f[index]:
                continue
            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
        return score

    def get_scores(self, document, average_idf):
        scores = []
        for index in xrange(self.corpus_size):
            score = self.get_score(document, index, average_idf)
            scores.append(score)
        return scores


In [5]:
def get_bm25_weights(corpus):
    bm25 = BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

    weights = []
    for doc in corpus:
        scores = bm25.get_scores(doc, average_idf)
        weights.append(scores)

    return weights

In [6]:
df['genres'] = df['genres'].apply(literal_eval)
df['characters'] = df['characters'].apply(literal_eval)
df['setting'] = df['setting'].apply(literal_eval)
df['author'] = df['author'].apply(lambda x : x.split(", "))
df['description'] = df['description'].apply(lambda x : normalize_document(str(x)))
df['soup'] = df['author'] + df['description'] + df['genres'] + df['characters'] + df['setting'] 
# df['soup'] = df['soup'].apply(lambda x: ' '.join(x))
# print(df['description'].head())
# test = list(df['description'])
# print(test)

# print(df['description'].iloc[0])
# print(df['soup'].iloc[0])

In [7]:

norm_corpus = df['soup'].to_list()
wts = get_bm25_weights(norm_corpus)
bm25_wts_df = pd.DataFrame(wts)

In [8]:
def get_recommendations(movie_title, doc_sims, no_of_recommendation):
    # find movie id
    movie_idx = np.where(df['title'].values == movie_title)[0][0] # (1000,1000)
    # get movie similarities
    movie_similarities = doc_sims.iloc[movie_idx].values # (1000,)
    similar_movies = np.sort(-movie_similarities)[1:no_of_recommendation+1]
    print(similar_movies)
    # get top 5 similar movie IDs
    similar_movies_idxs = np.argsort(-movie_similarities)[1:no_of_recommendation+1]
    print(similar_movies_idxs)
    # get top 5 movies

    similar_movies_title = df['title'].values[similar_movies_idxs]
    print(similar_movies.shape)
    similar_movies = pd.DataFrame({'title': similar_movies_title, 'similarity score': -similar_movies})
    # return the top 5 movies
    return similar_movies

In [9]:
print(bm25_wts_df.head())

         0           1           2           3           4          5     \
0  606.363261    3.438488    8.055974    6.056881   13.687213   8.436175   
1    3.707948  615.613422    3.433525    2.803829    6.937940   4.362891   
2    8.010169    2.970136  488.141372   18.909129   18.209200  17.990079   
3    5.616133    2.410870   18.795663  564.079749    2.905173  16.279885   
4   10.167031    4.489039   13.291603    2.186112  399.713033   5.724651   

        6          7          8          9     ...       9990       9991  \
0  11.305669   9.401613   4.649136  11.349427  ...  16.313835   6.559626   
1   3.248405  14.240590   8.975289   5.963304  ...  17.341963  13.137622   
2  26.982337  22.356577  14.520865  11.054707  ...   2.430311  14.370795   
3  34.438896   9.545442   7.198024  16.872977  ...   4.210757   6.604789   
4   3.484751   8.033939   7.660406   2.882051  ...   2.778229   2.223883   

        9992  9993       9994       9995       9996      9997       9998  \
0   7.4949

In [10]:
# movie = df['title'].iloc[0]

get_recommendations(movie_title="Harry Potter and the Order of the Phoenix", doc_sims=bm25_wts_df, no_of_recommendation=20)

[-299.36955069 -264.47984763 -256.267927   -250.50630621 -192.05569847
 -189.49631071 -129.94412878 -110.64960746 -108.85266279  -99.49256734
  -78.30803382  -68.99195147  -68.08789395  -66.35440696  -65.63946709
  -64.80132011  -55.47920003  -54.94456407  -51.1292344   -48.1565657 ]
[  71  126  103   93   32  105 7418 2580 1184 5203 1332 9637 7008 1323
 2286 1600 6600 3576  409 6186]
(20,)


Unnamed: 0,title,similarity score
0,Harry Potter and the Deathly Hallows,299.369551
1,Harry Potter and the Chamber of Secrets,264.479848
2,Harry Potter and the Goblet of Fire,256.267927
3,Harry Potter and the Prisoner of Azkaban,250.506306
4,Harry Potter and the Sorcerer's Stone,192.055698
5,Harry Potter and the Half-Blood Prince,189.496311
6,Harry Potter: Film Wizardry,129.944129
7,Harry Potter and the Methods of Rationality,110.649607
8,Fantastic Beasts and Where to Find Them,108.852663
9,James Potter and the Hall of Elders' Crossing,99.492567
