In [20]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
import torch
from ast import literal_eval
from evaluation import mean_average_precision,ndcg_at_k
# nltk.download('stopwords')

In [21]:
# Load the raw book catalogue.  A forward-slash relative path resolves on
# Windows, macOS and Linux alike; the previous r'.\booklist.csv' literal only
# resolved on Windows.
df = pd.read_csv('./booklist.csv', index_col=False, encoding="latin-1")

# Presumably the full column list of booklist.csv (not referenced by the cells
# visible here).  'numRatings' is spelled the way the recommender reads it
# from `df` further down.
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRatings','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

# Five-row preview restricted to the columns of interest.
filtered_df = df[wanted_cols].head(5)

# Only the first MAX_BOOKS rows are embedded and searched.
MAX_BOOKS = 5000
df = df.head(MAX_BOOKS)

# print(df['description'].iloc[0])


In [22]:
# --- Semantic search -------------------------------------------------------
# Sentence-embedding checkpoint used by the encode() calls in this notebook.
# MS MARCO checkpoints are the ones suggested for asymmetric semantic search:
# https://www.sbert.net/examples/applications/semantic-search/README.html
MODEL_NAME = 'all-mpnet-base-v2'
model = SentenceTransformer(MODEL_NAME)

# all-mpnet-base-v2 
# msmarco-MiniLM-L-6-v3 (SEMANTIC)

# query_embedding = model.encode('How big is London')
# passage_embedding = model.encode('London has 9,787,426 inhabitants at the 2011 census')

# print("Similarity:", util.cos_sim(query_embedding, passage_embedding))

In [23]:
# x = df['description'].iloc[0]
# print(type(x))

# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# not re-read for every row that gets cleaned.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a set, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = set(nltk.corpus.stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_and_tokenize(x):
    """Normalise free text into a space-separated string of content words.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space; the text is lower-cased and stripped, tokenised with
    NLTK's ``word_tokenize`` and filtered against the English stop-word list.

    Args:
        x: The text to clean (expected to be a ``str``).

    Returns:
        The remaining tokens joined by single spaces.  If a ``TypeError`` is
        raised while processing ``x`` (e.g. ``x`` is not a string), ``x`` is
        printed and an empty string is returned.
    """
    try:
        stopwords = _english_stopwords()
        # `flags` must be passed by keyword: the fourth positional parameter of
        # re.sub is `count`, so passing the flags positionally silently capped
        # the number of substitutions and never applied the flags at all.
        d = re.sub(r'[^a-zA-Z0-9\s]', ' ', x, flags=re.I|re.A)
        d = d.lower().strip()
        # nltk.download('punkt')
        tks = word_tokenize(d)
        return ' '.join(t for t in tks if t not in stopwords)
    except TypeError:
        print(x)
        return ""

def _as_list(cell):
    """Parse a stringified Python literal; pass already-parsed lists through.

    Accepting lists makes this cell safe to re-run after `df` has been
    converted once.  Missing values (anything that is neither a list nor a
    string, e.g. NaN) become an empty list.
    """
    if isinstance(cell, list):
        return cell
    if isinstance(cell, str):
        return literal_eval(cell)
    return []


def _split_authors(cell):
    """Split a ", "-separated author string; pass already-split lists through."""
    if isinstance(cell, list):
        return cell
    if isinstance(cell, str):
        return cell.split(", ")
    return []


def _clean_description(cell):
    """Clean one description; a missing value yields "" instead of the token "nan"."""
    if pd.isna(cell):
        return ""
    return clean_and_tokenize(str(cell))


# The list-like columns are stored as stringified Python lists in the CSV.
df['genres'] = df['genres'].apply(_as_list)
df['characters'] = df['characters'].apply(_as_list)
df['setting'] = df['setting'].apply(_as_list)
df['author'] = df['author'].apply(_split_authors)
df['description'] = df['description'].apply(_clean_description)

# "Soup" = authors + genres + characters + settings, followed by the cleaned
# description, as one string per book.  This is the text that gets embedded.
metadata_tokens = df['author'] + df['genres'] + df['characters'] + df['setting']
df['soup'] = metadata_tokens.apply(' '.join) + " " + df['description']
print(df['soup'].iloc[0])

np_df = df['soup'].to_numpy()
# One embedding per book; row i corresponds to df.iloc[i].
encoded_df = model.encode(np_df)



Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) District 12, Panem Capitol, Panem Panem (United States) winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win 

In [24]:
def get_top_k_recos(query,k):
    """Return the k books most similar to `query`, ordered by weighted rating.

    The query is embedded with the global `model` and compared against the
    global `encoded_df` by cosine similarity.  The k highest-scoring rows of
    the global `df` are selected and then re-ordered by `weightedRating`
    (highest first), so the returned order is NOT the similarity order.

    `weightedRating` is the mean of three terms: ``(rating - 1) / 4``,
    ``likedPercent * 0.01`` and the min-max normalised `numRatings`.

    Side effect: the `scores` and `weightedRating` columns of the global `df`
    are (re)written on every call.

    Args:
        query: Free-text query to embed.
        k: Maximum number of rows to return; values larger than the number of
            books are clamped to that number.

    Returns:
        A DataFrame with columns `title`, `scores` and `weightedRating`.
    """
    qry_e = model.encode(query)
    scores = util.cos_sim(qry_e,encoded_df)
    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = max(0, min(int(k), scores.shape[-1]))
    _, indices = torch.topk(scores,k)
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    df['scores'] = scores[0].cpu().numpy()

    num_ratings = df['numRatings']
    lowest = num_ratings.min()
    span = num_ratings.max() - lowest
    # When every book has the same number of ratings the min-max term would be
    # 0/0 (NaN); treat it as contributing nothing instead.
    popularity = (num_ratings - lowest) / span if span else 0.0
    df['weightedRating'] = (((df["rating"] -  1)/4) + (df["likedPercent"] * 0.01) + popularity)/3

    top_rows = df[['title','scores','weightedRating']].iloc[indices[0].cpu().numpy()]
    return top_rows.sort_values("weightedRating",ascending = False)

In [25]:
# Sanity check: show the first "soup" string that was passed to the encoder.
print(np_df[0])

Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) District 12, Panem Capitol, Panem Panem (United States) winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win 

In [26]:
# Smoke test: request 20 recommendations for a title-only query; the cell
# output is the DataFrame returned by get_top_k_recos.
get_top_k_recos("Harry Potter and the Order of the Phoenix",20)

Unnamed: 0,title,scores,weightedRating
32,Harry Potter and the Sorcerer's Stone,0.594131,0.9425
71,Harry Potter and the Deathly Hallows,0.507017,0.7613
93,Harry Potter and the Prisoner of Azkaban,0.552612,0.760222
103,Harry Potter and the Goblet of Fire,0.534209,0.74937
126,Harry Potter and the Chamber of Secrets,0.542235,0.74141
105,Harry Potter and the Half-Blood Prince,0.564788,0.739447
1,Harry Potter and the Order of the Phoenix,0.501393,0.736923
409,Harry Potter Series Box Set,0.580491,0.649406
2286,Harry Potter Collection,0.614232,0.638962
1600,The Harry Potter Collection 1-4,0.573618,0.635928


In [27]:
# Ground-truth table: one row per book title with its list of relevant titles.
# A forward-slash relative path resolves on every OS; the previous
# r'.\small_eval.csv' literal only resolved on Windows.
evaldf = pd.read_csv('./small_eval.csv', index_col=False, encoding="utf-8")
# 'recommended' is stored as a stringified Python list in the CSV.
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    """Build the binary relevance vector of the top-k recommendations.

    The ground truth is the `recommended` entry of the first `evaldf` row whose
    `title` equals `booktitle`.  Position i of the result is 1 when the i-th
    title returned by ``get_top_k_recos(booktitle, k)`` is in that ground
    truth, else 0.

    Args:
        booktitle: Title used both as the search query and as the lookup key
            in `evaldf`.
        k: Number of recommendations to evaluate.

    Returns:
        A list of exactly `k` ints (0/1).  If fewer than `k` recommendations
        are available the tail is padded with 0 (counted as misses).

    Raises:
        IndexError: if `booktitle` has no row in `evaldf`.
    """
    # Select by boolean mask and take the first match positionally; feeding the
    # index *label* to .iloc is only correct for a default RangeIndex.
    matches = evaldf.loc[evaldf['title']==booktitle, 'recommended']
    if matches.empty:
        raise IndexError(f"no ground-truth recommendations for {booktitle!r} in evaldf")
    actual = matches.iloc[0]

    predictions = get_top_k_recos(booktitle,k)

    r = [1 if title in actual else 0 for title in predictions['title'].iloc[:k]]
    # Keep the vector length fixed at k even when fewer rows came back.
    r.extend([0] * (k - len(r)))
    return r


In [28]:
# print(df['title'].iloc[19])
# print(evaldf['title'].iloc[19])
# NOTE(review): `avg` is not used by any cell visible here and looks like an
# accidental auto-import.  `audioop` was removed from the standard library in
# Python 3.13, so the import is guarded to keep the cell runnable; delete it
# once confirmed that nothing relies on `avg`.
try:
    from audioop import avg  # noqa: F401
except ImportError:
    pass

N_EVAL_BOOKS = 30   # number of leading catalogue titles to evaluate
EVAL_K = 20         # ranking cut-off shared by getr and ndcg_at_k

showcase = df['title'].values[0:N_EVAL_BOOKS]
ndcg_at_k_list = []
rs=[]
# Iterate over the titles themselves so the lists below always have the same
# length as `showcase`, even if the catalogue holds fewer than N_EVAL_BOOKS rows.
for title in showcase:
    r=getr(title, EVAL_K)
    rs.append(r)
    ndcg_at_k_list.append(ndcg_at_k(r,EVAL_K))

results = pd.DataFrame({'books': showcase, 'ndcg': ndcg_at_k_list})

print(results.head(20))

print(getr("The Hunger Games"))

print("average ndcg: ", np.mean(ndcg_at_k_list))
print("mean average precision: ",mean_average_precision(rs))

                                                books      ndcg
0                                    The Hunger Games  0.657732
1           Harry Potter and the Order of the Phoenix  0.238014
2                               To Kill a Mockingbird  0.000000
3                                 Pride and Prejudice  0.000000
4                                            Twilight  0.000000
5                                      The Book Thief  0.000000
6                                         Animal Farm  0.000000
7                            The Chronicles of Narnia  0.530208
8   J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...  0.000000
9                                  Gone with the Wind  0.000000
10                             The Fault in Our Stars  0.000000
11               The Hitchhiker's Guide to the Galaxy  0.333333
12                                    The Giving Tree  0.000000
13                                  Wuthering Heights  0.000000
14                                  The 