In [1]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
import torch
from ast import literal_eval
from evaluation import mean_average_precision,ndcg_at_k
# nltk.download('stopwords')

In [2]:
# Load the raw book catalogue. A forward-slash relative path is used so the
# notebook runs on Windows, macOS and Linux alike; the previous backslash form
# (r'.\booklist.csv') only resolves to a file in the working directory on Windows.
df = pd.read_csv('./booklist.csv', index_col=False, encoding="latin-1")

# Column names, presumably the CSV's full schema (TODO confirm); not referenced
# by any of the visible cells.
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
# Columns of interest for the recommender.
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

# Five-row preview restricted to the columns of interest.
filtered_df = df[wanted_cols].head(5)

# Keep only the first 100 books. `.copy()` detaches the subset from the full
# frame so that the column assignments made in later cells write to an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
df = df.head(100).copy()

# print(df['description'].iloc[0])


In [3]:
# Demo only - do not run as part of the pipeline. This cell binds the global
# `model` to a different checkpoint than the one loaded in the "SEMANTIC
# SEARCH" cell, so running it out of order changes which encoder is used later.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Embed one query and two candidate passages.
query_embedding = model.encode('How big is London')
passage_embedding = model.encode(['London has 9,787,426 inhabitants at the 2011 census',
                                  'London is known for its finacial district'])

# Score the query against both passages with dot product and with cosine
# similarity. In the recorded output the two scores are identical
# (presumably because this model emits unit-length embeddings - TODO confirm).
print("Similarity:", util.dot_score(query_embedding, passage_embedding))
print(util.cos_sim(query_embedding,passage_embedding))

Downloading: 100%|██████████| 737/737 [00:00<00:00, 735kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 190kB/s]
Downloading: 100%|██████████| 11.5k/11.5k [00:00<00:00, 5.75MB/s]
Downloading: 100%|██████████| 612/612 [00:00<00:00, 291kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 68.2kB/s]
Downloading: 100%|██████████| 25.5k/25.5k [00:00<00:00, 102kB/s] 
Downloading: 100%|██████████| 90.9M/90.9M [00:03<00:00, 24.2MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 372kB/s]  
Downloading: 100%|██████████| 383/383 [00:00<00:00, 192kB/s]
Downloading: 100%|██████████| 13.8k/13.8k [00:00<00:00, 54.3kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 238kB/s]  
Downloading: 100%|██████████| 349/349 [00:00<00:00, 190kB/s]


Similarity: tensor([[0.5472, 0.6330]])
tensor([[0.5472, 0.6330]])


In [4]:
# SEMANTIC SEARCH
# Load the sentence encoder used by the cells below (corpus and query embeddings).
# MS MARCO models are intended for asymmetric semantic search; see
# https://www.sbert.net/examples/applications/semantic-search/README.html
model = SentenceTransformer('msmarco-MiniLM-L-6-v3')

# Optional smoke test (disabled):
# query_embedding = model.encode('How big is London')
# passage_embedding = model.encode('London has 9,787,426 inhabitants at the 2011 census')

# print("Similarity:", util.cos_sim(query_embedding, passage_embedding))

Downloading: 100%|██████████| 736/736 [00:00<00:00, 525kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 190kB/s]
Downloading: 100%|██████████| 3.68k/3.68k [00:00<00:00, 2.08MB/s]
Downloading: 100%|██████████| 627/627 [00:00<00:00, 626kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 65.0kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:03<00:00, 25.9MB/s]
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 53.1kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 56.0kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 380kB/s]  
Downloading: 100%|██████████| 430/430 [00:00<00:00, 189kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 241kB/s]  
Downloading: 100%|██████████| 229/229 [00:00<00:00, 123kB/s]


In [5]:
# x = df['description'].iloc[0]
# print(type(x))

def clean_and_tokenize(x):
    """Normalise free text into a space-joined string of lower-case content words.

    Every character that is not an ASCII letter, digit or whitespace is
    replaced by a space; the text is then lower-cased, stripped, split with
    ``word_tokenize`` and filtered against NLTK's English stop-word list.

    Args:
        x: Text to clean; expected to be a ``str``.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when ``x`` is
        not a string (the offending value is printed for inspection).
    """
    try:
        # A set gives O(1) membership tests instead of scanning the list per token.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        # The flags must be passed by keyword: the fourth positional parameter
        # of re.sub is `count`, so passing `re.I|re.A` positionally capped the
        # number of replacements at 258 and never applied the flags at all.
        d = re.sub(r'[^a-zA-Z0-9\s]', ' ', x, flags=re.I | re.A)
        d = d.lower().strip()
        # nltk.download('punkt')
        tks = word_tokenize(d)
        f_tks = [t for t in tks if t not in stopwords]
        return ' '.join(f_tks)
    except TypeError:
        # Non-string input (e.g. None or NaN): report it and contribute no text.
        print(x)
        return ""

def _as_list(value):
    """Return a list-valued cell as a list, parsing it only while it is still a string literal."""
    if isinstance(value, str):
        return literal_eval(value)
    if isinstance(value, list):
        # Already parsed by a previous run of this cell.
        return value
    # Missing cell (e.g. NaN): contribute nothing to the soup.
    return []


def _split_authors(value):
    """Split a ", "-separated author string into a list; pass lists through unchanged."""
    if isinstance(value, str):
        return value.split(", ")
    if isinstance(value, list):
        return value
    return []


# Parse the list-like columns. The guarded helpers make this cell safe to
# re-run: calling literal_eval / str.split on values that were already
# converted to lists would otherwise raise.
for col in ('genres', 'characters', 'setting'):
    df[col] = df[col].apply(_as_list)
df['author'] = df['author'].apply(_split_authors)

# Metadata "soup": authors, genres, characters and settings as one string.
df['soup'] = df['author'] + df['genres'] + df['characters'] + df['setting']
df['soup'] = df['soup'].apply(lambda x: ' '.join(x))

# Clean the description. Missing descriptions are replaced by "" first so they
# do not end up in the soup as the literal token "nan".
df['description'] = df['description'].fillna('').astype(str).apply(clean_and_tokenize)
df['soup'] = df['soup'] + " " + df['description']
print(df['soup'].iloc[0])

# Encode every soup with the sentence encoder loaded earlier; get_top_k_recos
# scores queries against `encoded_df`.
np_df = df['soup'].to_numpy()
encoded_df = model.encode(np_df)

def get_top_k_recos(query,k):
    """Return the titles of the books most similar to a free-text query.

    The query is embedded with the global ``model`` and scored by cosine
    similarity against the pre-computed ``encoded_df`` embeddings.

    Args:
        query: Text to search for (the notebook passes book titles).
        k: Maximum number of titles to return.

    Returns:
        A pandas Series of titles taken from ``df['title']``, ordered as
        returned by ``torch.topk`` (highest score first). It holds ``k``
        entries, or fewer when fewer books are encoded.
    """
    qry_e = model.encode(query)
    scores = util.cos_sim(qry_e,encoded_df)
    # torch.topk raises when k exceeds the number of candidates, so clamp k to
    # the number of scored books (last dimension of the score matrix).
    k = max(0, min(int(k), scores.shape[-1]))
    _, indices = torch.topk(scores,k)
    # .cpu() is a no-op on CPU tensors and keeps .numpy() valid if the scores
    # were computed on another device.
    return df['title'].iloc[indices[0].cpu().numpy()]

Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) District 12, Panem Capitol, Panem Panem (United States) winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win 

In [6]:
# Spot-check the first encoder input; `np_df` is the NumPy view of df['soup'],
# so this should match the soup printed by the previous cell.
print(np_df[0])

Suzanne Collins Young Adult Fiction Dystopia Fantasy Science Fiction Romance Adventure Teen Post Apocalyptic Action Katniss Everdeen Peeta Mellark Cato (Hunger Games) Primrose Everdeen Gale Hawthorne Effie Trinket Haymitch Abernathy Cinna President Coriolanus Snow Rue Flavius Lavinia (Hunger Games) Marvel Glimmer Clove Foxface Thresh Greasy Sae Madge Undersee Caesar Flickerman Claudius Templesmith Octavia (Hunger Games) Portia (hunger Games) District 12, Panem Capitol, Panem Panem (United States) winning means fame fortune losing means certain death hunger games begun ruins place known north america lies nation panem shining capitol surrounded twelve outlying districts capitol harsh cruel keeps districts line forcing send one boy girl ages twelve eighteen participate annual hunger games fight death live tv sixteen year old katniss everdeen regards death sentence steps forward take sister place games katniss close dead survival second nature without really meaning becomes contender win 

In [7]:
# Example query: up to 20 catalogue titles ranked by similarity to a known book
# title. In the recorded output the queried book itself is among the results.
get_top_k_recos("Harry Potter and the Order of the Phoenix",20)

71                 Harry Potter and the Deathly Hallows
32                Harry Potter and the Sorcerer's Stone
93             Harry Potter and the Prisoner of Azkaban
1             Harry Potter and the Order of the Phoenix
8     J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...
7                              The Chronicles of Narnia
44                             A Thousand Splendid Suns
68                                   Great Expectations
77                           The Fellowship of the Ring
98                           Interview with the Vampire
0                                      The Hunger Games
59                                      The Kite Runner
46                                    A Game of Thrones
90                                   The Golden Compass
14                                    The Da Vinci Code
3                                   Pride and Prejudice
85                               The Brothers Karamazov
29                                         Ender

In [8]:
# Load the evaluation set. A forward-slash relative path is used so the
# notebook runs on any OS; the previous backslash form (r'.\small_eval.csv')
# only resolves to a file in the working directory on Windows.
evaldf = pd.read_csv('./small_eval.csv', index_col=False, encoding="utf-8")
# `recommended` is stored as a Python literal in the CSV (presumably a list of
# titles - TODO confirm); parse each cell into the actual object.
evaldf['recommended'] = evaldf['recommended'].apply(literal_eval)

def getr(booktitle,k = 20):
    """Build the binary relevance vector of the top-k recommendations for a book.

    Args:
        booktitle: Title to query; must appear in the ``title`` column of
            ``evaldf``.
        k: Number of ranks to score.

    Returns:
        A list of exactly ``k`` ints. Entry ``i`` is 1 when the title predicted
        at rank ``i`` is contained in the book's ``recommended`` ground truth,
        otherwise 0. Ranks for which no prediction was returned score 0.

    Raises:
        KeyError: If ``booktitle`` has no row in ``evaldf``.
    """
    predictions = get_top_k_recos(booktitle,k)
    matches = evaldf.loc[evaldf['title'] == booktitle, 'recommended']
    if matches.empty:
        raise KeyError(f"no ground-truth recommendations for {booktitle!r} in evaldf")
    # Take the first matching row directly. The previous code fed the row's
    # index *label* into .iloc, which is only correct for a default RangeIndex.
    actual = set(matches.iloc[0])

    r = [1 if title in actual else 0 for title in predictions.iloc[:k]]
    # Pad with zeros when fewer than k predictions came back, so the result
    # always has length k instead of raising IndexError.
    r.extend([0] * (k - len(r)))
    return r


In [9]:
# Evaluation settings, defined once so the title slice, the loop and the
# metric depth cannot drift apart.
N_EVAL_BOOKS = 30   # number of leading catalogue titles to evaluate
TOP_K = 20          # ranking depth used for both retrieval and NDCG

showcase = df['title'].values[0:N_EVAL_BOOKS]
ndcg_at_k_list = []
rs=[]
# Iterate over the sliced titles themselves rather than a separate
# range(30), so a catalogue with fewer rows is handled without IndexError.
for title in showcase:
    r=getr(title, TOP_K)
    rs.append(r)
    ndcg_at_k_list.append(ndcg_at_k(r,TOP_K))

# NOTE(review): the column is labelled 'movies' although the rows are book
# titles; the key is kept unchanged in case other cells rely on it.
results = pd.DataFrame({'movies': showcase, 'ndcg': ndcg_at_k_list})

print(results.head())

# Mean average precision over all evaluated relevance vectors.
print(mean_average_precision(rs))

                                      movies      ndcg
0                           The Hunger Games  0.815465
1  Harry Potter and the Order of the Phoenix  0.354200
2                      To Kill a Mockingbird  0.231378
3                        Pride and Prejudice  0.596280
4                                   Twilight  0.492987
0.2525216399520726
