In [66]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
import numpy as np
import re
import torch
# nltk.download('stopwords')

In [77]:
# Load the book catalogue from the notebook's working directory.
# A bare relative filename resolves the same way on every OS; the earlier
# r'.\booklist.csv' only worked on Windows, because '\' is not a path
# separator on POSIX systems.
df = pd.read_csv('booklist.csv', index_col=False)

# Column names listed for reference (presumably the full CSV header -- TODO confirm);
# not used by any of the statements in this cell.
col_names=['bookId','title','series','author','rating','description','lang','isbn','genres','characters','bookForm','edition','pages','publisher','publishDate','firstPublished','awards','numRating','ratingsByStars','likedPercent','setting','coverImg','bbeScore','bbeVotes','price']
# Subset of columns selected for the preview frame below.
wanted_cols = ['bookId','title','series','author','rating','description','genres','characters','setting','coverImg']

# Five-row preview restricted to the wanted columns.
filtered_df = df[wanted_cols].head(5)
# Keep only the first 10,000 rows; every later use of `df` sees this truncated frame.
df = df.head(10000)

# print(df['description'].iloc[0])


WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against humanity and life against love.


In [33]:
# Sanity check of the 'multi-qa-MiniLM-L6-cos-v1' model: embed one query and
# two candidate passages, then score the query against both passages.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

query_text = 'How big is London'
passage_texts = [
    'London has 9,787,426 inhabitants at the 2011 census',
    'London is known for its finacial district',
]
query_embedding = model.encode(query_text)
passage_embedding = model.encode(passage_texts)

# Dot-product scores first, cosine similarities second
# (the recorded output shows the same values for both).
dot_scores = util.dot_score(query_embedding, passage_embedding)
print("Similarity:", dot_scores)
cosine_scores = util.cos_sim(query_embedding, passage_embedding)
print(cosine_scores)

Similarity: tensor([[0.5472, 0.6330]])
tensor([[0.5472, 0.6330]])


In [35]:
# SEMANTIC SEARCH
# MS MARCO models are meant for asymmetric semantic search, see
# https://www.sbert.net/examples/applications/semantic-search/README.html
# This rebinds `model`; any cell executed afterwards that calls model.encode uses this model.
model = SentenceTransformer('msmarco-MiniLM-L-6-v3')

query_text = 'How big is London'
passage_text = 'London has 9,787,426 inhabitants at the 2011 census'
query_embedding = model.encode(query_text)
passage_embedding = model.encode(passage_text)

# Cosine similarity between the single query and the single passage.
similarity = util.cos_sim(query_embedding, passage_embedding)
print("Similarity:", similarity)

Downloading: 100%|██████████| 736/736 [00:00<00:00, 243kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 182kB/s]
Downloading: 100%|██████████| 3.68k/3.68k [00:00<00:00, 1.14MB/s]
Downloading: 100%|██████████| 627/627 [00:00<00:00, 604kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 61.5kB/s]
Downloading: 100%|██████████| 90.9M/90.9M [00:08<00:00, 10.1MB/s]  
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 15.8kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 22.4kB/s]
Downloading: 100%|██████████| 466k/466k [00:01<00:00, 381kB/s]  
Downloading: 100%|██████████| 430/430 [00:00<00:00, 71.1kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 246kB/s]  
Downloading: 100%|██████████| 229/229 [00:00<00:00, 115kB/s]


Similarity: tensor([[0.6061]])


In [79]:
# Inspect the Python type of the first description value.
description_column = df['description']
x = description_column.iloc[0]
print(type(x))

_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_and_tokenize(x):
    """Normalise one text into a space-joined string of non-stop-word tokens.

    Steps: replace every character that is not an ASCII letter, digit or
    whitespace with a space, lower-case and strip the result, tokenize it with
    NLTK's ``word_tokenize`` and drop English stop words.

    Parameters
    ----------
    x : str
        Raw text. Any non-string value (for example the NaN pandas uses for a
        missing cell, or None) is treated as empty text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' when nothing is left
        or when ``x`` is not a string.

    Raises
    ------
    LookupError
        Raised by NLTK when the 'stopwords' corpus or the tokenizer data has
        not been downloaded.
    """
    # Missing values are not strings; return empty text instead of letting
    # re.sub fail with a TypeError.
    if not isinstance(x, str):
        return ''
    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub is `count`, so passing re.I|re.A positionally silently capped the
    # number of replacements and never applied the flags.
    d = re.sub(r'[^a-zA-Z0-9\s]', ' ', x, flags=re.I | re.A)
    d = d.lower().strip()
    # nltk.download('punkt')
    tks = word_tokenize(d)
    stopwords = _english_stopwords()
    return ' '.join(t for t in tks if t not in stopwords)



# print(df['description'].to_numpy())
# Clean every description and embed the cleaned texts with the current `model`.
# Missing descriptions are replaced with '' so that only strings reach the
# cleaner and every row of `df` still gets an embedding at the same position
# (get_top_k_recos maps score positions back to df rows with .iloc).
np_df = df['description'].fillna('').to_numpy()
# np.vectorize applies the Python function element by element; it is a
# convenience wrapper, not a speed-up.
v_clean_tokenize = np.vectorize(clean_and_tokenize)
clean_df = v_clean_tokenize(np_df)
encoded_df = model.encode(clean_df)

def get_top_k_recos(query,k):
    """Print the titles of the books whose descriptions best match ``query``.

    The query is encoded with the module-level ``model`` and compared by cosine
    similarity against the pre-computed ``encoded_df`` embeddings; the titles
    of the highest-scoring rows of ``df`` are printed, best match first.

    Parameters
    ----------
    query : str
        Free-text query (the recorded run used a book title).
    k : int
        Maximum number of titles to print. Values larger than the number of
        encoded descriptions are capped to that number.

    Returns
    -------
    None
        Results are printed, one title per line.
    """
    # NOTE(review): encoded_df is built from cleaned text (lower-cased,
    # stop words removed) while the query is encoded verbatim -- confirm
    # that this asymmetry is intended.
    qry_e = model.encode(query)
    scores = util.cos_sim(qry_e, encoded_df)
    # torch.topk raises if k exceeds the number of candidates; cap it so
    # asking for more results than exist simply returns all of them.
    k = min(k, scores.shape[-1])
    _, indices = torch.topk(scores, k)
    # scores has a single row (one query); tolist() yields plain Python ints
    # regardless of the device the tensor lives on.
    for i in indices[0].tolist():
        print(df['title'].iloc[i])

<class 'str'>
tensor([[ 0.1455,  0.4684, -0.0570,  0.1316,  0.1725,  0.0641,  0.0705,  0.2217,
          0.2930,  0.0423, -0.0801,  0.0227, -0.0327,  0.1124,  0.0999,  0.0032,
          0.0251, -0.0505,  0.0383,  0.0826, -0.0773,  0.0019,  0.0339,  0.1310,
          0.0779,  0.1273,  0.0232,  0.0335,  0.1238,  0.0747,  0.0478,  0.0074,
          0.5455, -0.0601, -0.0692,  0.1075, -0.0484,  0.1511,  0.2627,  0.0352,
          0.1395, -0.0025,  0.1626,  0.0933,  0.1485,  0.0821,  0.1566, -0.0006,
         -0.1008,  0.0186]])
tensor([[32,  1,  8, 38,  7,  4, 42, 46, 37, 44]])
Harry Potter and the Sorcerer's Stone
Harry Potter and the Order of the Phoenix
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings
Brave New World
The Chronicles of Narnia
Twilight
The Lightning Thief
A Game of Thrones
Dracula
A Thousand Splendid Suns


In [80]:
get_top_k_recos("Harry Potter and the Order of the Phoenix",20)

tensor([[ 0.1404,  0.3336,  0.0081,  0.2122,  0.2114,  0.0761,  0.1129,  0.2133,
          0.2926,  0.1169,  0.1255,  0.0478, -0.0302,  0.1544,  0.0172, -0.0856,
          0.0030, -0.0304, -0.0027,  0.0573, -0.0498,  0.0021,  0.0876,  0.1139,
         -0.0209,  0.1502, -0.0340,  0.1112,  0.0685,  0.1476,  0.0523,  0.0012,
          0.3676,  0.0352, -0.0587, -0.0333, -0.0054,  0.0894,  0.1896,  0.0170,
          0.0749,  0.0505,  0.1480,  0.0215,  0.2208,  0.0518,  0.1837, -0.0102,
         -0.0585,  0.0340]])
tensor([[32,  1,  8, 44,  7,  3,  4, 38, 46, 13]])
Harry Potter and the Sorcerer's Stone
Harry Potter and the Order of the Phoenix
J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings
A Thousand Splendid Suns
The Chronicles of Narnia
Pride and Prejudice
Twilight
Brave New World
A Game of Thrones
Wuthering Heights
