## Q-3

As requested, we have added a new filter. We ask the user to enter a query and a minimum rating for the books. We define a new score that takes the rating into account: the rating is normalized (divided by 5) and then multiplied by the cosine similarity between the query and the book. Finally, we show the 10 books with the highest new score.

In [1]:
import nltk

import io 
import codecs
import csv
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import stem

import re
import pandas as pd

from nltk.corpus import stopwords
from string import punctuation
from bs4 import BeautifulSoup as soup
from tqdm import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
def cleaner(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is split on runs of non-word characters and lowercased. A token
    is dropped when it is found in ``string.punctuation``, is an NLTK English
    stopword, is one of a small hand-picked list of extra words, contains the
    substring ``freetext``, or contains a digit. The survivors are stemmed
    with the Porter stemmer.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The stemmed tokens joined by single spaces (``''`` if none survive).
    """
    # Sets give O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))
    extra_words = {'b', 'br', 'span', 'one', 'id', 'none'}
    digit_pattern = re.compile(r'\d')  # compiled once, reused for every token
    stemmer = nltk.stem.PorterStemmer()

    tokens = (tok.lower() for tok in re.split(r'\W+', str(text)))
    kept = [
        tok for tok in tokens
        # `punctuation` is a str, so this is a substring test: it removes the
        # empty strings produced at the edges of the split and a lone "_".
        if tok not in punctuation
        and tok not in stop_words
        and tok not in extra_words
        and 'freetext' not in tok
        and not digit_pattern.search(tok)
    ]
    return ' '.join(stemmer.stem(tok) for tok in kept)

In [3]:
def find_link(index):
    """Return the ``href`` of the first ``<link>`` tag of a saved book page.

    The page is read from
    ``books_2/html_books_<folder>/article_<index>.html``.

    Args:
        index: Integer article number.

    Returns:
        The value of the ``href`` attribute of the first ``<link>`` element
        found in the HTML file.

    Raises:
        FileNotFoundError: If the HTML file for ``index`` does not exist.
    """
    ind = str(index)
    # The folder number is the first three digits of the index, except that
    # an exact multiple of 100 belongs to the previous folder
    # (e.g. 10100 -> folder 100).
    # NOTE(review): slicing ind[0:3] presumably assumes five-digit indices --
    # confirm against the directory layout before using other ranges.
    if index % 100 == 0:
        folder = int(ind[0:3]) - 1
    else:
        folder = ind[0:3]

    path = 'books_2/html_books_' + str(folder) + '/article_{}.html'.format(ind)
    # Context manager closes the handle; the original left it open.
    with open(path, 'r', encoding='utf-8') as html_file:
        source_code = html_file.read()

    book = soup(source_code, 'html.parser')
    book_link = book.find('link')

    return book_link['href']

In [82]:
# Build a CSV file of all books with the cleaned book title, normalized rating,
# cleaned plot (title + plot), url, and empty Similarity and Score columns.
# Similarity and Score are filled in later, once the user's query is known.

title_list = []
plot_list = []
url_list = []
rating_list = []

for index in tqdm(range(10001, 11000)):
    try:
        df = pd.read_csv("books_2_tsv/" + str(index) + ".tsv", sep='\t', encoding='utf-8')
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or empty/unparsable/undecodable
        # TSV (pandas parser errors and UnicodeDecodeError are ValueErrors):
        # skip this book. Unlike a bare `except:`, Ctrl-C still interrupts.
        continue

    # Only the first row of each TSV is used, so only that row is cleaned.
    title = cleaner(df['book_title'].iloc[0])
    plot = cleaner(df['complete_plot'].iloc[0])
    complete_plt = title + " " + plot
    # Split and re-join so the stored text is single-space separated.
    plot_split = re.split(r'\W+', str(complete_plt))
    rate = df['rating'].iloc[0] / 5  # Normalize the rating

    url = find_link(index)
    title_list.append(title)
    plot_list.append(' '.join(plot_split))
    url_list.append(url)
    rating_list.append(rate)

# 'Similarity' and 'Score' are not in the data dict, so they are created as
# empty (NaN) columns in the requested column order.
df_new = pd.DataFrame(
    {
        'Book Title': title_list,
        'Rating': rating_list,
        'Plot': plot_list,
        'Url': url_list,
    },
    columns=['Book Title', 'Rating', 'Plot', 'Url', 'Similarity', 'Score'],
)
df_new.to_csv('tfidf_cos2.csv', index=False, header=True)
    

100%|████████████████████████████████████████████████████████████████████████████████| 999/999 [01:44<00:00,  9.53it/s]


In [83]:
# Load the book table from tfidf_cos2.csv into df_new.
df_new = pd.read_csv("tfidf_cos2.csv")

In [87]:
# Compute TF-IDF vectors for all books, score every book against the user's
# query (cosine similarity * normalized rating), keep only the books that meet
# the minimum rating, and show the 10 best-scoring ones.

vectorizer = TfidfVectorizer()
# Fit on the cleaned plots; astype('U') turns every cell into a unicode string.
X = vectorizer.fit_transform(df_new['Plot'].values.astype('U'))

query = str(input('Search something: '))  # Ask the user to write the query
# Built-in float: the `np.float` alias was removed from NumPy.
min_rating = float(input('Minimum rating: '))

# The plots were normalized with cleaner() (lowercased, stopwords removed,
# stemmed), so the query must go through the same pipeline or its words
# cannot match the fitted vocabulary.
query_vec = vectorizer.transform([cleaner(query)])
results = cosine_similarity(X, query_vec).reshape((-1,))  # one similarity per book
cos_sim = results.tolist()
df_new['Similarity'] = cos_sim
# Score = cosine similarity * normalized rating, computed column-wise.
df_new['Score'] = df_new['Similarity'] * df_new['Rating']

# 'Rating' holds rating / 5, so compare against the threshold on the same
# scale (this also avoids rounding error from multiplying back by 5).
# A boolean mask keeps only qualifying rows; the previous loop concatenated
# the whole table whenever any row qualified and left `df_updated` undefined
# when none did.
df_updated = df_new[df_new['Rating'] >= min_rating / 5]

df_updated.nlargest(10, 'Score')  # The 10 books with the highest score

Search something: last freedom
Minimum rating: 3.5


100%|███████████████████████████████████████████████████████████████████████████████| 944/944 [00:01<00:00, 845.09it/s]


Unnamed: 0,Book Title,Rating,Plot,Url,Similarity,Score
133,chain,0.82,chain revolutionari war begin thirteen year ol...,https://www.goodreads.com/book/show/3002300-ch...,0.223392,0.183182
886,even hint guard heart lust,0.806,even hint guard heart lust harri author kiss d...,https://www.goodreads.com/book/show/696703.Not...,0.166045,0.133832
897,truth,0.828,truth book book seri last two month brianna di...,https://www.goodreads.com/book/show/31315152-t...,0.136968,0.113409
0,fearless,0.766,fearless smartli paint exterior citi commun fa...,https://www.goodreads.com/book/show/1206359.Fe...,0.142722,0.109325
898,last vampir,0.784,last vampir alisa pern last vampir beauti bril...,https://www.goodreads.com/book/show/137972.The...,0.133416,0.104598
449,devil altern,0.812,devil altern russia face famin soviet forc pin...,https://www.goodreads.com/book/show/825473.The...,0.123989,0.100679
659,refuge,0.892,refuge three differ kid mission common escap j...,https://www.goodreads.com/book/show/33118312-r...,0.107656,0.096029
252,case peculiar pink fan,0.828,case peculiar pink fan enola holm encount frie...,https://www.goodreads.com/book/show/3039234-th...,0.106066,0.087823
440,wrong world,0.808,wrong world aptli titl treatis wrong world twe...,https://www.goodreads.com/book/show/184565.Wha...,0.10654,0.086084
471,avatar last airbend promis part,0.866,avatar last airbend promis part war adventur b...,https://www.goodreads.com/book/show/12413836-a...,0.09799,0.084859
