# 2. Search Engine

In [1]:
import pandas as pd
import glob
import os
import re
from langdetect import detect
import nltk

ModuleNotFoundError: No module named 'langdetect'

In [None]:
from our_functions import *

In [None]:
# Collect every file with the chosen extension from the current directory.
extension = 'tsv'
filenames = glob.glob('*.' + extension)

In [None]:
column_names = ['title', 'series', 'author', 'ratingValue', 'ratingCount', 'plots', 'reviewCount', 'date', 'characters', 'settings', 'url']

In [None]:
# Read every TSV file (no header row, fixed column names) and stack the rows.
dataset = pd.concat(
    [
        pd.read_csv(path, sep='\t', header=None, names=column_names)
        for path in filenames
    ],
    axis=0,
)

In [None]:
# The first run of digits in each file name is used as the document number.
numbers = [int(re.search("[0-9]+", name).group(0)) for name in filenames]

In [None]:
dataset["index"] = numbers

In [None]:
ds = dataset.sort_values(by=['index'])

In [None]:
ds.head()

In [None]:
#drop rows where there is not plot
ds = ds.dropna(subset = ['plots'])

In [None]:
#check if the plot is in english
def is_english(plot):
    """Return True when ``detect`` classifies ``plot`` as English.

    Any error raised by ``detect`` (for instance on an empty plot) is treated
    as "not English", so the corresponding row is filtered out by the caller.
    """
    try:
        return detect(plot) == 'en'
    except Exception:
        # Detection failed (e.g. the plot is empty). Catching Exception rather
        # than using a bare ``except`` lets KeyboardInterrupt and SystemExit
        # propagate, so a long-running filter can still be interrupted.
        return False

In [None]:
#keep only the rows whose plot is detected as English
df = ds[[is_english(plot) for plot in ds['plots']]]

In [None]:
# Take an explicit copy: a 'words' column is added to this frame later, and
# assigning to a slice of df would trigger pandas' SettingWithCopyWarning.
plots = df[['plots', 'index']].copy()

In [None]:
plots.head()

In [None]:
#save the dataset of file
df.to_csv('books_dataset.tsv', sep = '\t', index=False)    

### Remove Stop Word

In [None]:
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

In [None]:
from nltk.tokenize import RegexpTokenizer

In [None]:
 # NOTE(review): called without a resource name - presumably this opens
 # NLTK's interactive downloader; confirm which resource is actually needed.
 nltk.download()

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
def remove_stop_word(phrases):
    """Split ``phrases`` into lowercase alphabetic tokens without stop words.

    The text is lowercased, every maximal run of the letters a-z becomes a
    token, and tokens found in the module-level ``stop_words`` set are
    dropped. The remaining tokens are returned as a list in their original
    order.
    """
    lowered = phrases.lower()
    tokens = RegexpTokenizer(r'[a-z]+').tokenize(lowered)
    kept = []
    for token in tokens:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [None]:
# Tokenize each plot directly from the 'plots' column; a column-wise apply
# avoids building a row object per record and also works on an empty frame.
plots['words'] = plots['plots'].apply(remove_stop_word)

In [None]:
plots.head()

In [None]:
words = plots[['words', 'index']]

In [None]:
words.head()

In [None]:
words = words.explode(column='words')

In [None]:
inverted_index = words.groupby('words')['index'].apply(list).to_dict()

In [None]:
keys = list(inverted_index.keys())

In [None]:
for k in keys[500:510]:
    print(k,':', inverted_index[k])

In [None]:
#save on file
import json
with open('inverted_index.json', 'w') as fp:
    json.dump(inverted_index, fp)

## 2.1.2) Execute the query

In [None]:
import json
import pandas as pd
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

In [None]:
from functools import reduce

In [None]:
#load the file 
with open('inverted_index.json','r') as fp:
    inverted_index = json.load(fp)

In [None]:
#load the dateset
dataset = pd.read_csv('books_dataset.tsv', sep='\t', index_col=None)

In [None]:
dataset = dataset[['title', 'plots','url', 'index']]

In [None]:
dataset = dataset.rename(columns={'index': 'number'})

In [None]:
def clean_text(text):
    """Return the lowercase alphabetic tokens of ``text`` without stop words.

    The text is lowercased and split into maximal runs of the letters a-z;
    tokens that are English stop words are then removed. The filtered list is
    returned in the original token order.
    """
    stop_words = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r"[a-z]+")
    tokens = tokenizer.tokenize(text.lower())
    # Return the filtered list: the previous version computed it but returned
    # the unfiltered tokens, so stop words were never removed.
    return [word for word in tokens if word not in stop_words]

In [None]:
def search_engine_io(dataset, inverted_index):
    """Read a query from standard input and run it through ``search_engine``.

    Returns whatever ``search_engine`` returns for the typed query.
    """
    return search_engine(dataset, inverted_index, input('Insert words to seach: '))

In [None]:
def search_engine(dataset, inverted_index, query):
    """Return the set of document numbers that contain every query word.

    Parameters:
        dataset: unused here; kept so existing callers keep working.
        inverted_index: mapping from a word to the list of document numbers
            whose plot contains it.
        query: free-text query; it is normalised with ``clean_text``.

    Returns:
        A set with the documents shared by all query words. The set is empty
        when the query has no usable words or when any word is missing from
        the index.
    """
    query_items = clean_text(query)
    if not query_items:
        # Nothing to intersect; reducing an empty list would raise TypeError.
        return set()
    # A word missing from the index matches no document, so it contributes an
    # empty set instead of raising KeyError.
    numbers_docs = [set(inverted_index.get(q, ())) for q in query_items]
    return set.intersection(*numbers_docs)

In [None]:
# Example query: survival, games
results = search_engine_io(dataset, inverted_index)
# Display the rows whose document number is among the search results.
dataset[dataset['number'].isin(results)]

# 3. Define a new score!

Our third search engine allows the user to insert two new parameters.

In [134]:
import editdistance
import itertools
from heapq import *

In [174]:
pd.set_option('display.max_colwidth', None)

In [235]:
# The file written earlier stores the document number in a column called
# 'index', so it is renamed to 'number' before becoming the row index
# (the rename is a no-op if the column is already called 'number').
df = pd.read_csv('books_dataset.tsv', sep='\t')
df = df.rename(columns={'index': 'number'}).set_index('number')

So now we are going to define a new score for our search. We know that people search for books mainly by considering the author, the plot and the ratings given to the book by readers. We are going to mainly consider those features to rank the books according to the searches.

First we have to get extra information for the search from the user.
The user will insert the author, the rating and the plot words they are referring to.

In [236]:
#handler the input output from user to search engine
def search_engine3_io(k=5):
    """Ask the user for the search parameters and run ``search_engine_3``.

    The plot words are mandatory and are requested until something is typed.
    The author and the rating value are optional: when left blank, the
    defaults of ``search_engine_3`` are used.

    Parameters:
        k: maximum number of books to return.

    Returns:
        The DataFrame produced by ``search_engine_3``.

    Uses the module-level ``df`` (the frame indexed by document number, which
    holds the 'author' and 'ratingValue' columns) and ``inverted_index``.
    """
    plot = input('Insert words in plot:')
    while not plot.strip():
        # Keep the answer this time; it used to be read and thrown away.
        plot = input('Please insert words in plot:')

    optional = {}
    author = input('insert author:').strip()
    if author:
        optional['query_author'] = author

    while True:
        rating_text = input('insert a rating value:').strip()
        if not rating_text:
            break
        try:
            # The rating is compared numerically, so convert it from text.
            optional['query_rating_value'] = float(rating_text)
            break
        except ValueError:
            print('The rating value must be a number.')

    # Keyword arguments keep the author and the rating from being swapped.
    return search_engine_3(k, df, inverted_index, plot, **optional)
    

 According to the values given by the user, we calculate the edit distance for our search results

In [364]:
def compute_distance(query_author, query_rating_value, num_doc, df):
    """Return the distance between a document and the user's preferences.

    Parameters:
        query_author: author typed by the user; a falsy value means the
            author was not requested.
        query_rating_value: rating typed by the user.
        num_doc: label of the document's row in ``df``.
        df: frame with the 'author' and 'ratingValue' columns.

    Returns:
        The mean of the normalised author distance and the normalised rating
        distance, rounded to two decimals.
    """
    distance_author = 1  # used when no author is requested or stored
    if query_author:
        author_doc = df.loc[num_doc, 'author']
        # A missing author is not a string and an empty one has length 0;
        # both would break the normalisation below, so they keep distance 1.
        if isinstance(author_doc, str) and author_doc:
            distance_author = editdistance.eval(author_doc, query_author)
            # Normalise by the stored name's length and cap at 1: a query
            # longer than the stored name could otherwise exceed 1.
            distance_author = min(distance_author / len(author_doc), 1)

    # Distance between the rating values, divided by 5 to normalise it.
    rating_value_doc = df.loc[num_doc, 'ratingValue']
    rating_value_distance = round(abs(query_rating_value - rating_value_doc), 2)
    rating_value_distance /= 5

    # Both components weigh the same.
    return round(0.5 * distance_author + 0.5 * rating_value_distance, 2)
    

Add and remove documents to the heap according to the priority

In [328]:
# State shared by add_doc. It must live at module scope because add_doc looks
# these names up globally; without them the first call raises NameError.
counter = itertools.count()  # insertion order, breaks ties between distances
entry_finder = {}            # document number -> its heap entry


def add_doc(pq, doc_num, distance=0):
    """Push a document onto the heap ``pq`` with ``distance`` as priority.

    The entry is ``[distance, count, doc_num]``: the running count makes
    documents with the same distance come out in insertion order. The entry
    is also recorded in ``entry_finder`` under ``doc_num``.
    """
    entry = [distance, next(counter), doc_num]
    entry_finder[doc_num] = entry
    heappush(pq, entry)
    
def pop_doc(pq):
    """Pop the smallest entry from the heap and return ``[doc, distance]``.

    Entries are ``[distance, count, doc]`` triples; the count is only a
    tie-breaker and is not returned.
    """
    priority, _order, document = heappop(pq)
    return [document, priority]

Build the priority queue according to the calculated edit distances

In [315]:
#build a priority queue
def build_pq(pq, df, inverted_index, plot='', query_author='', query_rating_value=5):
    """Fill ``pq`` with the documents matching ``plot`` and return it.

    Every document returned by ``search_engine`` is pushed with ``add_doc``,
    using the value of ``compute_distance`` as its priority.
    """
    matching_docs = search_engine(df, inverted_index, plot)
    for doc_id in matching_docs:
        doc_distance = compute_distance(query_author, query_rating_value, doc_id, df)
        add_doc(pq, doc_id, doc_distance)
    return pq

New search engine that ranks books according to the score given by calculating edit distances for the user's extra information 

In [376]:
def search_engine_3(k, df, inverted_index, plot='', query_author='', query_rating_value=5):
    """Return up to ``k`` books matching ``plot``, closest to the user's wishes first.

    Parameters:
        k: maximum number of books to return.
        df: frame indexed by document number with the 'title', 'author',
            'ratingValue', 'plots' and 'url' columns.
        inverted_index: mapping from a word to the documents containing it.
        plot: words that must appear in the plot.
        query_author: preferred author ('' when not requested).
        query_rating_value: preferred rating.

    Returns:
        A DataFrame with the selected books and a 'Similarity' column equal
        to ``1 - distance``, ordered from most to least similar.
    """
    # add_doc() looks these names up at module scope, so they are rebound as
    # globals; as locals they would never be seen by it.
    global counter, entry_finder
    counter = itertools.count()
    entry_finder = {}

    pq = build_pq([], df, inverted_index, plot, query_author, query_rating_value)

    # Never pop more entries than the heap holds: a query with fewer than k
    # matches used to raise IndexError.
    closest = [pop_doc(pq) for _ in range(min(k, len(pq)))]
    doc_result = [doc for doc, _ in closest]
    distance_doc_result = [distance for _, distance in closest]

    df_result = df.loc[doc_result, ['title', 'author', 'ratingValue', 'plots', 'url']].copy()
    # Rounding hides floating point noise such as 0.7100000000000001.
    df_result['Similarity'] = [round(1 - x, 2) for x in distance_doc_result]
    return df_result
       

Run and test the algorithm

In [377]:
search_engine_3(5,df, inverted_index, plot='pride', query_author='austen', query_rating_value=4)

Unnamed: 0_level_0,title,author,ratingValue,plots,url,Similarity
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
377,Mansfield Park,Jane Austen,3.86,"Adopted into the household of her uncle, Sir Thomas Bertram, Fanny Price grows up a meek outsider among her cousins in the unaccustomed elegance of Mansfield Park. Soon after Sir Thomas absents himself on estate business in Antigua (the family's investment in slavery and sugar is considered in the Introduction in a new, post-colonial light), Mary Crawford and her brother Henry arrive at Mansfield, bringing with them London glamour, and the seductive taste for flirtation and theatre that precipitates a crisis. While Mansfield Park appears in some ways to continue where Pride and Prejudice left off, it is, as Kathryn Sutherland shows in her illuminating Introduction, a much darker work, which challenges 'the very values (of tradition, stability, retirement and faithfulness) it appears to endorse'. This new edition provides an accurate text based, for the first time since its original publication, on the first edition of 1814.",https://www.goodreads.com/review/show/2756260,0.76
4,Pride and Prejudice,Jane Austen,4.26,"Since its immediate success in 1813, Pride and Prejudice has remained one of the most popular novels in the English language. Jane Austen called this brilliant work ""her own darling child"" and its vivacious heroine, Elizabeth Bennet, ""as delightful a creature as ever appeared in print."" The romantic clash between the opinionated Elizabeth and her proud beau, Mr. Darcy, is a splendid performance of civilized sparring. And Jane Austen's radiant wit sparkles as her characters dance a delicate quadrille of flirtation and intrigue, making this book the most superb comedy of manners of Regency England.",https://www.goodreads.com/book/show/1885.Pride_and_Prejudice,0.75
358,The Complete Novels,Jane Austen,4.56,"This volume contains the six major novels: ""Emma"", ""Mansfield Park"", ""Northanger Abbey"", ""Persuasion"", ""Sense and Sensibility"", ""Pride and Prejudice"".",https://www.goodreads.com/review/show/2506231654,0.72
29798,The Gladiator's Master,Fae Sutherland,3.9,"When Roman politician Caelius inherits a stable of gladiators, there is one who captures his attention above the others...one whose eyes gleam with hate, pride and desire.Forced into slavery by Roman greed, Gaidres can barely conceal his contempt toward his new Dominus. Gaidres has a plan: kill Caelius and end the lineage of the Roman family that enslaved him. For his plan to succeed, he must make a show of respect and obedience--even when called on to service his master's desires.Gaidres is shocked to learn that in the confines of his quarters, Caelius doesn't want to dominate his slave, but to be taken by him. The sex is explosive as they break society's taboos and, to Gaidres's dismay, they form a tenuous relationship. Even when Caelius learns of Gaidres's plans for revenge, he knows he can't live without his perfect lover. Is he willing to risk it all to tame his gladiator's heart?88,000 words",https://www.goodreads.com/review/show/337968509,0.63
20084,Heretics,G.K. Chesterton,4.18,"G. K. Chesterton, the ""Prince of Paradox,"" is at his witty best in this collection of twenty essays and articles from the turn of the twentieth century. Focusing on ""heretics"" — those who pride themselves on their superiority to Christian views — Chesterton appraises prominent figures who fall into that category from the literary and art worlds. Luminaries such as Rudyard Kipling, George Bernard Shaw, H. G. Wells, and James McNeill Whistler come under the author's scrutiny, where they meet with equal measures of his characteristic wisdom and good humor.In addition to incisive assessments of well-known individuals (""Mr. Rudyard Kipling and Making the World Small"" and ""Mr. H. G. Wells and the Giants""), these essays contain observations on the wider world. ""On Sandals and Simplicity,"" ""Science and the Savages,"" ""On Certain Modern Writers and the Institution of the Family,"" ""On Smart Novelists and the Smart Set,"" and ""Slum Novelists and the Slums"" reflect the main themes of Chesterton's life's work. Heretics roused the ire of some critics for censuring contemporary philosophies without providing alternatives; the author responded a few years later with a companion volume, Orthodoxy. Sardonic, jolly, and generous, both books are vintage Chesterton.He is criticizing those who hold incomplete and inadequate views about ""life, the universe, and everything."" He is, in short, criticizing all that host of non-Christian views of reality, as he demonstrated in his follow-up book Orthodoxy. The book is both an easy read and a difficult read. But he manages to demonstrate, among other things, that our new 21st century heresies are really not new because he himself deals with most of them.",https://www.goodreads.com/review/show/58292307,0.62
