##  ADM - Homework 3 - What is the best anime in the world?

#### Importing Libraries

In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
from itertools import islice
import math
import os
from datetime import datetime
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import json
import string
import heapq
from IPython.display import display

# 1 - Data collection




## 1.1 Get the list of animes

We collect the anime's url contained in all the pages of the "Top Anime Series" list (383 pages).
We retrieve them using the html of the page and discovering where the urls are located.

In [None]:
# get urls from first page
html = urlopen("https://myanimelist.net/topanime.php")  # open link
soup = BeautifulSoup(html.read(), features="html5lib")  # read the html
# encoding because there are chars that are not readable so i'll ignore them
soup.prettify().encode(errors='ignore')
links = []
# search this class in h3 element (we saw that each url is contained in this class)
for link in soup.find_all("h3", {"class": "hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"}):
    a = link.find("a")  # find "a" element
    # get the url in href and append to the list
    links.append(a.get("href").encode('utf-8').decode('ascii', 'ignore'))
textfile = open("anime_url.txt", "w")  # open the file
for element in links:
    textfile.write(element + "\n")  # write each url

# do the same for the next pages
lim = 50
for i in range(382):
    links = []
    # we saw that the following pages follow this pattern
    html = urlopen("https://myanimelist.net/topanime.php?limit="+str(lim))
    soup = BeautifulSoup(html.read(), features="html5lib")
    for link in soup.find_all("h3", {"class": "hoverinfo_trigger fl-l fs14 fw-b anime_ranking_h3"}):
        a = link.find("a")
        links.append(a.get("href").encode('utf-8').decode('ascii', 'ignore'))
    for element in links:
        textfile.write(element + "\n")
    lim += 50

textfile.close()

After executing this code we have the file "anime_url.txt" that contains all the anime's url, one for each line.

## 1.2 Crawl animes


Using the urls in the file previously created, we download every anime's html and then store them in folders, following the organization in pages of the site (50 anime in each folder)

In [None]:
ini = 1  # value that corresponds to the number of line where start in "anime_url.txt" --> used to do it in parallel
for j in range(383):  # number that corresponds to the number of pages to do --> used to do it in parallel
    # set the path to use for the folder
    path = "Anime/page_"+str(math.floor(ini//50)+1)
    os.mkdir(path)  # create the folder
    with open('anime_url.txt') as f:
        # take 50 lines at time of the document (50 anime on each page)
        next_n_lines = list(islice(f, ini, ini+50))
        for i in range(50):
            r = requests.get(next_n_lines[i])  # get the url
            # create the file html and write inside the content of the page
            with open(path+"/article_"+str(ini+i+1)+".html", 'wb') as outfile:
                outfile.write(r.content)
        ini += 50  # update the index of the line

After this we will have a structure like this:

Anime
> /page_1


> > article_1.html

> > article_2.html

> > ...

> > article_50.html

> /page_2


> > article_51.html

> > article_52.html

> > ...

> > article_100.html

...








    

## 1.3 Parse downloaded pages

Now, we retrieve all the information we need, related to the anime, from each html.
We do this, again, using the html structure and the library BeautifulSoup and then we store all the information we got, in tsv files.

In [None]:
ini = 1 # initial value 
for i in range(383): # n of pages to parse
    page = "Anime/page_"+str(ini+i)
    first = ((ini+i-1)*50)+1 # every time get the number of the first article of the page
    for j in range(50): # 50 anime for page
        n_article = str(first+j)
        path = page + '/' + 'article_' + n_article + '.html'
        
        # Open anime page and extract html content
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
        soup = BeautifulSoup(content, "lxml")
        
        # Check for 404 error --> if we have it, we skip that html
        error = soup.find("div", attrs={"class":"error404"})
        if error:
            print('article_' + n_article + 'was skipped due to 404 error')
            continue
        
        # Title
        animeTitle = soup.find("h1", attrs ={"class":"title-name h1_bold_none"}).get_text(strip=True)
        
        ## Table information
        border = soup.find("td", attrs ={"class":"borderClass"})
        rawtype = border.find("h2", text = "Information").next_sibling.next_sibling.get_text(strip = True)
        # Anime Type
        animeType = rawtype.split(':')[-1]
        # Anime number episode
        rawepisode = border.find("span", attrs ={"class":"dark_text"}, text = "Episodes:").next_sibling.get_text(strip=True)
        try:
            animeNumEpisode = int(rawepisode)
        except:
            animeNumEpisode = ''
        # Release and End date
        aired = border.find("span", attrs ={"class":"dark_text"}, text = "Aired:").next_sibling.get_text(strip=True)
        start, end = aired.split('to')[0], aired.split('to')[-1]
        if len(start.strip()) == 4:
            releaseDate = datetime.strptime(start.strip(), '%Y').year
        else:
            try:
                releaseDate = datetime.strptime(start.strip(), '%b %d, %Y').date()
            except:
                releaseDate = ''
        if len(end.strip()) == 4:
            endDate = datetime.strptime(end.strip(), '%Y').year
        else:
            try:
                endDate = datetime.strptime(end.strip(), '%b %d, %Y').date()
            except:
                endDate = ''
        
        ## Upper bar
        header = soup.find("div", attrs={"class":"anime-detail-header-stats di-tc va-t"})
        # Anime score
        rawscore = header.find("div", attrs={"class":"fl-l score"}).get_text()
        try:
            animeScore = float(rawscore)
        except:
            animeScore = ''
        # Members
        members = header.find("span", attrs={"class":"numbers members"}).get_text().split()[1]
        rawmembers = members.replace(',', '')
        try:
            animeNumMembers = int(rawmembers)
        except:
            animeNumMembers = ''
        # Rank
        rank = header.find("span", attrs={"class":"numbers ranked"}).get_text().split()[1]
        rawrank = rank.replace('#', '')
        try:
            animeRank = int(rawrank)
        except:
            animeRank = ''
        # Anime Users
        users = header.find("div", attrs={"class":"fl-l score"})['data-user'].split()[0]
        rawusers = users.replace(',','')
        try:
            animeUsers = int(rawusers)
        except:
            animeUsers = ''
        # Popularity
        pop = header.find("span", attrs={"class":"numbers popularity"}).get_text().split()[1]
        rawpopularity = pop.replace('#', '')
        try:
            animePopularity = int(rawpopularity)
        except:
            animePopularity = ''
        
        # Synopsis (description)
        animeDescription = soup.find("p", attrs ={"itemprop":"description"}).get_text(strip=True)
        
        # Related Anime
        table = soup.find("table", attrs={"class":"anime_detail_related_anime"})
        animeRelated = []
        if table:
            for x in table.find_all("td", attrs={"class":"borderClass"}):
                for y in x.find_all("a"):
                    animeRelated.append(y.get_text(strip=True))
                    
        # Characters, voices and staff
        # each case: where we have all, some of them or no-one
        tables = soup.find_all("div", attrs={"class":"detail-characters-list clearfix"})
        if len(tables) == 2:
            table, table_staff = tables
            animeCharacters = []
            for x in table.find_all('td', attrs={"class":"borderClass", "valign":"top", "align":None}):
                char = x.get_text("/", strip=True).split('/')[0]
                if char:
                    animeCharacters.append(char)

            animeVoices = []
            for x in table.find_all('td', attrs={"class":"borderClass", "valign":"top", "align":"right"}):
                voice = x.get_text("/", strip=True).split('/')[0]
                if voice:
                    animeVoices.append(voice)

            animeStaff = []
            for x in table_staff.find_all('td', attrs={"class":"borderClass", "valign":"top", "width":None}):
                member = (x.get_text("/", strip=True).split('/'))
                animeStaff.append(member)
        elif len(tables) == 1:
            animeCharacters = ''
            animeVoices = ''
            animeStaff = []
            for x in tables[0].find_all('td', attrs={"class":"borderClass", "valign":"top", "width":None}):
                member = (x.get_text("/", strip=True).split('/'))
                animeStaff.append(member)
        else:
            animeCharacters = ''
            animeVoices = ''
            animeStaff = ''

        # Saving tsv file
        tsvname = 'anime_'+ n_article + '.tsv'
        path = 'File tsv/'+tsvname
        to_write = [animeTitle, animeType, animeNumEpisode, releaseDate, endDate,
                               animeNumMembers, animeScore, animeUsers, animeRank, animePopularity,
                               animeDescription, animeRelated, animeCharacters, animeVoices, animeStaff]
        with open(path, 'wt', encoding = 'utf-8') as out:
            tsv_writer = csv.writer(out, delimiter = '\t')
            tsv_writer.writerow(["animeTitle", "animeType", "animeNumEpisode", "releaseDate", "endDate",
                               "animeNumMembers", "animeScore", "animeUsers", "animeRank", "animePopularity",
                               "animeDescription", "animeRelated", "animeCharacters", "animeVoices", "animeStaff"])
            tsv_writer.writerow(to_write)

### In the following bunch of script, we put an **example of the execution** of each branch of the code on a single html to show how it works



*   How a tsv file created looks like



In [None]:
with open('anime_6551.tsv') as f:
    content = csv.reader(f, delimiter = '\t')
    for line in content:
        print(line)

['animeTitle', 'animeType', 'animeNumEpisode', 'releaseDate', 'endDate', 'animeNumMembers', 'animeScore', 'animeUsers', 'animeRank', 'animePopularity', 'animeDescription', 'animeRelated', 'animeCharacters', 'animeVoices', 'animeStaff']
[]
['Ashita e Free Kick', 'TV', '52', '1992-04-14', '1993-04-24', '1873', '6.42', '611', '6551', '9603', 'Shun Godai is a young boy who likes soccer. But his grandfather, a successful businessman, want his grandson to follow his path...', '[]', "['Godai, Shun', 'Mascowitz, Jose', 'Aritaka, Mizuho', 'Henderson, Carl', 'Toto', 'Morita, Mirei', 'Bazettini, Roberto', 'Randou, Shin', 'Green, Abe', 'Becken, Chris']", "['Kusao, Takeshi', 'Takagi, Wataru', 'Amano, Yuri', 'Ishino, Ryuuzou', 'Yamaguchi, Kappei', 'Orikasa, Ai', 'Sasaki, Nozomu', 'Koyasu, Takehito', 'Iwanaga, Tetsuya', 'Kosugi, Juurouta']", '[[\'Amino, Tetsuro\', \'Director\'], [\'Itou, Naoyuki\', \'Episode Director, Animation Director\'], [\'Kato, Takao\', \'Episode Director, Storyboard\'], ["D\'Av

In [None]:
with open("Anime/page_131/article_6519.html", "r", encoding="utf-8") as f:
    content = f.read()

In [None]:
soup = BeautifulSoup(content, "lxml")



*   Get the title



In [None]:
# Title
animeTitle = soup.find("h1", attrs ={"class":"title-name h1_bold_none"}).get_text(strip=True)
animeTitle

'Blue Archive'



*  Get the info from the table



In [None]:
## Table information
border = soup.find("td", attrs ={"class":"borderClass"})
rawtype = border.find("h2", text = "Information").next_sibling.next_sibling.get_text(strip = True)
# Anime Type
animeType = rawtype.split(':')[-1]
# Anime number episode
rawepisode = border.find("span", attrs ={"class":"dark_text"}, text = "Episodes:").next_sibling.get_text(strip=True)
try:
    animeNumEpisode = int(rawepisode)
except:
    animeNumEpisode = ''
# Release and End date
aired = border.find("span", attrs ={"class":"dark_text"}, text = "Aired:").next_sibling.get_text(strip=True)
start, end = aired.split('to')[0], aired.split('to')[-1]
if len(start.strip()) == 4:
  releaseDate = datetime.strptime(start.strip(), '%Y').year
else:
    try:
        releaseDate = datetime.strptime(start.strip(), '%b %d, %Y').date()
    except:
        releaseDate = ''
if len(end.strip()) == 4:
    endDate = datetime.strptime(end.strip(), '%Y').year
else:
    try:
        endDate = datetime.strptime(end.strip(), '%b %d, %Y').date()
    except:
        endDate = ''
print(animeType, animeNumEpisode, releaseDate, endDate)

ONA NaN 2021-01-25 NaN





*   Get info from the upper bar





In [None]:
# Upper bar
header = soup.find("div", attrs={"class":"anime-detail-header-stats di-tc va-t"})
animeScore = float(header.find("div", attrs={"class":"fl-l score"}).get_text())
members = header.find("span", attrs={"class":"numbers members"}).get_text().split()[1]
animeNumMembers = int(members.replace(',', ''))
rank = header.find("span", attrs={"class":"numbers ranked"}).get_text().split()[1]
animeRank = int(rank.replace('#', ''))
users = header.find("div", attrs={"class":"fl-l score"})['data-user'].split()[0]
animeUsers = int(users.replace(',',''))
pop = header.find("span", attrs={"class":"numbers popularity"}).get_text().split()[1]
animePopularity = int(pop.replace('#', ''))
print(animeScore, animeNumMembers, animeRank, animeUsers, animePopularity)

6.45 5698 6420 1788 6649




*   Get the synopsis



In [None]:
# Synopsis
animeDescription = soup.find("p", attrs ={"itemprop":"description"}).get_text(strip=True)
animeDescription

'Recap featuring fairies with new narration by Watashi.'



*   Get the related anime



In [None]:
# Related
table = soup.find("table", attrs={"class":"anime_detail_related_anime"})
animeRelated = []
if table:
    for x in table.find_all("td", attrs={"class":"borderClass"}):
        for y in x.find_all("a"):
            animeRelated.append(y.get_text(strip=True))
animeRelated

['Jinrui wa Suitai Shimashita']



*   Get characters, voices and staff



In [None]:
# Characters, voices and staff
table, table_staff = soup.find_all("div", attrs={"class":"detail-characters-list clearfix"})

animeCharacters = []
for x in table.find_all('td', attrs={"class":"borderClass", "valign":"top", "align":None}):
    char = x.get_text("/", strip=True).split('/')[0]
    if char:
        animeCharacters.append(char)
        
animeVoices = []
for x in table.find_all('td', attrs={"class":"borderClass", "valign":"top", "align":"right"}):
    voice = x.get_text("/", strip=True).split('/')[0]
    if voice:
        animeVoices.append(voice)
        
animeStaff = []
for x in table_staff.find_all('td', attrs={"class":"borderClass", "valign":"top", "width":None}):
    member = (x.get_text("/", strip=True).split('/'))
    animeStaff.append(member)
    
print(animeCharacters, animeVoices, animeStaff)

['Watashi', 'Yousei-san', 'Y', 'Joshu-san'] ['Nakahara, Mai'] [['Tanaka, Romeo', 'Original Creator'], ['Tobe, Sunaho', 'Original Character Design']]


# 2 - Search Engine

Before creating the search engines, we create a function that computes the pre-process on the text using the nltk library.



*   **Pre_processing_text:**



In [2]:
def pre_process_text(text):
    # Removing stopwords
    # The synopsis is tokenized (divided into words)
    text_tokens = word_tokenize(text)

    tokens_without_sw = [
        word for word in text_tokens if word not in stopwords.words()]

    # join the list of above words to create a sentence without stop words
    filtered_sentence = (" ").join(tokens_without_sw)
    # print(filtered_sentence)

    # Removing punctuation
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    text_without_punctuation = tokenizer.tokenize(filtered_sentence)
    text_without_punctuation2 = (" ").join(text_without_punctuation)
    # print(text_without_punctuation2)

    # Stemming
    porter = PorterStemmer()
    new_synopsis = []
    #new_synopsis = list(map(porter.stem, text_without_punctuation))
    for word in text_without_punctuation:
        w = porter.stem(word)
        new_synopsis.append(w)
    return new_synopsis

## 2.1 Conjuntive query

In this search engine we focus on the synopsis (anime description) and firstly we create a vocabulary that maps each word we find to an integer.
<br>
Secondly we create an inverted index dictionary that has the following structure: 

Key: index_word 

Value: array that contains all the anime where the word is in the description of that anime

### 2.1.1 Create your index

Function that creates the vocabulary and the inverted-index dictionary:

In [None]:
def create_voc_ind(start=1):
    vocabulary = {}
    dictionary = {}
    count = 0
    for i in range(19130):
        # anime 7242 and 15009 gave us a "page_not_found" so we skipped them while creating the tsv, so we do the same here
        if (start+i) == 7242 or (start+i) == 15009:
            continue
        # open the tsv of each anime to retrieve the information
        with open("File tsv/anime_"+str(start+i)+".tsv", encoding="utf-8") as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            l = []
            for row in rd:
                l.append(row)
            text = l[2][10]  # get the synopsis of the anime

        # use the function that do the preprocessing operations on the text
        new_synopsis = pre_process_text(text)

        # for each word in the description after the preprocess
        for word in new_synopsis:
            # if word is not in vocabulary, add it and assign it a value (incremental)
            if word not in vocabulary:
                vocabulary[word] = count
                count += 1
            # get the correspondent integer value in the vocabulary
            val = vocabulary[word]
            # if the value (word) is in the dictionary and the anime wasn't added yet, add it
            if val in dictionary and 'anime_'+str(start+i) not in dictionary[val]:
                dictionary[val].append('anime_'+str(start+i))
            # if it's not in the dictionary create the key and add a list that contain the anime as a value
            elif val not in dictionary:
                dictionary[val] = ['anime_'+str(start+i)]

    # Save vocabulary and dictionary (inverted index)
    with open('vocabulary.json', 'w', encoding='utf-8') as f:
        json.dump(vocabulary, f, ensure_ascii=False, indent=4)
    with open('dictionary.json', 'w', encoding='utf-8') as f:
        json.dump(dictionary, f, ensure_ascii=False, indent=4)

Executing this code we will have the files "vocabulary.json" and "dictionary.json" (our inverted index).

### 2.1.2 Execute the query

Firstly, we **load the vocabulary and dictionary** previously created

In [3]:
with open('vocabulary.json', 'r', encoding='utf-8') as f:
    vocabulary = json.load(f)
with open('dictionary.json', 'r', encoding='utf-8') as f:
    dictionary = json.load(f)

Secondly, we create the function that **gets the query and elaborates it** (preprocessing and translation from word to index)

In [4]:
def get_query(vocabulary):
    # ask the user for the query
    query = input('What are you looking for?')
    # preprocess the query
    query = pre_process_text(query)
    
    for i in range(len(query)):
        # check if word in query is present in vocabulary
        if query[i] not in vocabulary:
            print("the word "+query[i]+" is not present in documents")
            return
        # translate query using vocabulary (get the correspondent index)
        else:
            query[i] = str(vocabulary[query[i]])
    return query

Also we create the function that **displays the results** we get from the search engine

In [5]:
def display_res(results):
    print("Results found: ", len(results))
    to_display=[]
    for anime in results:
        path = "File tsv/"+anime+".tsv"
        
        # access the corresponding tsv file to take out title and description
        df = pd.read_csv(path, sep='\t')
        title = df.animeTitle.item()
        descr = df.animeDescription.item()
        
        # open the file with urls to get the url of the anime
        f = open('anime_url.txt')
        lines = f.readlines()
        f.close()
        url = lines[int(anime[6:])-1]
        
        # put it together
        to_display.append([title, descr, url])
    display(pd.DataFrame(to_display, columns = ['Anime title', 'Anime description', 'Url']))

Finally, we can create our effective **search engine** using the functions above

In [6]:
def search():
    # getting the query and preprocessing it
    query = get_query(vocabulary)
    if query:
        possible = []
        for i in range(len(query)):
            # find the documents where the word (index) is, using the inverted index
            possible.append(set(dictionary[str(query[i])]))
        # get the documents where there are all the words in the query doing the intersection
        result = set.intersection(*possible)
        if result:
        # return information for the results
           display_res(result)
        else:
            print('No matches found')

Now we can test it using the query suggested "saiyan race" and see what our search engine returns as a result

In [7]:
search()

What are you looking for? saiyan race


Results found:  4


Unnamed: 0,Anime title,Anime description,Url
0,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...
1,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n
2,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...
3,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...


As we can see we obtain 4 results, and it give us, for each result: 


*   Title
*   Description
*   Url



# 2.2 Conjunctive query & Ranking score

In this search engine we still focus on the synopsis (anime description) but creating a ranking score using the tfidf.
<br>
We use the vocabulary previously created but this time we will build an inverted-index having the following structure:

Key: index_word

Value: array that contains all the anime where the word is in the description of that anime and the correspondent tfidf score for each of them

### 2.2.1 Create your index

Function that creates the inverted index dictionary and computes the tfidf in it

In [None]:
def create_inv_ind_tfidf(vocabulary, dictionary, start = 1):
    n_anime = 19130
    dictionary_tfidf = {}
    for i in range(n_anime):
        # skipping broken pages
        if (start+i) == 7242 or (start+i) == 15009:
            continue
            
        with open("File tsv/anime_"+str(start+i)+".tsv", encoding="utf-8") as fd:
            rd = csv.reader(fd, delimiter="\t", quotechar='"')
            l = []
            for row in rd:
                l.append(row)
            text = l[2][10]  # the synopsis of the anime
            # use the function that do the preprocessing operations on the synopsis
            text = pre_process_text(text)

        word_text = len(text)
        
        # TF
        for word in text:
            # get the id of the word
            word_id = str(vocabulary[word])
            if word_id in dictionary_tfidf:
                if dictionary_tfidf[word_id][-1][0] != 'anime_'+str(start+i):
                    dictionary_tfidf[word_id].append(
                        ['anime_'+str(ini+i), 1, word_text])
                else:
                    dictionary_tfidf[word_id][-1][1] += 1
            # if it's not in the dictionary create the key and add a list that contain the anime as a value
            else:
                dictionary_tfidf[word_id] = [['anime_'+str(start+i), 1, word_text]]

    # For each word, compute the tfidf related to each document where it appears
    inv_indx_tfidf = {}
    for word in dictionary_tfidf:
        inv_indx_tfidf[word] = []
        for doc in dictionary_tfidf[word]:
            inv_indx_tfidf[word].append((
                doc[0], doc[1]/doc[2]*np.log10(n_anime/len(dictionary[word]))))

    with open('inv_ind_TFIDF.json', 'w', encoding='utf-8') as f:
        json.dump(inv_indx_tfidf, f, ensure_ascii=False, indent=4)

Executing this code we will have the file "inv_ind_TFIDF.json" that contains the inverted index with tfidf scores

### 2.2.2 Execute the query

As before, we need to load the index we have just created

In [8]:
with open('inv_ind_TFIDF.json', 'r', encoding='utf-8') as f:
    inv_ind_tfidf = json.load(f)

Firstly we need the function that **calculates the norm of a document**

In [9]:
def norm_doc(docname, inv_ind, vocabulary):
    doc_vector = []
    # accessing the anime description
    with open('File tsv/'+docname+'.tsv', encoding = 'utf-8') as f:
        rd = csv.reader(f, delimiter="\t", quotechar='"')
        l=[]
        for row in rd:
            l.append(row) 
        text=l[2][10]
    new_text = pre_process_text(text)
    new_text = [str(vocabulary[word]) for word in new_text]
    # extracting the tf-idfs of all the words in the document
    for w in new_text:
        for el in inv_ind[w]:
            if el[0] == docname:
                doc_vector.append(el[1])
    doc_vector = np.array(doc_vector)
    return np.linalg.norm(doc_vector,2)

Then we need to **create a vector for the query** that we can use later to compute the similarity

In [10]:
def vectorize_query(query, inv_ind, vocabulary):
    q_vector = []
    for word in query:
        idf = np.log(19130/len(inv_ind[word]))
        q_vector.append(idf)
    return np.array(q_vector)

We also need a function that **computes the cosine_similarity**

In [11]:
def cosine_similarity(docname, query, inv_ind, vocabulary):
    # compute the norm of the vectorized document
    norm_d = norm_doc(docname, inv_ind, vocabulary)
    # vectorize the query
    vector_q = vectorize_query(query, inv_ind, vocabulary)
    vector_d = []
    # vectorize the document according to the query
    for w in query:
        for el in inv_ind[w]:
            if el[0] == docname:
                vector_d.append(el[1])
    vector_d = np.array(vector_d)
    # compute the norm of the vectorized query
    norm_q = np.linalg.norm(vector_q)
    # compute the cosine similarity
    return np.dot(vector_q,vector_d)/(norm_d*norm_q)

Finally, also in this case, we create a function that **displays the results** obtained from the search engine

In [12]:
def display_res_score(results):
    print("Top", len(results), 'documents')
    to_display=[]
    for score, anime in results:
        # accessing the .tsv file corresponding to the anime
        path = "File tsv/"+anime+".tsv"
        df = pd.read_csv(path, sep='\t')
        # retrieving anime title and description
        title = df.animeTitle.item()
        descr = df.animeDescription.item()
        # retrieving the url of the anime
        f = open('anime_url.txt')
        lines = f.readlines()
        f.close()
        url = lines[int(anime[6:])-1]
        # putting it all together
        to_display.append([title, descr, url, round(score, 2)])
    display(pd.DataFrame(to_display, columns = ['Anime title', 'Anime description', 'Url', 'Score']))

Now we can create our **search engine**

In [13]:
def search2():
    query= get_query(vocabulary)
    if query:
        possible = []
        for i in range(len(query)):
            # find the documents where the word (index) is, using the inverted index
            possible.append(set(dictionary[query[i]]))
        # get the documents where there are all the words in the query doing the intersection
        result = set.intersection(*possible)
        if result:
        # initialize the heap for storing scores
            heap = list()
            heapq.heapify(heap)
            # compute cosine similarity for all matching documents
            for r in result:
                similarity = cosine_similarity(r, query, inv_ind_tfidf, vocabulary)
                heapq.heappush(heap, (similarity, r))
            # retrieve top 5 documents from the heap
            top_5 = heapq.nlargest(5, heap)
            display_res_score(top_5)
        else:
            print('No matches found')

As we did before, we **try it out** using the same query "saiyan race"

In [14]:
search2()

What are you looking for? saiyan race


Top 4 documents


Unnamed: 0,Anime title,Anime description,Url,Score
0,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...,0.31
1,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...,0.1
2,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n,0.07
3,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...,0.06


As we can see, we have as a results the same anime obtained before but ordered by the similarity score 

# 3 - Define a new score

We are going to use the cosine similarity we have defined in the previous exercise as a starting point for our new score. At the same time, now we will also take into consideration:
- the <b>title</b> of the anime: each of the matched documents will receive a bonus proportional to the number of words in the query that can also be found in the title $\rightarrow$ <b>'boosttitle'</b> function
- the <b>ranking</b> and the <b>popularity</b>: the more popular and appreciated the anime, the more likely it is that a user could be looking for it, so we are going to give the matched documents a boost based on their performance in rankings and popularity $\rightarrow$ <b>'boostrank'</b> and <b>'boostpopularity'</b> functions
- the <b>type</b> of the anime: we are going to give the possibility to the user to input an additional query specifying the type of anime they are looking for. This makes sense if we think about the fact that many animes have different versions/remakes, and one could be looking for a specific version of their favorite anime $\rightarrow$ <b>'matchtype'</b> function 
- the <b>date</b> the anime was released: just like before, the user will be able to specify a range of dates (in the format YYYY-YYYY) that roughly correspond to the time the anime he's looking for was released $\rightarrow$ <b>'boostdate'</b> function
<br><br>
<b>Remark n.1</b>: as specified in the assignment, the additional queries <b>do not</b> filter the results. Instead, if they are present, results that match them will simply receive a bonus in their final score.
<br><br>
The <b>'boostscore'</b> function creates a new score that will look like this:
<br><br>
<div align = center>$\frac{cosine\ similarity + potential\ bonuses}{3.4}$</div>
<br>
where 3.4 is the maximum possible score (1 for the cosine similarity and 2.4 for our bonuses).
<br><br>
<b>Remark n.2</b>: dividing for the maximum score does not only allow us to make direct confrontations with the raw cosine similarities used for Exercise 2, but is also a way of 'penalising' results that perform poorly with respect to the additional bonuses. In this way, the scoring function is actually more balanced because it does not only boost scores but it can also bring them down.

In [15]:
def boosttitle(query, title):
    count = 0
    # removing punctuation and capital letters
    title = title.lower()
    title = title.translate(str.maketrans('', '', string.punctuation))
    title = title.split()
    # translating title 
    for i in range(len(title)):
        if title[i] in vocabulary:
            title[i] = str(vocabulary[title[i]])
    # counting matches and computing bonus (maximum value is 1)       
    for w in query:
        if w in title:
            count += 1
    res = count/len(title)
    return res

In [16]:
def boostpopularity(pop):
    res = 0
    if pop < 50:
        res += 0.4
    elif pop < 100:
        res += 0.3
    elif pop < 1000:
        res += 0.2
    elif pop < 5000:
        res += 0.1
    elif pop < 10000:
        res += 0.05
    return res

In [17]:
def boostrank(rank):
    res = 0
    if rank < 50:
        res += 0.4
    elif rank < 100:
        res += 0.3
    elif rank < 1000:
        res += 0.2
    elif rank < 5000:
        res += 0.1
    elif rank < 10000:
        res += 0.05
    return res

In [18]:
def matchtype(animetype, typequery):
    res = 0
    if typequery:
        # checking if the type of the anime matches with the specified query
        if typequery == animetype:
            res += 0.4
    return res

In [19]:
def time_in_range(start, end, x):
    # checking if date x is in the specified range or not
    if start <= end:
        return start <= x <= end
    else:
        return start <= x or x <= end
    
def boostdate(animedate, datequery):
    res = 0
    if datequery:
        # convert to datetime
        datequery = datequery.split('-')
        start = datetime(int(datequery[0]),1,1)
        end = datetime(int(datequery[1]),1,1)
        if animedate:
            try:
                year = datetime.strptime(animedate, '%Y-%m-%d').year
                year = datetime(year, 1,1)
            except:
                year = datetime(animedate,1,1)
            # checking and adding bonus
            if time_in_range(start,end,year):
                res += 0.2
    return res

In [20]:
def boostscore(anime, cosine_similarity, query, typequery, datequery):
    # accessing  and extracting information about the anime
    with open('File tsv/'+str(anime)+'.tsv', encoding = 'utf-8') as f:
        df = pd.read_csv(f, delimiter='\t')
    title = df.animeTitle.item()
    rank = df.animeRank.item()
    animetype = df.animeType.item()
    animedate = df.releaseDate.item()
    popularity = df.animePopularity.item()
    # computing all the bonuses for the anime
    bonuses = boosttitle(query,title) + boostrank(rank) + matchtype(animetype, typequery) + boostdate(animedate, datequery) + boostpopularity(popularity)
    # computing final score after bonuses
    return (bonuses + cosine_similarity)/3.4

In [21]:
def get_new_query(vocabulary):
    # ask the user for the main and the additional queries
    query = input('What are you looking for?')
    datequery = input('From which period is it from?')
    typequery = input('What type of anime is it?')
    # preprocess the query
    query = pre_process_text(query)
    
    for i in range(len(query)):
        # check if query matches
        if query[i] not in vocabulary:
            print('No matches found')
            return
        # translate query using vocabulary
        else:
            query[i] = str(vocabulary[query[i]])
    return query, datequery, typequery

In [22]:
def search3():
    query, datequery, typequery = get_new_query(vocabulary)
    if query:
        possible = []
        for i in range(len(query)):
            # find the documents where the word (index) is, using the inverted index
            possible.append(set(dictionary[query[i]]))
        # get the documents where there are all the words in the query doing the intersection
        result = set.intersection(*possible)
        if result:
        # initialize the heap for storing scores
            heap = list()
            heapq.heapify(heap)
            # compute the new score for all matching documents
            for r in result:
                similarity = cosine_similarity(r, query, inv_ind_tfidf, vocabulary)
                new_score = boostscore(r, similarity, query, typequery, datequery)
                heapq.heappush(heap, (new_score, r))
            # retrieve top 5 documents from the heap
            top_5 = heapq.nlargest(5, heap)
            display_res_score(top_5)
        else:
            print('No matches found')

Let's try our new score out with our usual query.

In [23]:
search3()

What are you looking for? saiyan race
From which period is it from? 1985-1995
What type of anime is it? TV


Top 4 documents


Unnamed: 0,Anime title,Anime description,Url,Score
0,Dragon Ball Z,Five years after winning the World Martial Art...,https://myanimelist.net/anime/813/Dragon_Ball_Z\n,0.34
1,Dragon Ball Kai,"Five years after the events of Dragon Ball, ma...",https://myanimelist.net/anime/6033/Dragon_Ball...,0.22
2,Dragon Ball Z Special 1: Tatta Hitori no Saish...,"Bardock, Son Goku's father, is a low-ranking S...",https://myanimelist.net/anime/986/Dragon_Ball_...,0.21
3,Dragon Ball Super: Broly,"Forty-one years ago on Planet Vegeta, home of ...",https://myanimelist.net/anime/36946/Dragon_Bal...,0.15


As we see, results our now more accurate with respect to the user query: the best result here is a more popular anime that matches both the type and the range of dates that was specified. We can see that the previous best result is now in third place, with a lower score: as we mentioned before, it must have gotten penalised by poor performance in terms of popularity, rank and type mismatch with respect to the query. At the same time, the second result matches the type specified in the query but not the dates. This is a matter of how bonuses are actually weighted: since dates are a bit more hard to remember correctly, we think it makes sense in this particular case to give more importance to the type of the anime.


# 5 - Algorithmic question

### 1) Write an algorithm that computes the acceptable solution with the longest possible duration

This function computes the longest possible duration and call the function below

In [None]:
# Function that finds the longest duration of appointment respecting the constraint
def best_plan_appointment(x):
    # create a new list of the same dimension setting the first and second elements
    t = [0 for _ in range(len(x))]
    t[0] = x[0]
    t[1] = x[1]
    for i in range(2, len(x)):
        # find at each position what is the maximum value we can get respecting the constraint
        t[i] = max(x[i], max(t[:i-1])+x[i])

    return max(t), find_values_subseq(x, t, max(t))


### 2) Implement a program that given in input an instance in the form given above, gives the optimal solution

This function receives in input what we obtained before and find out what are the appointment to choose to get the best one found

In [None]:
# Function used to find the value that give us the best solution
def find_values_subseq(x, lista, best):
    # both the lists are reversed
    lista.reverse()
    x.reverse()
    ris = []
    #when i find that value i subtract it and i know that this is one of the values of the solution
    for i in range(len(lista)):
        if lista[i] == best:
            best -= x[i]
            ris.append(x[i])
    ris.reverse()
    return ris

We can try them out

In [None]:
print(best_plan_appointment([30, 40, 25, 50, 30, 20]))

(110, [40, 50, 20])
