# **Homework 3 - Places of the world**

In [1]:
#import libraries
import heapq
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from os import listdir
from os.path import isfile, join
import os
import shutil
import urllib
from datetime import datetime
import csv
from nltk.corpus import stopwords
import string
from nltk import word_tokenize
import nltk
from collections import OrderedDict
import json
from tqdm import tqdm

# **1. Data collection**
For this homework, there is no provided dataset. Instead, you have to build your own. Your search engine will run on text documents. So, here we detail the procedure to follow for the data collection.

**1.1. Get the list of places**
We start with the list of places to include in your corpus of documents. In particular, we focus on the Most popular places. Next, we want you to collect the URL associated with each site in the list from this list. The list is long and split into many pages. Therefore, we ask you to retrieve only the URLs of the places listed in the first 400 pages (each page has 18 places, so that you will end up with 7200 unique place URLs).

The output of this step is a .txt file whose single line corresponds to the place's URL.

In [None]:
url = 'https://www.atlasobscura.com/places?page={}&sort=likes_count'
result = requests.get(url)

print(result)
print(result.text) #getting the txt file

In [None]:
soup = BeautifulSoup(result.text)

In [None]:
mydivs = soup.find_all("a", {"class": "content-card content-card-place"})
'https://www.atlasobscura.com' + mydivs[1]['href']

In [None]:
with open('places_url.txt', 'w') as f:
    for i in range(1,401):
        url = 'https://www.atlasobscura.com/places?page={}&sort=likes_count'.format(i)
        result = requests.get(url)
        soup = BeautifulSoup(result.text)
        mydivs = soup.find_all("a", {"class": "content-card content-card-place"})
        for anchor in mydivs:
            f.write('https://www.atlasobscura.com' + anchor['href'] + "\n")
    f.close();
#getting urls of each line

**1.2. Crawl places**

Once you get all the URLs in the first 400 pages of the list, you:

Download the HTML corresponding to each of the collected URLs.
After you collect a single page, immediately save its HTML in a file. In this way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
Organize the entire set of downloaded HTML pages into folders. Each folder will contain the HTML of the places on page 1, page 2, ... of the list of locations.

Tip: Due to a large number of pages you should download, you can use some methods that can help you shorten the time it takes. If you employed a particular process or approach, kindly describe it.

In [None]:
def saveHtmlFiles():
    notDownloadedUrls = []
    with open('places_url.txt') as file:
        for url in file:
            try:
                urllib.request.urlretrieve(url,url.split('/')[-1].replace('\n','')+'.html')
            except Exception as e:
                notDownloadedUrls.append(url)
                continue
    file.close()
    return notDownloadedUrls
#Downloaded the HTML corresponding to each of the collected URLs

In [None]:
notDownloadedUrls = saveHtmlFiles()

while notDownloadedUrls:
    with open('places_url.txt', 'w') as f:
        for url in notDownloadedUrls:
            f.write(url + "\n")
    f.close();
    notDownloadedUrls = saveHtmlFiles() #saving HTML in a file

**1.3 Parse downloaded pages**

At this point, you should have all the HTML documents about the places of interest, and you can start to extract the places' information. The list of the information we desire for each place and their format is as follows:

In [None]:
file_path = os.getcwd()
files = []
with open('places_url.txt') as file:
    for url in file:
        files.append(url.split('/')[-1].replace('\n','')+'.html')
    file.close()
file_list = np.array_split(files,400)


In [None]:
for i in range(400):
    directory = 'page_'+ str(i+1)

    target_dir = os.path.join(file_path, directory)

    os.mkdir(target_dir)

    for file_name in file_list[i]:
        shutil.move(os.path.join(file_path, file_name), target_dir)

In [3]:
headerList = ['placeName','placeTags','numPeopleVisited','numPeopleWant','placeDesc','placeShortDesc','placesNearby','placeAddress','placeAlt','placeLong','placeEditors','placePubDate','placeRelatedLists','placeRelatedPlaces','placeUrl']
for i in range(1,401):
    dir = os.getcwd() + '\page_{}'.format(i)
    files = [f for f in listdir(dir) if isfile(join(dir, f)) and f.lower().endswith(('.html'))]
    for file in files:
        with open(os.path.join(dir,file), 'rb+') as fp:
            soup = BeautifulSoup(fp, "html.parser")

            placeName = '' if soup.select('h1')[0].text.strip() is None else soup.select('h1')[0].text.strip()

            placeTags = '' if soup.find('div', {'class': 'item-tags'}) is None else [a.text.strip() for a in soup.find('div', {'class': 'item-tags'}).select('a', {'class': 'itemTags__link'})]

            numPeopleVisited = ('' if soup.find('aside', {'class': 'DDPage__item-actions'}) is None else soup.find('aside', {'class': 'DDPage__item-actions'}).select('div', {'class': 'title-md item-action-count'})[3].text.strip())

            numPeopleWant = ('' if soup.find('aside', {'class': 'DDPage__item-actions'}) is None else soup.find('aside', {'class': 'DDPage__item-actions'}).select('div', {'class': 'title-md item-action-count'})[4].text.strip())

            numPeopleWant = '' if numPeopleWant == '' else int(''.join(filter(str.isdigit, numPeopleWant)))

            placeDesc = '' if soup.find('div', {'id': 'place-body'}) is None else soup.find('div', {'id': 'place-body'}).findNext().text.strip()

            placeShortDesc = '' if soup.find('h3', {'class': 'DDPage__header-dek'}) is None else soup.find('h3', {'class': 'DDPage__header-dek'}).text.strip()

            placesNearby = '' if soup.findAll('div', {'class': 'DDPageSiderailRecirc__item-title'}) is None else set([a.text.strip() for a  in soup.findAll('div', {'class': 'DDPageSiderailRecirc__item-title'})])

            placeAddress = '' if soup.find('address', {'class': 'DDPageSiderail__address'}) is None else soup.find('address', {'class': 'DDPageSiderail__address'}).text.strip()

            placeAlt = '' if soup.find('div', {'class': 'DDPageSiderail__coordinates'}) is None else soup.find('div', {'class': 'DDPageSiderail__coordinates'}).text.strip().split(',')[0].strip()

            placeLong = '' if soup.find('div', {'class': 'DDPageSiderail__coordinates'}) is None else soup.find('div', {'class': 'DDPageSiderail__coordinates'}).text.strip().split(',')[1].strip()

            placeEditors = '' if soup.findAll('a', {'class': 'DDPContributorsList__contributor'}) is None else [ a.text.strip() for a in soup.findAll('a', {'class': 'DDPContributorsList__contributor'})]

            placePubDate = '' if soup.find('div', {'class': 'DDPContributor__name'}) is None else datetime.strptime(soup.find('div', {'class': 'DDPContributor__name'}).text,'%B %d, %Y').date()

            placeRelatedLists = '' if soup.find('div', {'class': 'CardRecircSection__title'}, text='Related Places') is None else ([a.findNext('span').text.strip() for a in soup.find('div', {'class': 'CardRecircSection__title'}, text='Related Places').findNext('div', {'class': 'CardRecircSection__card-grid'}).findAll('a')])

            placeRelatedPlaces = '' if soup.find('div', {'class': 'CardRecircSection__title'}, text='Appears in') is None else ([a.findNext('span').text.strip() for a in soup.find('div', {'class': 'CardRecircSection__title'}, text='Appears in').findNext('div', {'class': 'CardRecircSection__card-grid'}).findAll('a')])

            placeUrl = '' if soup.find('div', {'class': 'DDPageSiderail__website'}) is None else soup.find('div', {'class': 'DDPageSiderail__website'}).find('a')['href'].strip()

        fp.close()

        valuesList = [placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placesNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedLists,placeRelatedPlaces,placeUrl]
        with open(os.path.join(dir, file.title().replace('.Html','').lower() +'.tsv'), 'w+', encoding="utf-8") as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(headerList)
            writer.writerow(valuesList)
        tsvfile.close()

In [4]:
for i in range(1,401):
    dir = os.getcwd() + '\page_{}'.format(i)
    target_dir = os.getcwd() + '/tsv_files'
    files = [f for f in listdir(dir) if isfile(join(dir, f)) and f.lower().endswith(('.tsv'))]
    for file in files:
        shutil.move(os.path.join(dir, file.title().lower()), target_dir)

In [7]:
tsv_dir = os.getcwd() + '/tsv_files'
files = [f for f in listdir(tsv_dir) if isfile(join(tsv_dir, f)) and f.lower().endswith(('.tsv'))]
dataframes = []
for file in files:
    df = pd.read_csv(os.path.join(tsv_dir, file.title().lower()), delimiter="\t")
    dataframes.append(df)
df = pd.DataFrame(pd.concat(dataframes)).reset_index().drop('index', axis=1)
df_original = df.copy()
df

Unnamed: 0,placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placesNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedLists,placeRelatedPlaces,placeUrl
0,109 East Palace,"['manhattan project', 'secret', 'nuclear', 'sc...",419,472,When you need to be dropped off at a top-secre...,This innocuous New Mexico storefront was once ...,"{'Palace of the Governors', 'Spitz Clock', 'La...","109 East Palace Santa Fe, New Mexico, 87501Uni...",35.6875,-105.9372,"['michaelksugar', 'tylercole', 'Collector of E...",2014-03-31,"['Camp Century (Project Iceworm)', 'Red Gate W...",,
1,145 Rue Lafayette,"['urban planning', 'cities', 'transportation',...",163,413,"While walking down the street, you’d be forgiv...",What looks like a normal building is really a ...,"{'Le Louxor Palais du Cinema', 'Sape & Co', ""W...",145 Rue la FayetteParisFrance,48.8792,2.3562,"['EVA', 'carllenox', 'erjeffery', 'SEANETTA']",2018-06-15,"[""Vauxhall Bridge's Miniature St. Paul's Cathe...",,
2,17 Room Ruin,"['abandoned houses', 'native americans', 'aban...",161,1444,"Outside Bluff, Utah, a massive 100-foot-deep a...",A well-preserved Ancestral Puebloan ruin tucke...,"{'House on Fire Ruin', 'Forrest Gump Point', '...","Bluff, UtahUnited States",37.2748,-109.5102,"['JWill', 'Molly McBride Jacobson', 'djm213', ...",2017-05-10,"['Keller House', 'Cow Springs Trading Post', '...",,
3,1890s Alien Gravesite,"['aliens', 'graves', 'gravestones', 'cemeteries']",249,1194,"While it receives no fanfare today, the small ...",This small town Texas cemetery is said to be t...,"{'Chef Point Bar & Restaurant', 'Billy Bob’s T...","507 Cemetery RdAurora, Texas, 76078United States",33.0534,-97.5000,"['EricGrundhauser', 'Molly McBride Jacobson', ...",2014-11-21,"['Betty and Barney Hill Graves', 'The Menster ...",,
4,University of Virginia’s Hidden Chemical Hearth,"['thomas jefferson', 'unesco', 'hidden', 'scie...",130,436,Nestled in the ground floor of a UNESCO World ...,Hidden for 165 years inside a building designe...,"{""University of Virginia's Seven Society"", 'Th...","1721 University AveCharlottesville, Virginia, ...",38.0357,-78.5033,"['Collin', 'blimpcaptain']",2016-05-09,"['Kamerlingh Onnes Laboratory Plaque', ""Univer...",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,Zorthian Ranch,"['film locations', 'farms', 'outsider art', 'c...",75,468,California’s Zorthian Ranch is a strange compe...,The cobbled together compound of a deceased ar...,"{'The Bunny Museum', 'Cobb Estate', 'Mountain ...","4010 Fair Oaks AveAltadena, California, 91001U...",34.2111,-118.1404,"['ashleypinnick', 'Rachel', 'Martin', 'MagnumP...",2013-08-27,"['Milwaukee Art Museum', 'Rocca Sinibalda Cast...",,http://www.zorthianranch.com/
7196,Zuccari Palace,"['castles', 'architectural oddities', 'archite...",432,1348,"In Rome there are thousands of churches, old b...",Architectural monsters are devouring this pala...,{'The Head of St. John the Baptist at San Silv...,"34 Via GregorianaRome, 00187Italy",41.9051,12.4844,"['pboduch', 'beefjorky', 'diffendale', 'Collec...",2014-03-06,"['Johnstone Castle', 'Castillo Ortega-Douglas'...",,
7197,Zwack Unicum Museum,"['food museums', 'food', 'museums and collecti...",201,418,Sometimes referred to as the Hungarian nationa...,Central Europe's largest collection of mini-bo...,"{'Little Nemecsek Statue', 'Gömböc', 'Paul Str...","1 Dandár u.Budapest, 1095Hungary",47.4759,19.0697,"['Shotsy', 'Molly McBride Jacobson', 'reley250...",2014-10-14,"['Baked Bean Museum of Excellence', 'National ...",,http://www.zwack.hu/en/zwack-muzeumok/zwack-mu...
7198,The Zymoglyphic Museum,"['obscura day locations', 'wonder cabinets', '...",219,1982,The Zymoglyphic Museum houses the cabinet of c...,Private collection of art inspired by cabinets...,"{'Morrison Street Minigallery', 'Lincoln Stree...","6225 SE Alder St.Portland, Oregon, 97215United...",45.5177,-122.5998,"['michelle', 'Martin', 'wythe', 'medhere', 'zy...",2013-06-13,"['Ilana Goor Museum', 'Milwaukee Art Museum', ...",,http://www.zymoglyphic.org


# **2. Search Engine**

Now, we want to create two different Search Engines that, given as input a query, return the places that match the query.

First, you must pre-process all the information collected for each place by:

- Removing stopwords
- Removing punctuation
- Stemming
- Anything else you think it's needed

For this purpose, you can use the nltk library.

In [8]:
# remove punctuation, stopwords and splitting in tokens all the columns

def remove_empty(x):
    if type(x) is str:
        x = x.split(",")
        x = [ y for y in x if y.strip()]
        return ",".join(x)
    elif type(x) is list:
        return [ y for y in x if y.strip()]

stop_words = stopwords.words('english') + list(string.punctuation)

df['placeName'] = df['placeName'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeTags'] = df['placeTags'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeDesc'] = df['placeDesc'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeShortDesc'] = df['placeShortDesc'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placesNearby'] = df['placesNearby'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeAddress'] = df['placeAddress'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeEditors'] = df['placeEditors'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeRelatedLists'] = df['placeRelatedLists'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeRelatedPlaces'] = df['placeRelatedPlaces'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df['placeUrl'] = df['placeUrl'].apply(lambda x: [word for word in word_tokenize(str(x)) if word not in stop_words]).apply(remove_empty)
df

Unnamed: 0,placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placesNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedLists,placeRelatedPlaces,placeUrl
0,"[109, East, Palace]","['manhattan, project, 'secret, 'nuclear, 'scie...",419,472,"[When, need, dropped, top-secret, research, fa...","[This, innocuous, New, Mexico, storefront, sec...","['Palace, Governors, 'Spitz, Clock, 'La, Conqu...","[109, East, Palace, Santa, Fe, New, Mexico, 87...",35.6875,-105.9372,"['michaelksugar, 'tylercole, 'Collector, Exper...",2014-03-31,"['Camp, Century, Project, Iceworm, 'Red, Gate,...",[nan],[nan]
1,"[145, Rue, Lafayette]","['urban, planning, 'cities, 'transportation, '...",163,413,"[While, walking, street, ’, forgiven, thinking...","[What, looks, like, normal, building, really, ...","['Le, Louxor, Palais, du, Cinema, 'Sape, Co, `...","[145, Rue, la, FayetteParisFrance]",48.8792,2.3562,"['EVA, 'carllenox, 'erjeffery, 'SEANETTA]",2018-06-15,"[``, Vauxhall, Bridge, 's, Miniature, St., Pau...",[nan],[nan]
2,"[17, Room, Ruin]","['abandoned, houses, 'native, americans, 'aban...",161,1444,"[Outside, Bluff, Utah, massive, 100-foot-deep,...","[A, well-preserved, Ancestral, Puebloan, ruin,...","['House, Fire, Ruin, 'Forrest, Gump, Point, 'F...","[Bluff, UtahUnited, States]",37.2748,-109.5102,"['JWill, 'Molly, McBride, Jacobson, 'djm213, '...",2017-05-10,"['Keller, House, 'Cow, Springs, Trading, Post,...",[nan],[nan]
3,"[1890s, Alien, Gravesite]","['aliens, 'graves, 'gravestones, 'cemeteries]",249,1194,"[While, receives, fanfare, today, small, town,...","[This, small, town, Texas, cemetery, said, bur...","['Chef, Point, Bar, Restaurant, 'Billy, Bob, ’...","[507, Cemetery, RdAurora, Texas, 76078United, ...",33.0534,-97.5000,"['EricGrundhauser, 'Molly, McBride, Jacobson, ...",2014-11-21,"['Betty, Barney, Hill, Graves, 'The, Menster, ...",[nan],[nan]
4,"[University, Virginia, ’, Hidden, Chemical, He...","['thomas, jefferson, 'unesco, 'hidden, 'science]",130,436,"[Nestled, ground, floor, UNESCO, World, Herita...","[Hidden, 165, years, inside, building, designe...","[``, University, Virginia, 's, Seven, Society,...","[1721, University, AveCharlottesville, Virgini...",38.0357,-78.5033,"['Collin, 'blimpcaptain]",2016-05-09,"['Kamerlingh, Onnes, Laboratory, Plaque, ``, U...",[nan],[nan]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7195,"[Zorthian, Ranch]","['film, locations, 'farms, 'outsider, art, 'co...",75,468,"[California, ’, Zorthian, Ranch, strange, comp...","[The, cobbled, together, compound, deceased, a...","['The, Bunny, Museum, 'Cobb, Estate, 'Mountain...","[4010, Fair, Oaks, AveAltadena, California, 91...",34.2111,-118.1404,"['ashleypinnick, 'Rachel, 'Martin, 'MagnumPI, ...",2013-08-27,"['Milwaukee, Art, Museum, 'Rocca, Sinibalda, C...",[nan],"[http, //www.zorthianranch.com/]"
7196,"[Zuccari, Palace]","['castles, 'architectural, oddities, 'architec...",432,1348,"[In, Rome, thousands, churches, old, buildings...","[Architectural, monsters, devouring, palace, '...","['The, Head, St., John, Baptist, San, Silvestr...","[34, Via, GregorianaRome, 00187Italy]",41.9051,12.4844,"['pboduch, 'beefjorky, 'diffendale, 'Collector...",2014-03-06,"['Johnstone, Castle, 'Castillo, Ortega-Douglas...",[nan],[nan]
7197,"[Zwack, Unicum, Museum]","['food, museums, 'food, 'museums, collections,...",201,418,"[Sometimes, referred, Hungarian, national, spi...","[Central, Europe, 's, largest, collection, min...","['Little, Nemecsek, Statue, 'Gömböc, 'Paul, St...","[1, Dandár, u.Budapest, 1095Hungary]",47.4759,19.0697,"['Shotsy, 'Molly, McBride, Jacobson, 'reley250...",2014-10-14,"['Baked, Bean, Museum, Excellence, 'National, ...",[nan],"[http, //www.zwack.hu/en/zwack-muzeumok/zwack-..."
7198,"[The, Zymoglyphic, Museum]","['obscura, day, locations, 'wonder, cabinets, ...",219,1982,"[The, Zymoglyphic, Museum, houses, cabinet, cu...","[Private, collection, art, inspired, cabinets,...","['Morrison, Street, Minigallery, 'Lincoln, Str...","[6225, SE, Alder, St.Portland, Oregon, 97215Un...",45.5177,-122.5998,"['michelle, 'Martin, 'wythe, 'medhere, 'zymogl...",2013-06-13,"['Ilana, Goor, Museum, 'Milwaukee, Art, Museum...",[nan],"[http, //www.zymoglyphic.org]"


In [9]:
# stemming, we throw back to the key word removing suffixes and prefixes

from nltk.stem.snowball import SnowballStemmer
englishStemmer = SnowballStemmer("english")

df['placeDesc'] = df['placeDesc'].apply(lambda row: [englishStemmer.stem(sentence) for sentence in row])

In [None]:
#for each word that is the description we created a dictionary, so we obtain a word and an integer number

dictionary = OrderedDict()
term_id = 0

def create_dictionary():
    with open('./vocabulary.json', 'w+', encoding='utf-8') as dict_file :
        json.dump(dictionary, dict_file)
    dict_file.close()

def fill_dictionary(values):
    global term_id
    for word in values:
        dictionary[word] = term_id + 1
        term_id += 1

df['placeDesc'].apply(lambda values: fill_dictionary(values))
create_dictionary()

In [None]:
# with the inverse dictionary we can see if in the description column for each row we are checking if that word is in that row and we obtain the idex of that row
# finally we have the integer of that word and the list of the indexes of the rows where we find that word

def inverse_dictionary(data):
    inverse_dict = OrderedDict()
    f = open('vocabulary.json')
    jsonData = json.load(f)
    l = list(jsonData.keys())

    for word in tqdm(l):
        o = [word in x for x in data]
        idxs = [i for i, x in enumerate(o) if x == True]
        inverse_dict[jsonData[word]] = idxs
    f.close()
    return inverse_dict

inverse_dict = inverse_dictionary(df['placeDesc'])

In [None]:
with open('./inverse_vocabulary.json', 'w+', encoding='utf-8') as dict_file :
    json.dump(inverse_dict, dict_file)
dict_file.close()

In [11]:
df_original['index'] = df_original.index

In [12]:
query_original = input("Type query: " ) #creating a query
query = query_original.lower().split(' ')

row_query_list = []
row_query = set()

f = open('vocabulary.json')
vocabulary = json.load(f)
f.close()

with open('./inverse_vocabulary.json', encoding='utf-8') as dict_file :
    jsonData = json.load(dict_file)
    for word in query:
        try:
            term_id = vocabulary[word]
            documents_id = jsonData[str(term_id)]
            row_query_list.append(set(documents_id))
        except KeyError:
            row_query.clear()
        row_query = set.intersection(*row_query_list)
dict_file.close()

print("searching on query: " +'\033[1m' + query_original + '\033[0m')
if len(row_query) == 0:
    print('\033[1m' + "\nNo result was found. Maybe you're searchin for a stopword. Please retry with more informations." + '\033[0m' + '\n')

#we obtain the words in a database and taking just the indexes with that words
df_original[['placeName','placeDesc','placeUrl','placeShortDesc']][df_original['index'].isin(row_query)]

searching on query: [1mjal mahal[0m


Unnamed: 0,placeName,placeDesc,placeUrl,placeShortDesc
2911,Jal Mahal,"Located in the middle of the Man Sagar Lake, a...",,More than half of this Indian palace is drowne...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# define the cosine similarity
def get_cosine_similarity(word, rows):
    train_set = [word,*df_original['placeDesc'].get(rows)]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)  #finds the tfidf score with normalization
    return cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train)[:,1:]

In [None]:
f = open('vocabulary.json')
vocabulary = json.load(f)
reverse_vocabulary = dict((v, k) for k, v in vocabulary.items())
f.close()


def inverse_tfid_dictionary():
    inverse_dict = OrderedDict()
    f = open('inverse_vocabulary.json')
    jsonData = json.load(f)
    term_ids = list(jsonData.keys())
    for term_id in tqdm(term_ids):
        tfId_scores = get_cosine_similarity(reverse_vocabulary[int(term_id)], jsonData[term_id])
        result = list(zip(jsonData[term_id], tfId_scores[0]))
        inverse_dict[term_id] = result
    f.close()
    return inverse_dict

In [None]:
inverse_dict = inverse_tfid_dictionary()

In [None]:
with open('./inverse_tf_ids_vocabulary.json', 'w+', encoding='utf-8') as dict_file :
    json.dump(inverse_dict, dict_file)
dict_file.close()

In [13]:
query_original = input("Type query: " )
query = query_original.lower().split(' ')

row_query = set()

f = open('vocabulary.json')
vocabulary = json.load(f)
f.close()

with open('./inverse_tf_ids_vocabulary.json', encoding='utf-8') as dict_file :
    jsonData = json.load(dict_file)
    for word in query:
        try:
            term_id = vocabulary[word]
            documents_ids = jsonData[str(term_id)]
            documents_ids = [tuple(x) for x in documents_ids]
            row_query.update(set(documents_ids))
        except KeyError:
            row_query.clear()
dict_file.close()

row_query = list(row_query)
row_query.sort(key=lambda tup: tup[1], reverse=True)
top_5 = row_query[0:5] #taking the first five similar rows
heapq.heapify(top_5) #putting then in a heap structure


print("searching on query: " +'\033[1m' + query_original + '\033[0m')
if len(row_query) == 0:
    print('\033[1m' + "\nNo result was found. Maybe you're searchin for a stopword. Please retry with more informations." + '\033[0m' + '\n')

top_5_df_rows = df_original[['placeName','placeDesc','placeUrl','placeShortDesc','index']][df_original['index'].isin([doc_id[0] for doc_id in top_5])] #taking the original df adding 5 similar rows
tuples_as_df = pd.DataFrame(top_5, columns=['index','similarity'])
top_5_df_rows = top_5_df_rows.merge(tuples_as_df, on="index")
top_5_df_rows.drop(['index'], axis=1)
# create a new column similarity to see on the output and merging the tuples based on the rows' index

searching on query: [1mcolosseum rome[0m


Unnamed: 0,placeName,placeDesc,placeUrl,placeShortDesc,similarity
0,The Museum of Roman Ships at Fiumicino,An accidental discovery delivered this unique ...,,Roman-era ships unearthed...,0.054451
1,The Relic Crypt of St. Helena at Église Saint-...,Empress St. Helena was directly responsible fo...,http://saintleuparis.catholique.fr,This little-known relic crypt holds stolen par...,0.059747
2,Twin Lakes Capitoline Wolf,"By 1931, Benito Mussolini had been in power fo...",,Mussolini gave Cincinnati a bronze statue of R...,0.063769
3,Via Appia,"Walking from the center of Rome, Via Appia, or...",,2300-year-old Roman road connecting the empire.,0.080873
4,Villa Doria Pamphili Park,Built by the noble Roman Pamphili family in 16...,https://www.villapamphili.it/,A huge enchanting public park just outside the...,0.067127


# **3. Define a new Score**
Now it's your turn. Build a new metric to rank places based on the queries of their users.

In this scenario, a single user can give input more information than a single textual query, so you need to consider all this information and think of a creative and logical way to answer the user's requests.

Practically:

The user will enter a text query. As a starting point, get the query-related documents by exploiting the search engine of Step 3.1.

Once you have the documents, you need to sort them according to your new score. In this step, you won't have any more to take into account just the plot of the documents; you must use the remaining variables in your dataset (or new possible variables that you can create from the existing ones). You must use a heap data structure (you can use Python libraries) for maintaining the top-k documents.

Q: How to sort them? A: Allow the user to specify more information that you find in the documents and define a new metric that ranks the results based on the new request. You can also use other information regarding the place to score some places above others.

N.B.: You have to define a scoring function, not a filter!

In [15]:
def get_custom_similarity(rows):
    df_rows_visited_people = [*df_original['numPeopleVisited'].get(rows)]
    normed_similarity_coeff = [0] if sum(df_rows_visited_people)== 0 else df_rows_visited_people/np.linalg.norm(df_rows_visited_people)  #added a new similarity coefficient taking the column of the people visited that place
    return normed_similarity_coeff

In [16]:
def inverse_custom_dictionary():
    inverse_dict = OrderedDict()
    f = open('inverse_vocabulary.json')
    jsonData = json.load(f)
    term_ids = list(jsonData.keys())
    for term_id in tqdm(term_ids):
        custom_scores = get_custom_similarity(jsonData[term_id])
        result = list(zip(jsonData[term_id], custom_scores))
        inverse_dict[term_id] = result
    f.close()
    return inverse_dict

In [17]:
inverse_dict = inverse_custom_dictionary()

100%|██████████| 54205/54205 [00:14<00:00, 3761.68it/s]


In [18]:
#asking to the user to write a place, city etc... and then if want to see the most visited or not
query_original = input("Type query: " )
query = query_original.lower().split(' ')

while True:
    try:
        sort_param = int(input("Do you want to see the most visited places?: 0 for NO, any number for YES" ))
        break
    except:
        continue


row_query = set()

f = open('vocabulary.json')
vocabulary = json.load(f)
f.close()

with open('./inverse_custom_score_vocabulary.json', encoding='utf-8') as dict_file :
    jsonData = json.load(dict_file)
    for word in query:
        try:
            term_id = vocabulary[word]
            documents_ids = inverse_dict[str(term_id)]
            documents_ids = [tuple(x) for x in documents_ids]
            row_query.update(set(documents_ids))
        except KeyError:
            row_query.clear()
dict_file.close()

row_query = list(row_query)
row_query.sort(key=lambda tup: tup[1], reverse=True if sort_param!=0 else False)
top_5 = row_query[0:5]

print("searching on query: " +'\033[1m' + query_original + '\033[0m')
if len(row_query) == 0:
    print('\033[1m' + "\nNo result was found. Maybe you're searchin for a stopword. Please retry with more informations." + '\033[0m' + '\n')

top_5_df_rows = df_original[['placeName','placeDesc','placeUrl','placeShortDesc','index']][df_original['index'].isin([doc_id[0] for doc_id in top_5])]
tuples_as_df = pd.DataFrame(top_5, columns=['index','custom_score'])
top_5_df_rows = top_5_df_rows.merge(tuples_as_df, on="index")
top_5_df_rows.sort_values(by="custom_score", ascending=False)
# we serched "moscow" and the output gives the 5 rows that corresponds to the 5 places of that query
# that helps to the user to choose the place based on popularity, also if that are not close. the last table will be visualized on a map in the next exercise

searching on query: [1mMoscow[0m


Unnamed: 0,placeName,placeDesc,placeUrl,placeShortDesc,index,custom_score
3,Pergamon Museum,"Situated on Berlin’s Museum Island, the Pergam...",,"The most visited, and possibly most controvers...",4598,0.859723
2,Moscow Metro Stations,"Subway stations are often sordid spots, the da...",http://www.mosmetro.ru/,The gorgeous entrances into the city's underbe...,3886,0.276974
0,Lenin's Mausoleum,Lenin’s mausoleum in Red Square offers up one ...,,This building holds the embalmed remains of th...,3367,0.264174
1,Linnahall,"When Moscow hosted the 1980 Summer Olympics, t...",http://www.linnahall.ee,An empty relic of the Moscow Olympics crumbles...,3416,0.207286
4,World Traveler Signpost,Non-Mainers may be rightfully confounded by th...,,Turns out you can see the world without ever l...,7099,0.120176


# **4. Visualizing the most relevant Places**

Show a map with the places found with the score defined in point 3. Ensure you can at least identify and visualize the name, city, country, address and the number of people who visited each place.


In [20]:
import folium

map_df = df_original[df_original['placeName'].isin(top_5_df_rows['placeName'])]
sorted= map_df.sort_values(by=r'numPeopleVisited', ascending=False)

Globe= [53.0000,9.0000] #taking the geolocalization  with the latitude and longitutde of the dataframe
my_map = folium.Map(location = Globe, zoom_start =1.5)

for row in sorted.iterrows():
    row_values = row[1]
    location = [row_values['placeAlt'], row_values['placeLong']]
    popup= '<strong>' + '-Name: '+ row_values['placeName'] + '<br>' + '-Number of Visitors: '+ str(row_values['numPeopleVisited'])+ '<br>'+ '-Address: ' +row_values['placeAddress']+ '<strong>'
    marker= folium.Marker(location=location, popup= popup)
    marker.add_to(my_map)

display(my_map)
# with the popup visualize the name, number of visitors the address of the city, to do it we have to do the doubleclick on that tag on the map
# to observe the exact localization it's necessary to zoom-in
# to visualize all the tags on the map it's necessary to zoom-out