# **Homework 3 - Places of the world**

In [8]:
#import libraries

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from os import listdir
from os.path import isfile, join
import os
import shutil
import urllib
from datetime import datetime
import csv
import nltk
from nltk.corpus import stopwords

# **1. Data collection**
For this homework, there is no provided dataset. Instead, you have to build your own. Your search engine will run on text documents. So, here we detail the procedure to follow for the data collection.

**1.1. Get the list of places**
We start with the list of places to include in your corpus of documents. In particular, we focus on the Most popular places. Next, we want you to collect the URL associated with each place in this list. The list is long and split into many pages. Therefore, we ask you to retrieve only the URLs of the places listed in the first 400 pages (each page has 18 places, so you will end up with 7200 unique place URLs).

The output of this step is a .txt file in which each line is the URL of one place.

In [None]:
# Probe a single listing page to inspect the markup before scraping all of them.
# The address is a template, so the page number has to be filled in; the original
# requested it with the literal "{}" placeholder still in the query string.
url = 'https://www.atlasobscura.com/places?page={}&sort=likes_count'.format(1)
result = requests.get(url)

# Show the response object and the raw HTML it carries.
print(result)
print(result.text)

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser is
# installed (and warns), so the resulting tree can differ between machines.
soup = BeautifulSoup(result.text, "html.parser")

In [None]:
# Collect the <a> elements matched by the place-card class string.
mydivs = soup.find_all(name="a", attrs={"class": "content-card content-card-place"})
# Notebook preview: the absolute URL built from the second matched card.
'https://www.atlasobscura.com' + mydivs[1].get_attribute_list('href')[0] if False else 'https://www.atlasobscura.com' + mydivs[1]['href']

In [None]:
# Write the URL of every place found on listing pages 1..400 to places_url.txt,
# one URL per line. The `with` block closes the file, so no explicit close() is needed.
with open('places_url.txt', 'w') as f:
    for i in range(1, 401):
        url = 'https://www.atlasobscura.com/places?page={}&sort=likes_count'.format(i)
        result = requests.get(url)
        # Stop on an HTTP error instead of silently writing nothing for this page:
        # a short URL list would misalign the later split into 400 page folders.
        result.raise_for_status()
        # Same explicit parser as the page-parsing step, so results do not depend
        # on which optional parsers happen to be installed.
        soup = BeautifulSoup(result.text, "html.parser")
        mydivs = soup.find_all("a", {"class": "content-card content-card-place"})
        for anchor in mydivs:
            # hrefs are site-relative, so prefix the host to get an absolute URL.
            f.write('https://www.atlasobscura.com' + anchor['href'] + "\n")

**1.2. Crawl places**

Once you get all the URLs in the first 400 pages of the list, you:

Download the HTML corresponding to each of the collected URLs.
After you collect a single page, immediately save its HTML in a file. In this way, if your program stops for any reason, you will not lose the data collected up to the stopping point.
Organize the entire set of downloaded HTML pages into folders. Each folder will contain the HTML of the places on page 1, page 2, ... of the list of locations.

Tip: Because of the large number of pages you have to download, you can use methods that shorten the time it takes. If you employed a particular process or approach, kindly describe it.

In [None]:
def saveHtmlFiles():
    """Download every URL listed in ``places_url.txt`` into the working directory.

    Each page is saved as ``<last path segment of the URL>.html`` as soon as it is
    fetched, so an interrupted run keeps everything downloaded so far. Blank
    lines in the URL file are ignored.

    Returns:
        list[str]: The URLs (without trailing newline) whose download raised an
        exception, in file order. Empty when everything was downloaded.
    """
    # A bare ``import urllib`` does not load the ``urllib.request`` submodule;
    # import it here so the function does not rely on another library having
    # loaded it as a side effect.
    import urllib.request

    notDownloadedUrls = []
    with open('places_url.txt') as file:
        for line in file:
            # Drop the line terminator: it must reach neither the request nor
            # the returned list (callers append their own newline when they
            # write the URLs back to disk).
            url = line.strip()
            if not url:
                continue
            try:
                urllib.request.urlretrieve(url, url.split('/')[-1] + '.html')
            except Exception:
                # Best effort: remember the failure so the caller can retry it.
                notDownloadedUrls.append(url)
    return notDownloadedUrls

In [None]:
# First pass over the complete URL list.
notDownloadedUrls = saveHtmlFiles()

if notDownloadedUrls:
    # saveHtmlFiles() always reads 'places_url.txt', so a retry has to replace
    # that file with the failed URLs. Keep a copy of the complete list and put
    # it back afterwards: the parsing step reads 'places_url.txt' again to map
    # every place to its page folder.
    shutil.copyfile('places_url.txt', 'places_url_full.txt')
    try:
        while notDownloadedUrls:
            with open('places_url.txt', 'w') as f:
                for url in notDownloadedUrls:
                    # Strip before appending the newline so that no blank lines
                    # are written; a blank line can never be downloaded and
                    # would keep this loop running forever.
                    url = url.strip()
                    if url:
                        f.write(url + "\n")
            notDownloadedUrls = saveHtmlFiles()
    finally:
        # os.replace overwrites the destination on every platform.
        os.replace('places_url_full.txt', 'places_url.txt')

**1.3 Parse downloaded pages**

At this point, you should have all the HTML documents about the places of interest, and you can start to extract the places' information. The list of the information we desire for each place and their format is as follows:

In [7]:
# Rebuild, from the URL list, the file name each downloaded page was saved under
# (<last URL segment>.html), then split the names into 400 consecutive groups,
# one per listing page.
file_path = os.getcwd()
files = []
with open('places_url.txt') as file:
    for url in file:
        url = url.strip()
        # Skip blank lines: they would otherwise become a bogus '.html' entry
        # and shift every later file into the wrong page group.
        if url:
            files.append(url.split('/')[-1] + '.html')
file_list = np.array_split(files, 400)


In [None]:
# Move each group of downloaded HTML files into its own folder: page_1 ... page_400.
for i in range(400):
    directory = 'page_' + str(i + 1)

    target_dir = os.path.join(file_path, directory)

    # exist_ok lets the cell be re-run after an interruption.
    os.makedirs(target_dir, exist_ok=True)

    for file_name in file_list[i]:
        source = os.path.join(file_path, file_name)
        # The file may be absent: it was already moved by an earlier run, or the
        # same place appears twice in the URL list and went with its first page.
        if not os.path.isfile(source):
            continue
        # Pass the full destination path so an existing copy is overwritten
        # instead of shutil.move raising "destination path already exists".
        shutil.move(source, os.path.join(target_dir, file_name))

In [2]:
headerList = ['placeName','placeTags','numPeopleVisited','numPeopleWant','placeDesc','placeShortDesc','placesNearby','placeAddress','placeAlt','placeLong','placeEditors','placePubDate','placeRelatedLists','placeRelatedPlaces','placeUrl']


def _text(node):
    """Return the stripped text of a BeautifulSoup node, or '' when the node is None."""
    return '' if node is None else node.text.strip()


def _count(text):
    """Return the digits contained in *text* as an int, or '' when there are none."""
    digits = ''.join(filter(str.isdigit, text))
    return int(digits) if digits else ''


def _action_counts(soup):
    """Return (numPeopleVisited, numPeopleWant) read from the item-actions aside."""
    aside = soup.find('aside', {'class': 'DDPage__item-actions'})
    if aside is None:
        return '', ''
    # NOTE(review): the original called select('div', {...class dict...}); select()
    # takes a CSS selector and does not filter on that dict, so this is every <div>
    # under the aside and the counts are taken by position (4th and 5th). The
    # positions are kept as they were - confirm against the page markup.
    divs = aside.select('div')
    visited = _count(divs[3].text) if len(divs) > 3 else ''
    want = _count(divs[4].text) if len(divs) > 4 else ''
    return visited, want


def _section_titles(soup, heading):
    """Return the card titles of the recirculation section titled *heading*, or ''."""
    title = soup.find('div', {'class': 'CardRecircSection__title'}, text=heading)
    if title is None:
        return ''
    grid = title.findNext('div', {'class': 'CardRecircSection__card-grid'})
    if grid is None:
        return ''
    return [_text(a.findNext('span')) for a in grid.findAll('a')]


def _parse_place(soup):
    """Extract one row of values, ordered like headerList, from a parsed place page.

    Any field whose element is missing from the page is returned as ''.
    """
    placeName = _text(soup.find('h1'))

    tags_div = soup.find('div', {'class': 'item-tags'})
    placeTags = '' if tags_div is None else [a.text.strip() for a in tags_div.select('a')]

    # Each counter is converted from its own text. The original converted the
    # "visited" text and stored the result in numPeopleWant, which lost the
    # "want" count and left numPeopleVisited as a string.
    numPeopleVisited, numPeopleWant = _action_counts(soup)

    body = soup.find('div', {'id': 'place-body'})
    placeDesc = '' if body is None else _text(body.findNext())

    placeShortDesc = _text(soup.find('h3', {'class': 'DDPage__header-dek'}))

    # findAll returns an empty result (never None) when nothing matches, so these
    # two fields are always a collection, possibly empty.
    placesNearby = set([a.text.strip() for a in soup.findAll('div', {'class': 'DDPageSiderailRecirc__item-title'})])
    placeEditors = [a.text.strip() for a in soup.findAll('a', {'class': 'DDPContributorsList__contributor'})]

    placeAddress = _text(soup.find('address', {'class': 'DDPageSiderail__address'}))

    # The coordinates element holds two comma-separated values; the first goes to
    # placeAlt and the second to placeLong.
    coordinates = _text(soup.find('div', {'class': 'DDPageSiderail__coordinates'})).split(',')
    placeAlt = coordinates[0].strip()
    placeLong = coordinates[1].strip() if len(coordinates) > 1 else ''

    placePubDate = ''
    date_node = soup.find('div', {'class': 'DDPContributor__name'})
    if date_node is not None:
        try:
            placePubDate = datetime.strptime(date_node.text.strip(), '%B %d, %Y').date()
        except ValueError:
            # The element does not hold a "Month day, year" date: leave the field empty.
            placePubDate = ''

    # "Appears in" is the section of lists featuring the place and "Related Places"
    # is the section of places; the original assigned them the other way round.
    placeRelatedLists = _section_titles(soup, 'Appears in')
    placeRelatedPlaces = _section_titles(soup, 'Related Places')

    website = soup.find('div', {'class': 'DDPageSiderail__website'})
    link = None if website is None else website.find('a')
    placeUrl = link['href'].strip() if link is not None and link.has_attr('href') else ''

    return [placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placesNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedLists,placeRelatedPlaces,placeUrl]


# Parse every downloaded page and write a two-row .tsv (header + values) next to it.
for i in range(1, 401):
    # os.path.join instead of a hard-coded '\page_..' suffix, which only works on
    # Windows and contains an invalid escape sequence.
    page_dir = os.path.join(os.getcwd(), 'page_{}'.format(i))
    html_files = [f for f in listdir(page_dir) if isfile(join(page_dir, f)) and f.lower().endswith('.html')]
    for file in html_files:
        # BeautifulSoup reads the whole file while building the tree, so the
        # handle can be released as soon as the constructor returns.
        with open(os.path.join(page_dir, file), 'rb') as fp:
            soup = BeautifulSoup(fp, "html.parser")

        valuesList = _parse_place(soup)

        # Output name: the page's file name, lower-cased, with .tsv as extension.
        tsv_name = os.path.splitext(file)[0].lower() + '.tsv'
        # newline='' is required by the csv module; without it Windows writes
        # '\r\r\n' line endings.
        with open(os.path.join(page_dir, tsv_name), 'w', encoding="utf-8", newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            writer.writerow(headerList)
            writer.writerow(valuesList)

In [3]:
# Gather the .tsv file of every place from the page_* folders into ./tsv_files.
target_dir = os.path.join(os.getcwd(), 'tsv_files')
# Create the folder first: if it does not exist, shutil.move would rename the
# first .tsv file to 'tsv_files' instead of moving it into a directory.
os.makedirs(target_dir, exist_ok=True)
for i in range(1, 401):
    # os.path.join instead of a hard-coded '\page_..' suffix, which only works on
    # Windows and contains an invalid escape sequence.
    page_dir = os.path.join(os.getcwd(), 'page_{}'.format(i))
    tsv_names = [f for f in listdir(page_dir) if isfile(join(page_dir, f)) and f.lower().endswith('.tsv')]
    for file in tsv_names:
        # Use the name exactly as listed, and give the full destination path so a
        # place that shows up on two listing pages overwrites its earlier copy
        # instead of raising "destination path already exists".
        shutil.move(os.path.join(page_dir, file), os.path.join(target_dir, file))

# **2. Search Engine**

Now, we want to create two different Search Engines that, given as input a query, return the places that match the query.

First, you must pre-process all the information collected for each place by:

- Removing stopwords
- Removing punctuation
- Stemming
- Anything else you think is needed

For this purpose, you can use the nltk library.

In [42]:
# English stop-word list from NLTK; the loop below drops these words from the text columns.
stop_words = stopwords.words('english')
# Folder named 'tsv_files' under the current working directory.
tsv_dir = os.getcwd() + '/tsv_files'
# Names of the regular files in tsv_dir whose name ends with '.tsv' (case-insensitive).
files = [f for f in listdir(tsv_dir) if isfile(join(tsv_dir, f)) and f.lower().endswith(('.tsv'))]
for file in files:
    df = pd.read_csv(os.path.join(tsv_dir, file.title().lower()), delimiter="\t")
    df['placeName'] = df['placeName'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeTags'] = df['placeTags'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeDesc'] = df['placeDesc'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeShortDesc'] = df['placeShortDesc'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placesNearby'] = df['placesNearby'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeAddress'] = df['placeAddress'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeEditors'] = df['placeEditors'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeRelatedLists'] = df['placeRelatedLists'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placeRelatedPlaces'] = df['placeRelatedPlaces'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))
    df['placesUrl'] = df['placeUrl'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in stop_words]))