In [None]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import lxml
import numpy as np

import os

import re
from bs4 import BeautifulSoup 
import requests

from datetime import datetime 

# 1. Data collection

## 1.1. Get the list of places


In [None]:
# Walk the 400 listing pages (sorted by likes) and collect the absolute URL of
# every place card found on each of them.
places = []

for i in range(1, 401):
    main_url = 'https://www.atlasobscura.com/places?page=' + str(i) + '&sort=likes_count'
    cont = requests.get(main_url)
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (lxml is already imported by this notebook).
    soup = BeautifulSoup(cont.text, 'lxml')
    for place in soup.find_all('a', {'class': 'content-card content-card-place'}):
        places.append('https://www.atlasobscura.com' + place.get('href'))

# One URL per line; the context manager closes the file even if a write fails.
with open('places.txt', 'w+') as f:
    for place in places:
        f.write(place + '\n')

_____________

## 1.2 Crawl places

In [None]:
# Read back the list of place URLs (one per line, each ending in '\n').
with open('places.txt', 'r') as f:
    lines = f.readlines()

# Group the URLs by listing page: 18 places per page, pages numbered from 1.
dic = {}
for i in range(0, 7200, 18):
    dic[1 + i // 18] = lines[i:i + 18]

# dic is keyed 1..400, so the loop must start at 1: starting at 0 raised
# KeyError on dic[0] before anything was downloaded.
for page in range(1, 401):
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    try:
        os.mkdir(path)
    except FileExistsError:
        # Folder left over from a previous run: reuse it.
        pass

    # The HTML files below are written with relative names, so move into the folder.
    os.chdir(path)

    for place in dic[page]:
        url = place.rstrip('\n')
        # Drop the 36-character prefix 'https://www.atlasobscura.com/places/'
        # to obtain the slug used as file name.
        place_name = url[36:]
        vanilla = requests.get(url, allow_redirects=False, headers={'User-agent': 'your bot 0.1'})

        with open(place_name + ".txt", 'w+', encoding="utf-8") as new_file:
            new_file.write(vanilla.text)

os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3')

____________

## 1.3 Parse downloaded pages

We need to define a script to extract useful information from each HTML page we collected. In particular, we want the following information:
1. Place Name (to save as $placeName$): String.
2. Place Tags (to save as $placeTags$): List of Strings.
3. Number of people who have been there (to save as $numPeopleVisited$): Integer.
4. Number of people who want to visit the place(to save as $numPeopleWant$): Integer.
5. Description (to save as $placeDesc$): String. Everything from under the first image up to "know before you go" (orange frame on the example image).
6. Short Description (to save as $placeShortDesc$): String. Everything from the title and location up to the image (blue frame on the example image).
7. Nearby Places (to save as $placeNearby$): Extract the names of all nearby places, but only keep unique values: List of Strings.
8. Address of the place(to save as $placeAddress$): String.
9. Latitude and Longitude of the place's location (to save as $placeAlt$ and $placeLong$): Floats
10. The username of the post editors (to save as $placeEditors$): List of Strings.
11. Post publishing date (to save as $placePubDate$): datetime.
12. The names of the lists that the place was included in (to save as $placeRelatedLists$): List of Strings.
13. The names of the related places (to save as $placeRelatedPlaces$): List of Strings.
14. The URL of the page of the place (to save as $placeURL$):String

We leverage the *BeautifulSoup library* to scrape the information, but we need one additional helper to convert the post publishing date, which appears as a string in the format 'Month Day, Year', into an ISO-formatted date string. For example, we found 'May 8, 2010' for the very first link, and we wanted '2010-05-08' instead. This function does so:

In [None]:
def string_to_datetime(string):
    """Turn a 'Month Day, Year' date into its ISO 'YYYY-MM-DD' text form.

    Despite the name, the result is a ``str``: 'May 8, 2010' -> '2010-05-08'.
    Raises ValueError when the input does not match '%B %d, %Y'.
    """
    parsed = datetime.strptime(string, '%B %d, %Y')
    calendar_day = parsed.date()
    return calendar_day.isoformat()

We define a function that builds a dictionary of information for every place we go through; from this dictionary we'll then build a .tsv file for every HTML document we gathered.

In [None]:
def darkAtlasScraper(text, source_filename=None):
    """Extract the required fields from one downloaded Atlas Obscura place page.

    Parameters
    ----------
    text : str or file object
        HTML of the place page (anything ``BeautifulSoup`` accepts).
    source_filename : str, optional
        Name of the saved HTML file ('<slug>.txt'); its last four characters
        are dropped to rebuild ``placeURL``. When omitted, the notebook-level
        variable ``filename`` is read instead, which is how the parsing loop
        has always driven this function.

    Returns
    -------
    dict
        Always contains the same keys in the same order: placeName, placeTags,
        numPeopleVisited, numPeopleWant, placeDesc, placeShortDesc,
        placeNearby, placeAddress, placeAlt, placeLong, placeEditors,
        placePubDate, placeRelatedLists, placeRelatedPlaces, placeURL.
        Missing related sections yield empty lists, a missing/unparseable
        date yields '', and missing editors yield ''.

    Raises
    ------
    IndexError
        If a mandatory element (title, counters, description, address,
        coordinates, ...) is not present in the page.
    """
    # Explicit parser: avoids results that depend on which parsers are installed.
    soup = BeautifulSoup(text, 'lxml')
    tag_pattern = re.compile('<[^>]*>')
    card_heading = {'class': 'Card__heading --content-card-v2-title js-title-content'}

    def _card_titles(section):
        # Title text of every content card inside one recirculation section.
        return [tag_pattern.sub("", str(chunk.contents[1]))
                for chunk in section.find_all('h3', card_heading)]

    scraped = {}
    scraped['placeName'] = soup.find_all('h1', {'class': 'DDPage__header-title'})[0].contents[0]

    scraped['placeTags'] = [tag.contents[0].strip()
                            for tag in soup.find_all('a', {'class': 'itemTags__link js-item-tags-link'})]

    # First counter is "been here", second is "want to visit".
    counters = soup.find_all('div', {'class': 'title-md item-action-count'})
    scraped['numPeopleVisited'] = int(counters[0].contents[0])
    scraped['numPeopleWant'] = int(counters[1].contents[0])

    # Concatenate every child of every paragraph, with inline markup removed.
    body = soup.find_all('div', {'class': 'DDP__body-copy'})[0]
    scraped['placeDesc'] = ''.join(tag_pattern.sub("", str(element))
                                   for paragraph in body.find_all('p')
                                   for element in paragraph.contents)

    scraped['placeShortDesc'] = (soup.find_all('h3', {'class': 'DDPage__header-dek'})[0]
                                 .contents[0].replace(u'\xa0', u''))

    nearby = []
    for nearbies in soup.find_all('div', {'class': 'DDPageSiderailRecirc__item-text'}):
        nearby.append(nearbies.find_all('div', {'class': 'DDPageSiderailRecirc__item-title'})[0].contents[0])
    # Keep unique names only, preserving the order in which they appear.
    scraped['placeNearby'] = list(dict.fromkeys(nearby))

    # Only the first line of the first <div> inside the address block is kept.
    address = (str(soup.find_all('aside', {'class': 'DDPageSiderail__details'})[0]
                   .find_all('address', {'class': 'DDPageSiderail__address'})[0]
                   .find_all('div')[0])
               .split('\n', 1)[0])
    scraped['placeAddress'] = tag_pattern.sub(" ", address)

    # Third child of the coordinates block holds two numbers; the first one
    # carries a trailing separator character that is dropped before parsing.
    coordinates = soup.find_all('div', {'class': 'DDPageSiderail__coordinates js-copy-coordinates'})[0].contents[2]
    first, second = coordinates.split()[0], coordinates.split()[1]
    scraped['placeAlt'] = float(first[:-1])
    scraped['placeLong'] = float(second)

    # Editors: prefer the <span> inside each contributor link; otherwise fall
    # back to the links of the first 'ugc-editors' block, if there is one.
    contributor_links = soup.find_all('a', {'class': 'DDPContributorsList__contributor'})
    editors = [link.find_all('span')[0].contents[0]
               for link in contributor_links
               if len(link.find_all('span')) > 0]
    if not editors:
        ugc_blocks = soup.find_all('div', {'class': 'ugc-editors'})
        if ugc_blocks:
            editors = [link.contents[0]
                       for link in ugc_blocks[0].find_all('a', {'class': 'DDPContributorsList__contributor'})]
    scraped['placeEditors'] = editors if editors else ''

    try:
        scraped['placePubDate'] = string_to_datetime(
            soup.find_all('div', {'class': 'DDPContributor__name'})[0].contents[0])
    except (IndexError, ValueError):
        # No such element, or its text is not a 'Month Day, Year' date.
        scraped['placePubDate'] = ''

    # Create both keys up front so every record has the same columns in the
    # same order, even when a page lacks one of the two sections.
    scraped['placeRelatedLists'] = []
    scraped['placeRelatedPlaces'] = []
    for piece in soup.find_all('div', {'class': 'athanasius'}):
        for piecer in piece.find_all('div', {'class': 'CardRecircSection__title'}):
            if piecer.contents[0] == 'Related Places':
                scraped['placeRelatedPlaces'] = _card_titles(piece)
            elif 'Appears in' in piecer.contents[0]:
                scraped['placeRelatedLists'] = _card_titles(piece)

    if source_filename is None:
        # Legacy path: rely on the loop variable of the calling notebook cell.
        source_filename = filename
    scraped['placeURL'] = 'https://www.atlasobscura.com/places/' + source_filename[:-4]

    return scraped

Now we have to define the script that goes through each folder and, for each folder, goes through each downloaded HTML file, scrapes the information and stores it in a new .tsv file.

In [None]:
# Parse every downloaded page and write one single-row .tsv file per place.
# All paths are absolute, so no change of working directory is needed.
os.makedirs(r'C:\\Users\\Leonardo\\ADM_HW3\\tsv', exist_ok=True)

for page in range(1, 401):
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)

    # NOTE: the loop variable must stay named `filename`, because
    # darkAtlasScraper reads it at notebook level to build placeURL.
    for filename in os.listdir(path):
        new_path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page) + '\\' + filename
        newer_path = r'C:\\Users\\Leonardo\\ADM_HW3\\tsv\\' + filename[:-4] + '.tsv'

        try:
            # Context manager closes the HTML file once it has been parsed.
            with open(new_path, 'r', encoding="utf-8") as soupper:
                infos = darkAtlasScraper(soupper)
        except IndexError:
            # A mandatory element is missing: report the page and skip it,
            # instead of writing the previous place's data under this name.
            print(newer_path)
            continue

        with open(newer_path, 'w+', encoding="utf-8") as new_file:
            for info in infos.values():
                if type(info) == list:
                    # Lists are flattened into a single comma-separated field.
                    new_file.write(', '.join(str(item) for item in info))
                else:
                    new_file.write(str(info))
                # Every field, including the last one, is followed by a tab.
                new_file.write('\t')

In [None]:
# Parsing is finished: move the working directory back up to the user's folder.
home_dir = r'C:\\Users\\Leonardo'
os.chdir(home_dir)

We now have the whole dataset and we are ready to start building the search engines. 

_________________________________________________