In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import lxml
import numpy as np

import os                                                      #Needed to move between OS folders

import re
from bs4 import BeautifulSoup                                  #Scraper 
import requests                                                #URL drainer

from tqdm import tqdm

from datetime import datetime                                  #To be leveraged to define datetime objects  


import nltk                                                    #Text preprocessing library
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer    #Useful already implemented tfidf vectorizer from scikit learn library

from mrjob.job import MRJob
from mrjob.step import MRStep                                  #MapReduce methods to perform the map-shuffle-reduce pattern

# 1. Data collection

## 1.1. Get the list of places


In [None]:
places = []

for i in range(1,401):
    main_url = 'https://www.atlasobscura.com/places?page=' + str(i) +'&sort=likes_count'
    cont = requests.get(main_url)
    soup = BeautifulSoup(cont.text)
    for place in soup.find_all('a', {'class':'content-card content-card-place'}):
        places.append('https://www.atlasobscura.com'+place.get('href'))

f = open('places.txt','w+')

for place in places:
    f.write(place+'\n')

f.close()

_____________

## 1.2 Crawl places

In [None]:
f = open('places.txt','r')

lines = f.readlines()

dic = {}
for i in range(0,7200,18):
    dic[1+i//18] = lines[i:i+18]

for page in range(0,401):
    try:
        os.mkdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
    except FileExistsError:
        pass
    
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)

    for place in dic[page]:
        place_name = place[36:len(place)-1]
        vanilla = requests.get(place[:-1],allow_redirects=False,headers = {'User-agent': 'your bot 0.1'})
        
        with open(place_name+".txt",'w+',encoding="utf-8") as new_file:
            new_file.write(vanilla.text)

os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3')

____________

## 1.3 Parse downloaded pages

We need to define a script to extract useful information from each HTML we collected. In particular we want these information:
1. Place Name (to save as $placeName$): String.
2. Place Tags (to save as $placeTags$): List of Strings.
3. Number of people who have been there (to save as $numPeopleVisited$): Integer.
4. Number of people who want to visit the place(to save as $numPeopleWant$): Integer.
5. Description (to save as $placeDesc$): String. Everything from under the first image up to "know before you go" (orange frame on the example image).
6. Short Description (to save as $placeShortDesc$): String. Everything from the title and location up to the image (blue frame on the example image).
7. Nearby Places (to save as $placeNearby$): Extract the names of all nearby places, but only keep unique values: List of Strings.
8. Address of the place(to save as $placeAddress$): String.
9. Altitude and Longitude of the place's location(to save as $placeAlt$ and $placeLong$): Integers
10. The username of the post editors (to save as $placeEditors$): List of Strings.
11. Post publishing date (to save as $placePubDate$): datetime.
12. The names of the lists that the place was included in (to save as $placeRelatedLists$): List of Strings.
13. The names of the related places (to save as $placeRelatedPlaces$): List of Strings.
14. The URL of the page of the place (to save as $placeURL$):String

We leverage *BeautifulSoup library* to scrape the information, but we need just an additional method to convert into a datetime object the post publishing date since it was a string in the format 'Month Day, Year'. For example we found 'May 8, 2010' for the very first link, and instead we wanted '2010/05/08'. This method does so:

In [2]:
def string_to_datetime(string):
    return str(datetime.strptime(string, '%B %d, %Y').date())

We define a function that builds a dictionary of information for every place we go through: then from this dictionary we'll buil a .tsv for every HTML document we gathered.

In [44]:
def darkAtlasScraper(text):
    
    soup = BeautifulSoup(text)
    
    scraped = {'placeName': 'NaN',
               'placeTags': 'NaN',
               'numPeopleVisited': 'NaN',
               'numPeopleWant': 'NaN',
               'placeDesc': 'NaN',
               'placeShortDesc':'Nan',
               'placeNearby': 'NaN',
               'placeAddress': 'NaN',
               'placeAlt': 'NaN',
               'placeLong': 'NaN',
               'placeEditors': 'NaN',
               'placePubDate': 'NaN',
               'placeRelatedPlaces': 'NaN',
               'placeRelatedLists': 'NaN',
               'placeURL': 'NaN'}          
    
    try:
        scraped['placeName'] = soup.find_all('h1',{'class':'DDPage__header-title'})[0].contents[0]
    except IndexError:
        pass
           
    try:
        scraped['placeTags'] = list(map(lambda s:s.strip(),
                                        [tag.contents[0] for tag in soup.find_all('a',{'class':'itemTags__link js-item-tags-link'})]))
    except IndexError:
        pass
    
    
    counters = soup.find_all('div',{'class':'title-md item-action-count'})
    try:
        scraped['numPeopleVisited'] = int(counters[0].contents[0])
    except IndexError:
        pass
    try:
        scraped['numPeopleWant'] = int(counters[1].contents[0])
    except IndexError:
        pass
    

    place_desc = ''
    for paragraph in soup.find_all('div',{'class':'DDP__body-copy'})[0].find_all('p'):
        for element in paragraph.contents:
            if re.search('<[^>]*>', str(element)):
                element = re.sub('<[^>]*>', "", str(element))
                place_desc += element
            else:
                place_desc += str(element)
    scraped['placeDesc'] = place_desc
    
    try:
        scraped['placeShortDesc'] = soup.find_all('h3',{'class':'DDPage__header-dek'})[0].contents[0].replace(u'\xa0', u'')
    except IndexError:
        pass

    nearby = []
    try:
        for nearbies in soup.find_all('div',{'class':'DDPageSiderailRecirc__item-text'}):
            nearby.append(nearbies.find_all('div',{'class':'DDPageSiderailRecirc__item-title'})[0].contents[0])
        scraped['placeNearby'] = nearby
    except IndexError:
        pass
    
    try:
        address = (str(soup.find_all('aside',{'class':'DDPageSiderail__details'})[0]
                           .find_all('address',{'class':'DDPageSiderail__address'})[0]
                           .find_all('div')[0])
                           .split('\n', 1)[0])
        scraped['placeAddress'] = re.sub('<[^>]*>', " ", address)
    except IndexError:
        pass
    
    coordinates = soup.find_all('div',{'class':'DDPageSiderail__coordinates js-copy-coordinates'})[0].contents[2]
    scraped['placeAlt'] = float(coordinates.split()[0][:-1])
    scraped['placeLong'] = float(coordinates.split()[1])


    editorsoup = soup.find_all('a',{'class':'DDPContributorsList__contributor'})
    scraped['placeEditors'] = [stuff.find_all('span')[0].contents[0] 
                               for stuff in editorsoup 
                               if len(stuff.find_all('span')) > 0]
    if not scraped['placeEditors']:
        zzz = soup.find_all('div',{'class':'ugc-editors'})
        flag = 0
        for soupper in zzz:
            if soupper.find_all('h6')[0].contents[0] == 'Added by':
                flag = 1
                break
        try:
            editorsoup = soup.find_all('div',{'class':'ugc-editors'})[flag].find_all('a',{'class':'DDPContributorsList__contributor'})
            scraped['placeEditors'] = [editors.contents[0]
                                       for editors in editorsoup]
        except IndexError:
            pass
            
    try:
        scraped['placePubDate'] = string_to_datetime(soup.find_all('div',{'class':'DDPContributor__name'})[0].contents[0])
    except IndexError:
        pass

    kircher = soup.find_all('div',{'class':'athanasius'})
    for piece in kircher:
        for piecer in piece.find_all('div',{'class':'CardRecircSection__title'}):
            if piecer.contents[0] == 'Related Places':
                scraped['placeRelatedPlaces'] = [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
            elif 'Appears in' in piecer.contents[0]:
                scraped['placeRelatedLists'] =  [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
    
    scraped['placeURL'] = 'https://www.atlasobscura.com/places/' + filename[:-4]
    
    return scraped

Now we have to define the script that goes through each folder and for each folder goes through each downloaded HTML, scrapes information and store them in a new .tsv file.

In [45]:
for page in range(1,401):
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)
    
    for filename in os.listdir(os.getcwd()):
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
        
        new_path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page) + '\\' + filename
        soupper = open(new_path, 'r',encoding="utf-8")
        
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\tsv')
        newer_path = r'C:\\Users\\Leonardo\\ADM_HW3\\tsv\\'+filename[:-4]+'.tsv'
        try:
            infos = darkAtlasScraper(soupper)
        except IndexError:
            print(newer_path)
        with open(newer_path,'w+',encoding="utf-8") as new_file:
            for info in infos.values():
                if type(info) == list:
                    for index in range(len(info)):
                        if index < len(info) - 1:
                            new_file.write(str(info[index])+ ', ')
                        elif index == len(info) - 1:
                            new_file.write(str(info[index]))
                    new_file.write('\t')
                else:
                    new_file.write(str(info))
                    new_file.write('\t')

In [None]:
os.chdir(r'C:\\Users\\Leonardo')

We then decided to build a .csv to collect all the data and to make them always available in a practical format. 

In [29]:
path= r"C:\Users\Leonardo\ADM_HW3\tsv"
final_dataset = []
filenames = os.listdir(path)

for file in filenames:
    if file.endswith('tsv'):
        file_path = os.path.join(path,file)
        try:
            df = pd.read_csv(file_path, sep ="\t", header=None, quoting=3)
            final_dataset.append(df)
        except:
            print(file)
            pass
    
final_dataset = pd.concat(final_dataset)
final_dataset.to_csv('final_dataset.csv', index = False)

In [30]:
path = r"C:\Users\Leonardo\Documents\GitHub\3HW-ADM-Fabri.Dinino.Aur\final_dataset.csv"

mostPopularPlaces = pd.read_csv(path)
mostPopularPlaces = mostPopularPlaces.iloc[:, :-2]
mostPopularPlaces.columns = ['placeName', 'placeTags', 'numPeopleVisited', 'numPeopleWant', 'placeDesc', 'placeShortDesc', 'placeNearby', 
        'placeAddress', 'placeAlt', 'placeLong', 'placeEditors', 'placePubDate', 'placeRelatedPlaces', 'placeRelatedLists', 'placeURL']

We now have the whole dataset and we are ready to start building the search engines. 

In [32]:
mostPopularPlaces.head(20)


Unnamed: 0,placeName,placeTags,numPeopleVisited,numPeopleWant,placeDesc,placeShortDesc,placeNearby,placeAddress,placeAlt,placeLong,placeEditors,placePubDate,placeRelatedPlaces,placeRelatedLists,placeURL
0,109 East Palace,"manhattan project, secret, nuclear, science",417,470,When you need to be dropped off at a top-secre...,This innocuous New Mexico storefront was once ...,"Palace of the Governors, Spitz Clock, La Conqu...","109 East Palace Santa Fe, New Mexico, 87501 ...",35.6875,-105.9372,"tylercole, Collector of Experiences, nanpalmero",2014-03-31,"Camp Century (Project Iceworm), Red Gate Woods...",,https://www.atlasobscura.com/places/109-east-p...
1,145 Rue Lafayette,"urban planning, cities, transportation, archit...",163,412,"While walking down the street, you’d be forgiv...",What looks like a normal building is really a ...,"WWII Bunker Under Gare de l'Est, Le Louxor Pal...",145 Rue la Fayette Paris France,48.8792,2.3562,"erjeffery, SEANETTA, carllenox",2018-06-15,Vauxhall Bridge's Miniature St. Paul's Cathedr...,,https://www.atlasobscura.com/places/145-rue-la...
2,17 Room Ruin,"abandoned houses, native americans, abandoned,...",161,1442,"Outside Bluff, Utah, a massive 100-foot-deep a...",A well-preserved Ancestral Puebloan ruin tucke...,"House on Fire Ruin, Forrest Gump Point, Four C...","Bluff, Utah United States",37.2748,-109.5102,"Anynamewilldo, djm213, Molly McBride Jacobson",2017-05-10,"Keller House, Cow Springs Trading Post, Foinik...",The United States of Abandoned Places,https://www.atlasobscura.com/places/17-room-ruin
3,1890s Alien Gravesite,"aliens, graves, gravestones, cemeteries",248,1189,"While it receives no fanfare today, the small ...",This small town Texas cemetery is said to be t...,"The Lost Arm of F. Stewart, Chef Point Bar & R...","507 Cemetery Rd Aurora, Texas, 76078 United S...",33.0534,-97.5,"Jzohmnbie, Darrell Powers, celiarosegoes, dung...",2014-11-21,"Betty and Barney Hill Graves, The Menster Chri...",17 Places to Hunt for Aliens Besides Area 51,https://www.atlasobscura.com/places/1800-s-ali...
4,University of Virginia’s Hidden Chemical Hearth,"thomas jefferson, unesco, hidden, science",130,436,Nestled in the ground floor of a UNESCO World ...,Hidden for 165 years inside a building designe...,"University of Virginia's Seven Society, The Ra...","1721 University Ave Charlottesville, Virginia...",38.0357,-78.5033,blimpcaptain,2016-05-09,"Kamerlingh Onnes Laboratory Plaque, University...",,https://www.atlasobscura.com/places/1820s-chem...
5,1895 Looff Carousel,"carousels, amazing automata, outsider architec...",811,768,Amidst the glut of shops and restaurants that ...,One of the few remaining carousels built by ma...,San Diego Police Department Jail Cells and Pol...,"817 West Harbor Drive San Diego, California, ...",32.7086,-117.169,"Rachel, e1savage, jondi letnap, lastpearl, Avo...",2014-06-06,"Tom Mankiewicz Conservation Carousel, Leroy Ki...",,https://www.atlasobscura.com/places/1895-looff...
6,1904 World's Fair Flight Cage,"flight, world's fair, zoos, birds, animals",868,452,When the St. Louis 1904 flight cage was built ...,What was meant to be a temporary exhibit is no...,"Turtle Playground, World's Largest Amoco Sign,...","Government Drive St. Louis, Missouri United S...",38.6367,-90.2922,Collector of Experiences,2017-10-06,"Sir Nils Olav, Monument to Carrier Pigeons, 'S...",,https://www.atlasobscura.com/places/1904-world...
7,1940 Air Terminal Museum,"aviation, art deco, airplanes, museums and col...",194,544,Once the commercial hub of an exciting new way...,This historical air terminal in Houston is slo...,"Smither Park, The Orange Show, Eclectic Menage...","8301-8399 Travelair Street Houston, Texas, 77...",29.6476,-95.2869,"lfhphoto, larryhockett, Edward Denny, michaela...",2013-02-15,"Valle Planes of Fame Air Museum, Belgrade Avia...",Unusual Wedding Venues,https://www.atlasobscura.com/places/1940-air-t...
8,1950s All-Electric House,"energy, aletrail, electricity, electrical oddi...",163,651,This home of the future from the past features...,This retro-future model home is packed to the ...,"Lawrin Gravesite, The National Museum of Toys ...","8788 Metcalf Ave Overland Park, Kansas United...",38.9691,-94.6686,"Collector of Experiences, alisam9, andrewgusta...",2017-06-08,"The House of Wonders, The Z Machine, Pentagon ...","Favorites, Off-Centered Ale Trail: Kansas City",https://www.atlasobscura.com/places/1950s-alle...
9,Papal Hearts at Santi Vincenzo e Anastasio a T...,"hearts, relics, relics and reliquaries, church...",331,901,"In 2002, Pope John Paul II gifted use of Santi...",This former Roman Catholic Church still contai...,"Vicus Caprarius, Galleria Sciarra, The Head of...","Vicolo dei Modelli, 73 Rome, 00187 Italy",41.9007,12.4837,"tinolasvegas, contender, Rachel",2013-05-19,Lipsanothecae of the Chapel of Saint Francesco...,"The Ultimate Guide to Scattered Body Parts, An...",https://www.atlasobscura.com/places/22-papal-h...


_________________________________________________

## 2. Search Engine

In [78]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [79]:
def ntlk_analysis(info):
    final_words = []
    tokens = word_tokenize(info.lower())
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    for token in tokens:
        if token not in stop_words and token.encode().isalpha():
            stemming_token = ps.stem(token)
            final_words.append(stemming_token)
            
    return final_words

## 2.1. Conjunctive query

In [87]:
nltk.download()
listOfWords = []
for index, row in mostPopularPlaces['placeDesc'].iteritems():
    description = mostPopularPlaces['placeDesc'][index]
    listOfWords.append(ntlk_analysis(description))

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [1]:
mostPopularPlaces['listOfWords'] = listOfWords

NameError: name 'listOfWords' is not defined

In [28]:
mostPopularPlaces['listOfWords'].head(15)

KeyError: 'listOfWords'

### 2.1.1) Create your index!

In [8]:
vocabolary = {}

for index, list_ in mostPopularPlaces['listOfWords'].iteritems():
    for word in list_:
        if word in vocabolary:
            if index not in vocabolary[word]:
                vocabolary[word].append(index)
        else:
            vocabolary[word] = [index]

vocabolary = dict(sorted(vocabolary.items()))
with open('vocabolary.txt', 'w') as file:
    file.write(str(vocabolary))

### 2.1.2) Execute the query

In [11]:
def search_match(query,df):
    result = []
    query = ntlk_analysis(query)
    match = {key: [] for key in query}

    for key,list_of_values in vocabolary.items():
        if key in match:
            for value in list_of_values:
                if value not in match[key]:
                    match[key].append(value)
                    
    final_values = list(match.values())
    intersection = set.intersection(*map(set,final_values))
    
    for index in intersection:
        series = df[['placeName', 'placeDesc','placeURL']].loc[index]
        result.append(series)
    
    return pd.DataFrame(result)

In [12]:
first_search_engine = search_match('american museum', mostPopularPlaces)
first_search_engine

Unnamed: 0,placeName,placeDesc,placeURL
4097,National Museum of Health and Medicine,"Once housed in downtown Washington, D.C., the ...",https://www.atlasobscura.com/places/national-m...
4098,National Museum of the Pacific War,Dedicated specifically to remembering the stor...,https://www.atlasobscura.com/places/national-m...
6147,The Old Patent Model Museum,Before the Smithsonian reopened the building i...,https://www.atlasobscura.com/places/the-old-pa...
4100,National Watch and Clock Museum,"In our age, state-of-the-art clocks keep time ...",https://www.atlasobscura.com/places/national-w...
2053,Fraunces Tavern,Fraunces Tavern was originally the home of ear...,https://www.atlasobscura.com/places/fraunces-t...
...,...,...,...
4085,National Cowboy and Western Heritage Museum,To truly understand the era and the land that ...,https://www.atlasobscura.com/places/national-c...
4086,National Cryptologic Museum,Update as of October 2021: Currently closed fo...,https://www.atlasobscura.com/places/national-c...
1018,Catoctin Furnace,"In the early 1770s, Thomas Johnson discovered ...",https://www.atlasobscura.com/places/catoctin-f...
6653,Urology Museum,It wasn’t long a go that a bladder stone was a...,https://www.atlasobscura.com/places/urology-mu...


# 2.2. Conjunctive query & Ranking score

We can leverage tfidf method from sklearn, or we can implement the tfidf from scratch. We will implement both solution as exercise, but we'll use *TfdifVectorizer* to focus on tuning the model on the data. 

In [2]:
tfidf = TfidfVectorizer(input='content', lowercase=False, tokenizer=lambda text:text)
result = tfidf.fit_transform(webdata.list_words)
tfidf.get_feature_names()

NameError: name 'webdata' is not defined