In [56]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import lxml
import numpy as np

import os                                                      #Needed to move between OS folders

import re
from bs4 import BeautifulSoup                                  #Scraper 
import requests                                                #URL drainer

from tqdm import tqdm

from datetime import datetime                                  #To be leveraged to define datetime objects  

from collections import Counter
from functools import reduce

import nltk                                                    #Text processing library
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')                                         #Download command on NLTK library to get specific 
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer    #Useful already implemented tfidf vectorizer from scikit learn library

from mrjob.job import MRJob
from mrjob.step import MRStep                                  #MapReduce methods to perform the map-shuffle-reduce pattern

import heapq

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Marina\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marina\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ModuleNotFoundError: No module named 'mrjob'

# 1. Data collection

## 1.1. Get the list of places


In [None]:
places = []

for i in range(1,401):
    main_url = 'https://www.atlasobscura.com/places?page=' + str(i) +'&sort=likes_count'
    cont = requests.get(main_url)
    soup = BeautifulSoup(cont.text)
    for place in soup.find_all('a', {'class':'content-card content-card-place'}):
        places.append('https://www.atlasobscura.com'+place.get('href'))

f = open('places.txt','w+')

for place in places:
    f.write(place+'\n')

f.close()

_____________

## 1.2 Crawl places

In [None]:
f = open('places.txt','r')

lines = f.readlines()

dic = {}
for i in range(0,7200,18):
    dic[1+i//18] = lines[i:i+18]

for page in range(0,401):
    try:
        os.mkdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
    except FileExistsError:
        pass
    
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)

    for place in dic[page]:
        place_name = place[36:len(place)-1]
        vanilla = requests.get(place[:-1],allow_redirects=False,headers = {'User-agent': 'your bot 0.1'})
        
        with open(place_name+".txt",'w+',encoding="utf-8") as new_file:
            new_file.write(vanilla.text)

os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3')

____________

## 1.3 Parse downloaded pages

We need to define a script to extract useful information from each HTML we collected. In particular we want these information:
1. Place Name (to save as $placeName$): String.
2. Place Tags (to save as $placeTags$): List of Strings.
3. Number of people who have been there (to save as $numPeopleVisited$): Integer.
4. Number of people who want to visit the place(to save as $numPeopleWant$): Integer.
5. Description (to save as $placeDesc$): String. Everything from under the first image up to "know before you go" (orange frame on the example image).
6. Short Description (to save as $placeShortDesc$): String. Everything from the title and location up to the image (blue frame on the example image).
7. Nearby Places (to save as $placeNearby$): Extract the names of all nearby places, but only keep unique values: List of Strings.
8. Address of the place(to save as $placeAddress$): String.
9. Altitude and Longitude of the place's location(to save as $placeAlt$ and $placeLong$): Integers
10. The username of the post editors (to save as $placeEditors$): List of Strings.
11. Post publishing date (to save as $placePubDate$): datetime.
12. The names of the lists that the place was included in (to save as $placeRelatedLists$): List of Strings.
13. The names of the related places (to save as $placeRelatedPlaces$): List of Strings.
14. The URL of the page of the place (to save as $placeURL$):String

We leverage *BeautifulSoup library* to scrape the information, but we need just an additional method to convert into a datetime object the post publishing date since it was a string in the format 'Month Day, Year'. For example we found 'May 8, 2010' for the very first link, and instead we wanted '2010/05/08'. This method does so:

In [None]:
def string_to_datetime(string):
    return str(datetime.strptime(string, '%B %d, %Y').date())

We define a function that builds a dictionary of information for every place we go through: then from this dictionary we'll buil a .tsv for every HTML document we gathered.

In [None]:
def darkAtlasScraper(text):
    
    soup = BeautifulSoup(text)
    
    scraped = {'placeName': 'NaN',
               'placeTags': 'NaN',
               'numPeopleVisited': 'NaN',
               'numPeopleWant': 'NaN',
               'placeDesc': 'NaN',
               'placeShortDesc':'Nan',
               'placeNearby': 'NaN',
               'placeAddress': 'NaN',
               'placeAlt': 'NaN',
               'placeLong': 'NaN',
               'placeEditors': 'NaN',
               'placePubDate': 'NaN',
               'placeRelatedPlaces': 'NaN',
               'placeRelatedLists': 'NaN',
               'placeURL': 'NaN'}          
    
    try:
        scraped['placeName'] = soup.find_all('h1',{'class':'DDPage__header-title'})[0].contents[0]
    except IndexError:
        pass
           
    try:
        scraped['placeTags'] = list(map(lambda s:s.strip(),
                                        [tag.contents[0] for tag in soup.find_all('a',{'class':'itemTags__link js-item-tags-link'})]))
    except IndexError:
        pass
    
    
    counters = soup.find_all('div',{'class':'title-md item-action-count'})
    try:
        scraped['numPeopleVisited'] = int(counters[0].contents[0])
    except IndexError:
        pass
    try:
        scraped['numPeopleWant'] = int(counters[1].contents[0])
    except IndexError:
        pass
    

    place_desc = ''
    for paragraph in soup.find_all('div',{'class':'DDP__body-copy'})[0].find_all('p'):
        for element in paragraph.contents:
            if re.search('<[^>]*>', str(element)):
                element = re.sub('<[^>]*>', "", str(element))
                place_desc += element
            else:
                place_desc += str(element)
    scraped['placeDesc'] = place_desc
    
    try:
        scraped['placeShortDesc'] = soup.find_all('h3',{'class':'DDPage__header-dek'})[0].contents[0].replace(u'\xa0', u'')
    except IndexError:
        pass

    nearby = []
    try:
        for nearbies in soup.find_all('div',{'class':'DDPageSiderailRecirc__item-text'}):
            nearby.append(nearbies.find_all('div',{'class':'DDPageSiderailRecirc__item-title'})[0].contents[0])
        scraped['placeNearby'] = nearby
    except IndexError:
        pass
    
    try:
        address = (str(soup.find_all('aside',{'class':'DDPageSiderail__details'})[0]
                           .find_all('address',{'class':'DDPageSiderail__address'})[0]
                           .find_all('div')[0])
                           .split('\n', 1)[0])
        scraped['placeAddress'] = re.sub('<[^>]*>', " ", address)
    except IndexError:
        pass
    
    coordinates = soup.find_all('div',{'class':'DDPageSiderail__coordinates js-copy-coordinates'})[0].contents[2]
    scraped['placeAlt'] = float(coordinates.split()[0][:-1])
    scraped['placeLong'] = float(coordinates.split()[1])


    editorsoup = soup.find_all('a',{'class':'DDPContributorsList__contributor'})
    scraped['placeEditors'] = [stuff.find_all('span')[0].contents[0] 
                               for stuff in editorsoup 
                               if len(stuff.find_all('span')) > 0]
    if not scraped['placeEditors']:
        zzz = soup.find_all('div',{'class':'ugc-editors'})
        flag = 0
        for soupper in zzz:
            if soupper.find_all('h6')[0].contents[0] == 'Added by':
                flag = 1
                break
        try:
            editorsoup = soup.find_all('div',{'class':'ugc-editors'})[flag].find_all('a',{'class':'DDPContributorsList__contributor'})
            scraped['placeEditors'] = [editors.contents[0]
                                       for editors in editorsoup]
        except IndexError:
            pass
            
    try:
        scraped['placePubDate'] = string_to_datetime(soup.find_all('div',{'class':'DDPContributor__name'})[0].contents[0])
    except IndexError:
        pass

    kircher = soup.find_all('div',{'class':'athanasius'})
    for piece in kircher:
        for piecer in piece.find_all('div',{'class':'CardRecircSection__title'}):
            if piecer.contents[0] == 'Related Places':
                scraped['placeRelatedPlaces'] = [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
            elif 'Appears in' in piecer.contents[0]:
                scraped['placeRelatedLists'] =  [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
    
    scraped['placeURL'] = 'https://www.atlasobscura.com/places/' + filename[:-4]
    
    return scraped

Now we have to define the script that goes through each folder and for each folder goes through each downloaded HTML, scrapes information and store them in a new .tsv file.

In [None]:
for page in range(1,401):
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)
    
    for filename in os.listdir(os.getcwd()):
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
        
        new_path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page) + '\\' + filename
        soupper = open(new_path, 'r',encoding="utf-8")
        
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\tsv')
        newer_path = r'C:\\Users\\Leonardo\\ADM_HW3\\tsv\\'+filename[:-4]+'.tsv'
        try:
            infos = darkAtlasScraper(soupper)
        except IndexError:
            print(newer_path)
        with open(newer_path,'w+',encoding="utf-8") as new_file:
            for info in infos.values():
                if type(info) == list:
                    for index in range(len(info)):
                        if index < len(info) - 1:
                            new_file.write(str(info[index])+ ', ')
                        elif index == len(info) - 1:
                            new_file.write(str(info[index]))
                    new_file.write('\t')
                else:
                    new_file.write(str(info))
                    new_file.write('\t')

In [None]:
os.chdir(r'C:\\Users\\Leonardo')

We then decided to build a .csv to collect all the data and to make them always available in a practical format. 

In [None]:
path= r"C:\Users\Leonardo\ADM_HW3\tsv"
final_dataset = []
filenames = os.listdir(path)

for file in filenames:
    if file.endswith('tsv'):
        file_path = os.path.join(path,file)
        try:
            df = pd.read_csv(file_path, sep ="\t", header=None, quoting=3)
            final_dataset.append(df)
        except:
            print(file)
            pass
    
final_dataset = pd.concat(final_dataset)
final_dataset.to_csv('final_dataset.csv', index = False)

In [3]:
path = "final_dataset.csv"

mostPopularPlaces = pd.read_csv(path)
mostPopularPlaces = mostPopularPlaces.iloc[:, :-2]
mostPopularPlaces.columns = ['placeName', 'placeTags', 'numPeopleVisited', 'numPeopleWant', 'placeDesc', 'placeShortDesc', 'placeNearby', 
        'placeAddress', 'placeAlt', 'placeLong', 'placeEditors', 'placePubDate', 'placeRelatedPlaces', 'placeRelatedLists', 'placeURL']

We now have the whole dataset and we are ready to start building the search engines. 

_________________________________________________

## 2. Search Engine

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')                           #Download command on NLTK library to get specific 
nltk.download('stopwords')

In [5]:
def ntlk_analysis(info):
    final_words = []
    tokens = word_tokenize(info.lower())
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    for token in tokens:
        if token not in stop_words and token.encode().isalpha():
            stemming_token = ps.stem(token)
            final_words.append(stemming_token)
            
    return final_words

## 2.1. Conjunctive query

In [None]:
listOfWords = []
for index, row in mostPopularPlaces['placeDesc'].iteritems():
    description = mostPopularPlaces['placeDesc'][index]
    listOfWords.append(ntlk_analysis(description))

In [None]:
mostPopularPlaces['listOfWords'] = listOfWords

In [None]:
mostPopularPlaces['listOfWords'].head(15)

### 2.1.1) Create your index!

In [None]:
vocabulary = {}
term_id = 1
for word in sorted(list(word_occ.keys())):
    vocabulary[word] = term_id
    term_id += 1

with open('vocabulary.txt', 'w') as file:
    for word, term_id in vocabulary.items():
        file.write('%s:%s\n' % (word, term_id))

In [24]:
'''
To open the vocabulary from vocabulary.text use this script
'''
vocabulary = {}
with open('vocabulary.txt', 'r') as file:
    for line in file.readlines():
        word = line.split(':')[0]
        term_id = line.split(':')[1]
        vocabulary[word] = int(term_id)


In [None]:
inverted_index = {}

for index, list_ in mostPopularPlaces['listOfWords'].iteritems():
    for word in list_:
        if word in inverted_index:
            if index not in inverted_index[word]:
                inverted_index[word].append(index)
        else:
            inverted_index[word] = [index]

inverted_index = dict(sorted(inverted_index.items()))
with open('inverted_index1.txt', 'w') as file:
    for word, indexing in inverted_index.items():
        texted_indexing = str(indexing[0]) + ', ' + ', '.join(list(map(str, indexing[1:])))
        file.write('%s:%s\n' % (word, texted_indexing))

In [10]:
'''
To open the vocabulary from vocabulary.text use this script
'''

inverted_index = {}
with open('inverted_index1.txt', 'r') as file:
    for line in file.readlines():
        word = line.split(':')[0]
        indexing = line.split(':')[1].strip('\n')
        indexing = indexing.split(',')
        if indexing[-1] == ' ':
            indexing = indexing[:-1]
        inverted_index[word] = list(map(int, [string.strip() for string in indexing]))

### 2.1.2) Execute the query

In [28]:
def search_match(query,df):
    result = []
    query = ntlk_analysis(query)
    match = {key: [] for key in query}

    for key,list_of_values in inverted_index.items():
        if key in match:
            for value in list_of_values:
                if value not in match[key]:
                    match[key].append(value)
                    
    final_values = list(match.values())
    intersection = set.intersection(*map(set,final_values))
    
    for index in intersection:
        series = df[['placeName', 'placeDesc','placeURL']].loc[index]
        result.append(series)
    
    return pd.DataFrame(result)

In [30]:
first_search_engine = search_match('american museum', mostPopularPlaces)
first_search_engine

Unnamed: 0,placeName,placeDesc,placeURL
1024,Catoctin Furnace,"In the early 1770s, Thomas Johnson discovered ...",https://www.atlasobscura.com/places/catoctin-f...
6657,U-505,One of only four remaining U-boats in the worl...,https://www.atlasobscura.com/places/u-505
1542,Diego Rivera's Detroit Industry,"In a city overflowing with street art, murals,...",https://www.atlasobscura.com/places/diego-rive...
1031,Cave Hill Cemetery,Holding the bodies of a number of influential ...,https://www.atlasobscura.com/places/cave-hill-...
3593,Mansfield Memorial Museum,"Back at the 1939 World’s Fair in New York, Ele...",https://www.atlasobscura.com/places/mansfield-...
...,...,...,...
1002,Castle Post,Looking more like an abandoned Medieval Times ...,https://www.atlasobscura.com/places/castle-post
495,Belgrade Tesla Museum,"In the center of Belgrade, a villa holds the w...",https://www.atlasobscura.com/places/belgrade-t...
2548,Harvard Museum of Natural History,Collecting three different institutions into o...,https://www.atlasobscura.com/places/harvard-mu...
3060,KattenKabinet,The death of a pet can inspire a number of rea...,https://www.atlasobscura.com/places/kattenkabinet


# 2.2. Conjunctive query & Ranking score

We can leverage tfidf method from sklearn, or we can implement the tfidf from scratch. We will implement both solution as exercise, but we'll use *TfdifVectorizer* to focus on tuning the model on the data. 

In [None]:
### word_occ = Counter(reduce(lambda x,y:x+y, mostPopularPlaces.listOfWords))

# 3. Define a new score!


In [91]:
mean1 = round(mostPopularPlaces['numPeopleVisited'].mean(),2)
mean2 = round(mostPopularPlaces['numPeopleWant'].mean(), 2)

In [92]:
print(mean1, mean2)

417.14 910.36


In [93]:
def new_score_popularity(query, df, mean1, mean2, k):
    result = []
    query = ntlk_analysis(query)
    match = {key: [] for key in query}

    for key,list_of_values in inverted_index.items():
        if key in match:
            for value in list_of_values:
                if value not in match[key]:
                    match[key].append(value)


    final_values = list(match.values())
    intersection = set.intersection(*map(set,final_values))

    rank = []
    score = 0
    for idx in intersection:
        score = int(df.loc[idx]['numPeopleVisited'] > mean1) + int(df.loc[idx]['numPeopleWant'] > mean2)
        rank.append((idx,score))
        
    heapq.heapify(rank) 
    rank = (heapq.nlargest(n = k, iterable = rank, key = lambda x:x[1])) 

    for idx,score in rank:
        series = df[['placeName', 'placeDesc','placeURL']].loc[idx]
        series['newScore'] = score
        result.append(series)
    
    return pd.DataFrame(result)
    

In [94]:
second_search_engine = new_score_popularity('american museum', mostPopularPlaces, mean1, mean2, 10)
second_search_engine

Unnamed: 0,placeName,placeDesc,placeURL,newScore
33,A Christmas Story House and Museum,"For many Americans, the 1983 film A Christmas ...",https://www.atlasobscura.com/places/a-christma...,2
392,Baldwin's Book Barn,There are places where history takes longer to...,https://www.atlasobscura.com/places/baldwins-b...,2
193,American Classic Arcade Museum,"Housed inside New Hampshire’s Funspot, which h...",https://www.atlasobscura.com/places/american-c...,2
634,Blue Mustang,The “Blue Mustang” sculpture was created by th...,https://www.atlasobscura.com/places/blue-mustang,2
1051,CDC Museum,"In the 1995 hit film Outbreak, residents of th...",https://www.atlasobscura.com/places/cdc-museum,2
1362,Crazy Horse Memorial,When the carving of Mount Rushmore began in 19...,https://www.atlasobscura.com/places/crazy-hors...,2
2723,Horniman Museum and Gardens,London’s Horniman Museum has been showing off ...,https://www.atlasobscura.com/places/horniman-m...,2
924,Canyons of the Ancients,Ripe for quiet reflection and simply awe-inspi...,https://www.atlasobscura.com/places/canyons-of...,2
1972,Flushing Meadows-Corona Park,"After a long ride from Manhattan, most get off...",https://www.atlasobscura.com/places/flushing-m...,2
586,Biosphere of Montreal,As their contribution to Montreal’s 1967 World...,https://www.atlasobscura.com/places/biosphere-...,2


In [98]:
def new_score_jaccard(query, df, k):
    result = []
    query = ntlk_analysis(query)
    match = {key: [] for key in query}

    for key,list_of_values in inverted_index.items():
        if key in match:
            for value in list_of_values:
                if value not in match[key]:
                    match[key].append(value)


    final_values = list(match.values())
    intersection = set.intersection(*map(set,final_values))

    rank = []
    score = 0
    for idx in intersection:
        if type(df.loc[idx]['placeTags']) == str:
            tags = ntlk_analysis(df.loc[idx]['placeTags'])
            jaccard_idx = len(set(tags).intersection(query))/ len(set(tags).union(query))
            rank.append((idx,jaccard_idx))
        else:
            rank.append((idx, -1))
        
    #rank.sort(key = lambda x:x[1], reverse = True)
    heapq.heapify(rank) 
    rank = (heapq.nlargest(n = k, iterable = rank, key = lambda x:x[1])) 

    for idx,score in rank:
        series = df[['placeName', 'placeDesc','placeURL']].loc[idx]
        series['newScore'] = score
        result.append(series)
    
    return pd.DataFrame(result)

In [99]:
third_search_engine = new_score_jaccard('american museum', mostPopularPlaces,10)
third_search_engine

Unnamed: 0,placeName,placeDesc,placeURL,newScore
1701,East Kong Yick Building at the Wing Luke Museum,"In 1910, 170 early Chinese pioneers pooled the...",https://www.atlasobscura.com/places/east-kong-...,0.5
4142,National World War II Museum,"Perhaps once thought too narrowly focused, th...",https://www.atlasobscura.com/places/national-w...,0.333333
4979,Rock Art Ranch,"Rock Art Ranch, near Winslow, Arizona, is a pr...",https://www.atlasobscura.com/places/rock-art-r...,0.333333
3528,Lunch Box Museum,Lunch boxes bring back a certain sense of nost...,https://www.atlasobscura.com/places/lunch-box-...,0.333333
3734,Metropolitan Pit Stop,Metropolitan Pit Stop was founded by Jimmy Val...,https://www.atlasobscura.com/places/metropolit...,0.333333
3990,Museum of the American Cocktail,They say that New Orleans is the home of the f...,https://www.atlasobscura.com/places/museum-ame...,0.333333
3807,Mitsitam Native Foods Cafe,"A visit to the National Mall in Washington, D....",https://www.atlasobscura.com/places/mitsitam-n...,0.333333
2548,Harvard Museum of Natural History,Collecting three different institutions into o...,https://www.atlasobscura.com/places/harvard-mu...,0.333333
1636,Drayton Hall,Considered one of the most beautiful examples ...,https://www.atlasobscura.com/places/drayton-hall,0.285714
2791,Hugh Mercer Apothecary Shop,"Hugh Mercer was a Scot, a warrior, a friend of...",https://www.atlasobscura.com/places/hugh-merce...,0.285714


In [97]:
mostPopularPlaces.loc[1701]['placeTags']

'museums'