In [26]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import lxml
import numpy as np

import os                                                      #Needed to move between OS folders

import re
from bs4 import BeautifulSoup                                  #Scraper 
import requests                                                #URL drainer

from tqdm import tqdm

from datetime import datetime                                  #To be leveraged to define datetime objects  

from collections import Counter
from functools import reduce

import nltk                                                    #Text processing library
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')                                         #Download command on NLTK library to get specific 
nltk.download('stopwords')

from sklearn.feature_extraction.text import TfidfVectorizer    #Useful already implemented tfidf vectorizer from scikit learn library

from mrjob.job import MRJob
from mrjob.step import MRStep                                  #MapReduce methods to perform the map-shuffle-reduce pattern

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leonardo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonardo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Data collection

## 1.1. Get the list of places


In [None]:
places = []

for i in range(1,401):
    main_url = 'https://www.atlasobscura.com/places?page=' + str(i) +'&sort=likes_count'
    cont = requests.get(main_url)
    soup = BeautifulSoup(cont.text)
    for place in soup.find_all('a', {'class':'content-card content-card-place'}):
        places.append('https://www.atlasobscura.com'+place.get('href'))

f = open('places.txt','w+')

for place in places:
    f.write(place+'\n')

f.close()

_____________

## 1.2 Crawl places

In [None]:
f = open('places.txt','r')

lines = f.readlines()

dic = {}
for i in range(0,7200,18):
    dic[1+i//18] = lines[i:i+18]

for page in range(0,401):
    try:
        os.mkdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
    except FileExistsError:
        pass
    
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)

    for place in dic[page]:
        place_name = place[36:len(place)-1]
        vanilla = requests.get(place[:-1],allow_redirects=False,headers = {'User-agent': 'your bot 0.1'})
        
        with open(place_name+".txt",'w+',encoding="utf-8") as new_file:
            new_file.write(vanilla.text)

os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3')

____________

## 1.3 Parse downloaded pages

We need to define a script to extract useful information from each HTML we collected. In particular we want these information:
1. Place Name (to save as $placeName$): String.
2. Place Tags (to save as $placeTags$): List of Strings.
3. Number of people who have been there (to save as $numPeopleVisited$): Integer.
4. Number of people who want to visit the place(to save as $numPeopleWant$): Integer.
5. Description (to save as $placeDesc$): String. Everything from under the first image up to "know before you go" (orange frame on the example image).
6. Short Description (to save as $placeShortDesc$): String. Everything from the title and location up to the image (blue frame on the example image).
7. Nearby Places (to save as $placeNearby$): Extract the names of all nearby places, but only keep unique values: List of Strings.
8. Address of the place(to save as $placeAddress$): String.
9. Altitude and Longitude of the place's location(to save as $placeAlt$ and $placeLong$): Integers
10. The username of the post editors (to save as $placeEditors$): List of Strings.
11. Post publishing date (to save as $placePubDate$): datetime.
12. The names of the lists that the place was included in (to save as $placeRelatedLists$): List of Strings.
13. The names of the related places (to save as $placeRelatedPlaces$): List of Strings.
14. The URL of the page of the place (to save as $placeURL$):String

We leverage *BeautifulSoup library* to scrape the information, but we need just an additional method to convert into a datetime object the post publishing date since it was a string in the format 'Month Day, Year'. For example we found 'May 8, 2010' for the very first link, and instead we wanted '2010/05/08'. This method does so:

In [2]:
def string_to_datetime(string):
    return str(datetime.strptime(string, '%B %d, %Y').date())

We define a function that builds a dictionary of information for every place we go through: then from this dictionary we'll buil a .tsv for every HTML document we gathered.

In [44]:
def darkAtlasScraper(text):
    
    soup = BeautifulSoup(text)
    
    scraped = {'placeName': 'NaN',
               'placeTags': 'NaN',
               'numPeopleVisited': 'NaN',
               'numPeopleWant': 'NaN',
               'placeDesc': 'NaN',
               'placeShortDesc':'Nan',
               'placeNearby': 'NaN',
               'placeAddress': 'NaN',
               'placeAlt': 'NaN',
               'placeLong': 'NaN',
               'placeEditors': 'NaN',
               'placePubDate': 'NaN',
               'placeRelatedPlaces': 'NaN',
               'placeRelatedLists': 'NaN',
               'placeURL': 'NaN'}          
    
    try:
        scraped['placeName'] = soup.find_all('h1',{'class':'DDPage__header-title'})[0].contents[0]
    except IndexError:
        pass
           
    try:
        scraped['placeTags'] = list(map(lambda s:s.strip(),
                                        [tag.contents[0] for tag in soup.find_all('a',{'class':'itemTags__link js-item-tags-link'})]))
    except IndexError:
        pass
    
    
    counters = soup.find_all('div',{'class':'title-md item-action-count'})
    try:
        scraped['numPeopleVisited'] = int(counters[0].contents[0])
    except IndexError:
        pass
    try:
        scraped['numPeopleWant'] = int(counters[1].contents[0])
    except IndexError:
        pass
    

    place_desc = ''
    for paragraph in soup.find_all('div',{'class':'DDP__body-copy'})[0].find_all('p'):
        for element in paragraph.contents:
            if re.search('<[^>]*>', str(element)):
                element = re.sub('<[^>]*>', "", str(element))
                place_desc += element
            else:
                place_desc += str(element)
    scraped['placeDesc'] = place_desc
    
    try:
        scraped['placeShortDesc'] = soup.find_all('h3',{'class':'DDPage__header-dek'})[0].contents[0].replace(u'\xa0', u'')
    except IndexError:
        pass

    nearby = []
    try:
        for nearbies in soup.find_all('div',{'class':'DDPageSiderailRecirc__item-text'}):
            nearby.append(nearbies.find_all('div',{'class':'DDPageSiderailRecirc__item-title'})[0].contents[0])
        scraped['placeNearby'] = nearby
    except IndexError:
        pass
    
    try:
        address = (str(soup.find_all('aside',{'class':'DDPageSiderail__details'})[0]
                           .find_all('address',{'class':'DDPageSiderail__address'})[0]
                           .find_all('div')[0])
                           .split('\n', 1)[0])
        scraped['placeAddress'] = re.sub('<[^>]*>', " ", address)
    except IndexError:
        pass
    
    coordinates = soup.find_all('div',{'class':'DDPageSiderail__coordinates js-copy-coordinates'})[0].contents[2]
    scraped['placeAlt'] = float(coordinates.split()[0][:-1])
    scraped['placeLong'] = float(coordinates.split()[1])


    editorsoup = soup.find_all('a',{'class':'DDPContributorsList__contributor'})
    scraped['placeEditors'] = [stuff.find_all('span')[0].contents[0] 
                               for stuff in editorsoup 
                               if len(stuff.find_all('span')) > 0]
    if not scraped['placeEditors']:
        zzz = soup.find_all('div',{'class':'ugc-editors'})
        flag = 0
        for soupper in zzz:
            if soupper.find_all('h6')[0].contents[0] == 'Added by':
                flag = 1
                break
        try:
            editorsoup = soup.find_all('div',{'class':'ugc-editors'})[flag].find_all('a',{'class':'DDPContributorsList__contributor'})
            scraped['placeEditors'] = [editors.contents[0]
                                       for editors in editorsoup]
        except IndexError:
            pass
            
    try:
        scraped['placePubDate'] = string_to_datetime(soup.find_all('div',{'class':'DDPContributor__name'})[0].contents[0])
    except IndexError:
        pass

    kircher = soup.find_all('div',{'class':'athanasius'})
    for piece in kircher:
        for piecer in piece.find_all('div',{'class':'CardRecircSection__title'}):
            if piecer.contents[0] == 'Related Places':
                scraped['placeRelatedPlaces'] = [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
            elif 'Appears in' in piecer.contents[0]:
                scraped['placeRelatedLists'] =  [re.sub('<[^>]*>', "", str(chunk.contents[1])) 
                                                 for chunk in piece.find_all('h3',{'class':'Card__heading --content-card-v2-title js-title-content'})]
    
    scraped['placeURL'] = 'https://www.atlasobscura.com/places/' + filename[:-4]
    
    return scraped

Now we have to define the script that goes through each folder and for each folder goes through each downloaded HTML, scrapes information and store them in a new .tsv file.

In [45]:
for page in range(1,401):
    path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page)
    os.chdir(path)
    
    for filename in os.listdir(os.getcwd()):
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page))
        
        new_path = r'C:\\Users\\Leonardo\\ADM_HW3\\page ' + str(page) + '\\' + filename
        soupper = open(new_path, 'r',encoding="utf-8")
        
        os.chdir(r'C:\\Users\\Leonardo\\ADM_HW3\\tsv')
        newer_path = r'C:\\Users\\Leonardo\\ADM_HW3\\tsv\\'+filename[:-4]+'.tsv'
        try:
            infos = darkAtlasScraper(soupper)
        except IndexError:
            print(newer_path)
        with open(newer_path,'w+',encoding="utf-8") as new_file:
            for info in infos.values():
                if type(info) == list:
                    for index in range(len(info)):
                        if index < len(info) - 1:
                            new_file.write(str(info[index])+ ', ')
                        elif index == len(info) - 1:
                            new_file.write(str(info[index]))
                    new_file.write('\t')
                else:
                    new_file.write(str(info))
                    new_file.write('\t')

In [None]:
os.chdir(r'C:\\Users\\Leonardo')

We then decided to build a .csv to collect all the data and to make them always available in a practical format. 

In [29]:
path= r"C:\Users\Leonardo\ADM_HW3\tsv"
final_dataset = []
filenames = os.listdir(path)

for file in filenames:
    if file.endswith('tsv'):
        file_path = os.path.join(path,file)
        try:
            df = pd.read_csv(file_path, sep ="\t", header=None, quoting=3)
            final_dataset.append(df)
        except:
            print(file)
            pass
    
final_dataset = pd.concat(final_dataset)
final_dataset.to_csv('final_dataset.csv', index = False)

In [5]:
path = r"C:\Users\Leonardo\Documents\GitHub\3HW-ADM-Fabri.Dinino.Aur\final_dataset.csv"

mostPopularPlaces = pd.read_csv(path)
mostPopularPlaces = mostPopularPlaces.iloc[:, :-2]
mostPopularPlaces.columns = ['placeName', 'placeTags', 'numPeopleVisited', 'numPeopleWant', 'placeDesc', 'placeShortDesc', 'placeNearby', 
        'placeAddress', 'placeAlt', 'placeLong', 'placeEditors', 'placePubDate', 'placeRelatedPlaces', 'placeRelatedLists', 'placeURL']

We now have the whole dataset and we are ready to start building the search engines. 

_________________________________________________

## 2. Search Engine

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('punkt')                           #Download command on NLTK library to get specific 
nltk.download('stopwords')

In [13]:
def ntlk_analysis(info):
    final_words = []
    tokens = word_tokenize(info.lower())
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()
    for token in tokens:
        if token not in stop_words and token.encode().isalpha():
            stemming_token = ps.stem(token)
            final_words.append(stemming_token)
            
    return final_words

## 2.1. Conjunctive query

In [16]:
listOfWords = []
for index, row in mostPopularPlaces['placeDesc'].iteritems():
    description = mostPopularPlaces['placeDesc'][index]
    listOfWords.append(ntlk_analysis(description))

In [17]:
mostPopularPlaces['listOfWords'] = listOfWords

In [18]:
mostPopularPlaces['listOfWords'].head(15)

0     [need, drop, research, facil, exist, address, ...
1     [walk, street, forgiven, think, parisian, buil...
2     [outsid, bluff, utah, massiv, alcov, loom, san...
3     [receiv, fanfar, today, small, town, aurora, t...
4     [nestl, ground, floor, unesco, world, heritag,...
5     [amidst, glut, shop, restaur, make, san, diego...
6     [loui, flight, cage, built, world, fair, year,...
7     [commerci, hub, excit, new, way, travel, art, ...
8     [home, futur, past, featur, aqua, tile, bathro...
9     [pope, john, paul, ii, gift, use, santi, vince...
10    [beeton, christma, annual, huge, popular, vict...
11    [updat, hour, church, elvi, longer, physic, lo...
12    [could, quit, possibl, frivol, yet, excit, adv...
13    [els, new, york, citi, would, find, stunningli...
14    [airplan, aficionado, rejoic, heaven, found, g...
Name: listOfWords, dtype: object

### 2.1.1) Create your index!

In [49]:
vocabulary = {}
term_id = 1
for word in sorted(list(word_occ.keys())):
    vocabulary[word] = term_id
    term_id += 1

with open('vocabulary.txt', 'w') as file:
    for word, term_id in vocabulary.items():
        file.write('%s:%s\n' % (word, term_id))


In [80]:
'''
To open the vocabulary from vocabulary.text use this script

vocabulary = {}
with open('vocabulary.txt', 'r') as file:
    for line in file.readlines():
        word = line.split(':')[0]
        term_id = line.split(':')[1]
        vocabulary[word] = int(term_id)
'''

In [75]:
inverted_index = {}

for index, list_ in mostPopularPlaces['listOfWords'].iteritems():
    for word in list_:
        if word in inverted_index:
            if index not in inverted_index[word]:
                inverted_index[word].append(index)
        else:
            inverted_index[word] = [index]

inverted_index = dict(sorted(inverted_index.items()))
with open('inverted_index1.txt', 'w') as file:
    for word, indexing in inverted_index.items():
        texted_indexing = str(indexing[0]) + ', ' + ', '.join(list(map(str, indexing[1:])))
        file.write('%s:%s\n' % (word, texted_indexing))

In [98]:
'''
To open the inverted_index from file use this script
inverted_index = {}
with open('inverted_index1.txt', 'r') as file:
    for line in file.readlines():
        word = line.split(':')[0]
        indexing = line.split(':')[1].strip('\n')
        if indexing[:-1] == '':
            indexing.pop()
        inverted_index[word] = indexing.split(', ')
'''

In [99]:
inverted_index

{'aa': ['5229', '6710'],
 'aaa': ['3554', ''],
 'aaf': ['39', ''],
 'aak': ['3709', ''],
 'aalborg': ['3411', ''],
 'aan': ['552', ''],
 'aarhu': ['7170', ''],
 'aaron': ['1099',
  '2500',
  '2872',
  '3586',
  '5021',
  '5379',
  '6023',
  '6159',
  '6819',
  '6923'],
 'aarti': ['416', ''],
 'ab': ['4226', ''],
 'aback': ['3986', ''],
 'abakanowicz': ['2444', ''],
 'abalon': ['2798', '4250'],
 'abandn': ['5526', ''],
 'abandon': ['21',
  '23',
  '24',
  '40',
  '41',
  '42',
  '43',
  '44',
  '45',
  '46',
  '47',
  '48',
  '49',
  '50',
  '51',
  '52',
  '53',
  '54',
  '55',
  '56',
  '57',
  '58',
  '59',
  '60',
  '61',
  '62',
  '63',
  '65',
  '66',
  '67',
  '70',
  '71',
  '72',
  '73',
  '75',
  '76',
  '87',
  '94',
  '101',
  '106',
  '113',
  '115',
  '148',
  '151',
  '152',
  '195',
  '235',
  '239',
  '243',
  '274',
  '280',
  '283',
  '287',
  '291',
  '309',
  '332',
  '333',
  '363',
  '369',
  '371',
  '372',
  '381',
  '393',
  '394',
  '411',
  '412',
  '441',
  

### 2.1.2) Execute the query

In [41]:
def search_match(query,df):
    result = []
    query = ntlk_analysis(query)
    match = {key: [] for key in query}

    for key,list_of_values in vocabolary.items():
        if key in match:
            for value in list_of_values:
                if value not in match[key]:
                    match[key].append(value)
                    
    final_values = list(match.values())
    intersection = set.intersection(*map(set,final_values))
    
    for index in intersection:
        series = df[['placeName', 'placeDesc','placeURL']].loc[index]
        result.append(series)
    
    return pd.DataFrame(result)

In [44]:
first_search_engine = search_match('monster', mostPopularPlaces)
first_search_engine

Unnamed: 0,placeName,placeDesc,placeURL
1272,Colossal Squid,"Te Papa is New Zealand’s national museum, and ...",https://www.atlasobscura.com/places/colossal-s...
1537,Rattlesnake Bridge,"Tucson, Arizona, is a city with a devotion to ...",https://www.atlasobscura.com/places/diamondbac...
256,Ape Canyon,Ape Canyon is a narrowing gorge sitting just t...,https://www.atlasobscura.com/places/ape-canyon
3840,Monster Kabinett,The Monster Kabinett is a wonderland (so to sp...,https://www.atlasobscura.com/places/monster-ka...
6402,The Wichita Troll,"Without a plaque or other indicator, the hidde...",https://www.atlasobscura.com/places/the-wichit...
...,...,...,...
3065,Kawaii Monster Cafe,"If anyone is responsible for the extravagant, ...",https://www.atlasobscura.com/places/kawaii-mon...
6650,Two Bit Circus Micro-Amusement Park,Step right up! Multidisciplinary tech collecti...,https://www.atlasobscura.com/places/two-bit-ci...
6651,Two Guns,It has all the workings of a modern-day Hamlet...,https://www.atlasobscura.com/places/two-guns
3324,Le Boudoir,"It started in 1755, when Marie Antoinette, who...",https://www.atlasobscura.com/places/le-boudoir...


# 2.2. Conjunctive query & Ranking score

We can leverage tfidf method from sklearn, or we can implement the tfidf from scratch. We will implement both solution as exercise, but we'll use *TfdifVectorizer* to focus on tuning the model on the data. 

In [None]:
### word_occ = Counter(reduce(lambda x,y:x+y, mostPopularPlaces.listOfWords))