In [116]:
from bs4 import BeautifulSoup
import numpy as np
from pandarallel import pandarallel
import pandas as pd
import re
import requests

# SCRAPING ALL THE LANDMARKS IN WASHINGTON STATE

In [117]:
landmark_data = pd.read_csv("../data/Google Landmarks Dataset/train_label_to_category.csv")
landmark_data

Unnamed: 0,landmark_id,category
0,0,http://commons.wikimedia.org/wiki/Category:Hap...
1,1,http://commons.wikimedia.org/wiki/Category:Lui...
2,2,http://commons.wikimedia.org/wiki/Category:Gra...
3,3,http://commons.wikimedia.org/wiki/Category:Twe...
4,4,http://commons.wikimedia.org/wiki/Category:San...
...,...,...
203089,203089,http://commons.wikimedia.org/wiki/Category:Isa...
203090,203090,http://commons.wikimedia.org/wiki/Category:Mar...
203091,203091,http://commons.wikimedia.org/wiki/Category:Sil...
203092,203092,http://commons.wikimedia.org/wiki/Category:Hoa_Lu


In [212]:
def dms_to_dd(degrees=0, minutes=0, seconds=0, direction='N'):
    dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60);
    if direction == 'W' or direction == 'S':
        dd *= -1
    return dd

def parse_dms(dms):
    if dms is None:
        return None

    pattern = r"""([+-]?\d{1,3})[°\s]*([\d]{1,2})?[′'´\s]*([\d]{1,2}(?:\.\d+)?)?[″"\s]*([NSEWnsew]?)"""
    
    match = re.match(pattern, dms.strip())
    if not match:
        return None
        #raise ValueError(f"Invalid DMS format: {dms}")

    degrees, minutes, seconds, direction = match.groups()
    
    # Convert to decimal degrees
    degrees = float(degrees) if degrees else 0
    minutes = float(minutes) if minutes else 0
    seconds = float(seconds) if seconds else 0 

    coords_dd = dms_to_dd(degrees, minutes, seconds, direction)

    return coords_dd

In [184]:
def get_soup_data(landmark_url):
    try:
        html_text = requests.get(landmark_url)
        soup = BeautifulSoup(html_text.content, "html.parser")
    
        url_if_redirected = soup.find("div", class_ = "category-redirect-header")
    
        if (url_if_redirected):
            redirected_url_extension = url_if_redirected.find("a")['href']
            base_url = landmark_url.split("/w")[0]
            landmark_url = base_url + redirected_url_extension
    
            html_text = requests.get(landmark_url)
            soup = BeautifulSoup(html_text.content, "html.parser")

        return (landmark_url, soup)
    except Exception as e:
        print(e)
        return (landmark_url, None)

In [185]:
def get_landmark_name(landmark_url):
    title = landmark_url.split("Category:")[1].replace("_", " ")

    return title

def get_supercategory_from_soup(soup):
    supercategory = None
    supercategory_location = soup.find(string = "Instance of")
    
    if supercategory_location:
        supercategory_tag = supercategory_location.next_element
        supercategory = [supercategory for supercategory in supercategory_tag.stripped_strings][0]

    return supercategory

def get_location_address_from_soup(soup):
    location_address = None
    location = soup.find(string = "Location")
    
    if location:
        location_text = location.next_element
        location_address = [
            location.replace(",", "").strip()
            for location
            in location_text.stripped_strings
            if location.replace(",", "").strip() 
        ]

        SEPARATOR = ", "
        location_address = SEPARATOR.join(location_address)

    return location_address

def get_location_coords_from_soup(soup):
    latitude, longitude = None, None
    coords = [location for location in soup.find_all("a", class_ = "external text") if "geohack.tool" in location["href"]]
    coords = [coords for coords in coords if coords.text.find(",") != -1]
    
    if len(coords) != 0:
        coords_expanded = coords[0].text.replace("\xa0", "").split(",")

        try:
            latitude = coords_expanded[0].strip()
            longitude = coords_expanded[1].strip()
        except:
            pass

    return (latitude, longitude)

In [186]:
def get_landmark_data(landmark_url):
    url, soup = get_soup_data(landmark_url)

    title = get_landmark_name(url)
    supercategory = get_supercategory_from_soup(soup)
    location_address = get_location_address_from_soup(soup)
    latitude, longitude = get_location_coords_from_soup(soup)

    return (title, supercategory, location_address, latitude, longitude)

In [187]:
%%time

pandarallel.initialize()
landmark_data_soup = landmark_data.parallel_apply(lambda row: get_landmark_data(row['category']), axis='columns', result_type='expand')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
CPU times: user 560 ms, sys: 456 ms, total: 1.02 s
Wall time: 2h 5min 47s


In [188]:
landmark_data_soup = landmark_data_soup.rename(columns={
    0: 'name',
    1: 'supercategory',
    2: 'location',
    3: 'latitude',
    4: 'longitude'
})
landmark_data_soup

Unnamed: 0,name,supercategory,location,latitude,longitude
0,Happy Valley Racecourse,horse racing venue,"Happy Valley, Wan Chai District, Hong Kong, PRC",22°16′22.08″N,114°10′55.2″E
1,Luitpoldpark (M%C3%BCnchen),park,"Munich, Upper Bavaria, Bavaria, Germany",48°10′17.38″N,11°34′10.83″E
2,Grand Ventron,mountain,"Kruth, Cornimont, Ventron, France",47°57′34″N,6°55′32″E
3,"Tweed Heads, New South Wales",town,"New South Wales, Tweed Shire, Australia",28°10′59.88″S,153°33′00″E
4,Santa Maria Immacolata della Concezione (Rome),church building,"Rome, Metropolitan City of Rome, Lazio, Italy",41°53′45.3″N,12°30′01.8″E
...,...,...,...,...,...
203089,Isabel Pass,,,,
203090,Mare %C3%A0 Joncs,waterhole,"Cilaos, Réunion, South Indian Ocean Defense an...",21°08′02.04″S,55°28′36.12″E
203091,Silesian Beskids Landscape Park,landscape park in Poland,"Silesian Voivodeship, Poland",49°43′00″N,18°54′00″E
203092,Hoa Lu,ancient city,"Ninh Bình, Vietnam",20°15′19″N,105°57′15″E


In [191]:
landmark_data_soup['latitude'] = landmark_data_soup['latitude'].map(parse_dms)
landmark_data_soup['longitude'] = landmark_data_soup['longitude'].map(parse_dms)

In [192]:
landmark_full_info = pd.concat([landmark_data, landmark_data_soup], axis='columns')

In [193]:
landmark_full_info

Unnamed: 0,landmark_id,category,name,supercategory,location,latitude,longitude
0,0,http://commons.wikimedia.org/wiki/Category:Hap...,Happy Valley Racecourse,horse racing venue,"Happy Valley, Wan Chai District, Hong Kong, PRC",22.272800,114.182000
1,1,http://commons.wikimedia.org/wiki/Category:Lui...,Luitpoldpark (M%C3%BCnchen),park,"Munich, Upper Bavaria, Bavaria, Germany",48.171494,11.569675
2,2,http://commons.wikimedia.org/wiki/Category:Gra...,Grand Ventron,mountain,"Kruth, Cornimont, Ventron, France",47.959444,6.925556
3,3,http://commons.wikimedia.org/wiki/Category:Twe...,"Tweed Heads, New South Wales",town,"New South Wales, Tweed Shire, Australia",-28.183300,153.550000
4,4,http://commons.wikimedia.org/wiki/Category:San...,Santa Maria Immacolata della Concezione (Rome),church building,"Rome, Metropolitan City of Rome, Lazio, Italy",41.895917,12.500500
...,...,...,...,...,...,...,...
203089,203089,http://commons.wikimedia.org/wiki/Category:Isa...,Isabel Pass,,,,
203090,203090,http://commons.wikimedia.org/wiki/Category:Mar...,Mare %C3%A0 Joncs,waterhole,"Cilaos, Réunion, South Indian Ocean Defense an...",-21.133900,55.476700
203091,203091,http://commons.wikimedia.org/wiki/Category:Sil...,Silesian Beskids Landscape Park,landscape park in Poland,"Silesian Voivodeship, Poland",49.716667,18.900000
203092,203092,http://commons.wikimedia.org/wiki/Category:Hoa_Lu,Hoa Lu,ancient city,"Ninh Bình, Vietnam",20.255278,105.954167


In [242]:
landmark_full_info_notna = landmark_full_info.loc[landmark_full_info['location'].notna()]

In [290]:
# initial filter to remove non-washington-state landmarks
landmark_washington = landmark_full_info_notna.loc[
    (landmark_full_info_notna['location'].str.contains('washington', case = False)) & 
    (~landmark_full_info_notna['location'].str.contains('washington d.c.', case = False)) & 
    (~landmark_full_info_notna['location'].str.contains('washington county', case = False))
]

In [291]:
# 1 degree wiggle room
# accounts for landmarks that go through the border of Washington state
VARIANCE_CONSTANT = 1.0

wa_lat1 = parse_dms("45° 33′ N") - VARIANCE_CONSTANT
wa_lat2 = parse_dms("49°N") + VARIANCE_CONSTANT

wa_long1 = parse_dms("124°46′ W") - VARIANCE_CONSTANT
wa_long2 = parse_dms("116°55′ W") + VARIANCE_CONSTANT

# sometimes, a few places in Washington State don't have coordinates on wikimedia
# the first clause is a "just in cause" one
landmark_washington = landmark_washington.loc[
    (landmark_washington['location'].str.contains('Washington, Pacific Northwest', case = False)) |
    (
        (landmark_washington['latitude'].between(wa_lat1, wa_lat2)) &
        (landmark_washington['longitude'].between(wa_long1, wa_long2))
    )
]

In [292]:
# reorder the columns
landmark_washington = landmark_washington[['landmark_id', 'name', 'supercategory', 'location', 'latitude', 'longitude', 'category']]

In [293]:
# utf-8-sig requires because some locations contain non-ASCII characters
# e.g. accent marks
landmark_washington.to_csv("../data/landmarks_washington_full.csv", index = False, encoding='utf-8-sig')

# GETTING ALL THE WASHINGTON LANDMARKS THAT ARE CLEAN

In [294]:
landmark_images = pd.read_csv("../data/Google Landmarks Dataset/train.csv")
landmark_images.head()

Unnamed: 0,id,url,landmark_id
0,6e158a47eb2ca3f6,https://upload.wikimedia.org/wikipedia/commons...,142820
1,202cd79556f30760,http://upload.wikimedia.org/wikipedia/commons/...,104169
2,3ad87684c99c06e1,http://upload.wikimedia.org/wikipedia/commons/...,37914
3,e7f70e9c61e66af3,https://upload.wikimedia.org/wikipedia/commons...,102140
4,4072182eddd0100e,https://upload.wikimedia.org/wikipedia/commons...,2474


In [303]:
landmark_cleaned_images = pd.read_csv("../data/Google Landmarks Dataset/train_clean.csv")
landmark_cleaned_images.head()

Unnamed: 0,landmark_id,images
0,1,17660ef415d37059 92b6290d571448f6 cd41bf948edc...
1,7,25c9dfc7ea69838d 28b13f94a6f1f3c1 307d6584f473...
2,9,0193b65bb58d2c77 1a30a51a287ecf69 1f4e8ab1f1b2...
3,11,1a6cb1deed46bb17 1cc2c8fbc83e1a0c 2361b8da868c...
4,12,0a199c97c382b1ff 1492a5d344495391 290097bd36a6...


In [309]:
landmark_cleaned_images['images'] = landmark_cleaned_images['images'].str.split(" ")
landmark_cleaned_images = landmark_cleaned_images.explode('images').reset_index(drop=True)
landmark_cleaned_images.head()

Unnamed: 0,landmark_id,images
0,1,17660ef415d37059
1,1,92b6290d571448f6
2,1,cd41bf948edc0340
3,1,fb09f1e98c6d2f70
4,7,25c9dfc7ea69838d


In [320]:
landmark_washington_ids = landmark_washington['landmark_id'].unique()

landmark_washington_images = landmark_images.loc[
    landmark_images['landmark_id'].isin(landmark_washington_ids)
]

landmark_washington_cleaned_images = landmark_washington_images.loc[
    landmark_washington_images['id'].isin(landmark_cleaned_images['images'])
]

landmark_washington_cleaned_images = landmark_washington_cleaned_images.sort_values('landmark_id')
landmark_washington_cleaned_images = landmark_washington_cleaned_images.reset_index(drop=True)
landmark_washington_cleaned_images = landmark_washington_cleaned_images.rename(columns = {
    'id': 'image_id'
})
landmark_washington_cleaned_images = landmark_washington_cleaned_images[['landmark_id', 'image_id', 'url']]
landmark_washington_cleaned_images.head()

Unnamed: 0,landmark_id,image_id,url
0,262,172999b2cc578a66,https://upload.wikimedia.org/wikipedia/commons...
1,262,7c9eb6d53e98e77e,https://upload.wikimedia.org/wikipedia/commons...
2,262,b897ea4300767588,https://upload.wikimedia.org/wikipedia/commons...
3,262,9981810cd64b2e5b,https://upload.wikimedia.org/wikipedia/commons...
4,262,ed77d4f562272cde,https://upload.wikimedia.org/wikipedia/commons...


In [321]:
landmark_washington_cleaned_images.to_csv("../data/landmarks_washington_clean_images.csv", index = False, encoding='utf-8-sig')