In [1]:
#Import dependancies for the script
import requests as r #make web requests
from bs4 import BeautifulSoup as bs #Parses HTML so it is easier to work with
import pandas as pd #creates dataFrame objects to view large quantities of data

In [2]:
soups = [] #array to hold parsed HTML pages

#The actual number of pages is 1560, but that will take ages to run.
#Changing the value below will vary how many pages are scraped for contact info.
#For each page scraped here, the details of 5 embassies are discovered.
numberOfPages = 2
for x in range(1, numberOfPages):
    url = "http://www.embassy-worldwide.com/embassy/page/" + str(x)
    page = r.get(url)
    soup = bs(page.content, 'html.parser')
    soups.append(soup)
    #print("Getting " + url)

In [3]:
elems = []
links = []


#This bit goes through all the pages scraped and finds the links to individial embassy pages, where the details are
for s in range(len(soups)):
    localElems = soups[s].find_all('h2', class_='entry-title')
    elems += localElems

for elem in elems:
    link = elem.find('a')['href']
    links.append(link)
    
    #uncomment these for testing to visualise progress, slows down the process though
    #print(link)
    
#print(links)

In [4]:
pages = []

#same as above but for the individual embassy pages
for link in links:
    url = link
    page = r.get(url)
    soup = bs(page.content, 'html.parser')
    pages.append(soup)
    #uncomment when testing to show progress, but significantly slows the script down
    #print("Getting: " + url)
    
#print(pages)

In [5]:
#The emails on this site are stored with a very basic encryption, to prevent exactly what we're doing here.
#The encryption on the site is decrypted at load-time with javascript running locally on the browser.

#Because the scraper will not load and run the javascript,
#this function decrypts the email addresses after they are grabbed.

def decodeEmail(e):
    de = ""
    k = int(e[:2], 16)

    for i in range(2, len(e)-1, 2):
        de += chr(int(e[i:i+2], 16)^k)

    return de

In [6]:
dataSet = []

#this bit goes through each embassy page, locates the important data, and stores it to an array.
#Each of those arrays is then appended to a larger 2 Dimensional array which will store all the data.
for i in range(len(pages)):
    embassyData = []
    
    title = pages[i].find('h1', class_='emb-single-title').text
    embassyData.append(title)
    
    embassyDataList = pages[i].find('ul', class_='embassy-data').find_all('li')
    
    for x in range(len(embassyDataList)):     
        out = ""
        if embassyDataList[x].get('id') == 'email':
            out += decodeEmail(embassyDataList[x].find('a').get('data-cfemail'))
        else:
            out += embassyDataList[x].text[len(embassyDataList[x].find('span').text):]
        embassyData.append(out)        
    
    dataSet.append(embassyData)

#uncomment to print the raw 2D array with all the data
#not reccomended with large sample sizes as will be a hige wall of text, and will make the script lag

#print(dataSet)


In [7]:
#convert 2D array to dataFrame for nicer presentation and processing, and give the columns names
df = pd.DataFrame(dataSet)
df.columns=['NAME', 'ADDRESS', 'CITY', 'EMAIL', 'FAX', 'PHONE', 'WEBSITE']
df

Unnamed: 0,NAME,ADDRESS,CITY,EMAIL,FAX,PHONE,WEBSITE
0,"Consulate of Singapore in New York, USA","318 East 48th St, New York, NY 10017",New York,singcon_nyc@mfa.sg,212-826-5028,212-223-3331,http://www.mfa.gov.sg/newyork-consul
1,"New Zealand Honorary Consulate in Bountiful, U...",1655 Linden Lane Bountiful UT 84010,Bountiful,Iain.mckay1@hotmail.com,+1 801 296 2494,,
2,"New Zealand Consulate-General New York, United...","41st Floor, 295 Madison Ave Manhattan 10017 Ne...",New York,+1 212 832 4038,,,
3,"New Zealand Consulate-General Los Angeles, Uni...","Suite 600E, 2425 Olympic Blvd Los Angeles CA 9...",Los Angeles,contact@nzcgla.com,+1 310 566 6556,+1 310 566 6555,
4,"New Zealand Honorary Consulate Sacramento, Uni...",44743 N El Macero Dr El Macero CA 95618,El Macero,starrned@msn.com,+1 530 756 7032,+1 530 756 8013,
5,"New Zealand Honorary Consulate Portland, Unite...","PMB #481 25 NW 23rd Place, Suite 6 Portland OR...",Portland,cjs@theswindells.org,+1 503 803 7129,,
6,"New Zealand Honorary Consulate Florida, United...",PO Box 98 West Palm Beach FL 33402,West Palm Beach,nzconsulateflorida@gmail.com,+1 (561) 899 9719,,
7,"New Zealand Honorary Consulate Vermont, United...",211 Ordway Shore Rd Shelburne VT 05482,Shelburne,georgeburrill@me.com,+1 802 489 5677,,
8,"New Zealand Honorary Consulate Houston, United...",3300 N Sam Houston Pkwy E Houston TX 77032,Houston,connelly@nzhonoraryconsul.org,+1 713 501 5418,,
9,"New Zealand Honorary Consulate Boston, United ...",57 N Main St Concord NH 03302,Boston,nzconsulate@preti.com,+603 226 263 7,+603 225 822 8,


In [8]:
#Some of the pages do not display an email, and this messes up the script providing incorrect values for the columns.
#Code below sanitises the output so that only valid emails remain in the dataFrame.

#rows with invalid email addresses are dropped, and that is reported to the console.
#Shouldn't be too many so shouldnt have a large performance effect, but commenting out the print statements will speed it up.

invalidEmails = []
for x in range(len(df['EMAIL'])):
    if ("@" in df['EMAIL'][x]) == False:
        print("Invalid email address detected: " + df['EMAIL'][x])
        print("Dropping row " + str(x))
        invalidEmails.append(x)

for invEm in invalidEmails:
    df = df.drop(index=invEm)
    
df
        
    

Invalid email address detected: +1 212 832 4038
Dropping row 2


Unnamed: 0,NAME,ADDRESS,CITY,EMAIL,FAX,PHONE,WEBSITE
0,"Consulate of Singapore in New York, USA","318 East 48th St, New York, NY 10017",New York,singcon_nyc@mfa.sg,212-826-5028,212-223-3331,http://www.mfa.gov.sg/newyork-consul
1,"New Zealand Honorary Consulate in Bountiful, U...",1655 Linden Lane Bountiful UT 84010,Bountiful,Iain.mckay1@hotmail.com,+1 801 296 2494,,
3,"New Zealand Consulate-General Los Angeles, Uni...","Suite 600E, 2425 Olympic Blvd Los Angeles CA 9...",Los Angeles,contact@nzcgla.com,+1 310 566 6556,+1 310 566 6555,
4,"New Zealand Honorary Consulate Sacramento, Uni...",44743 N El Macero Dr El Macero CA 95618,El Macero,starrned@msn.com,+1 530 756 7032,+1 530 756 8013,
5,"New Zealand Honorary Consulate Portland, Unite...","PMB #481 25 NW 23rd Place, Suite 6 Portland OR...",Portland,cjs@theswindells.org,+1 503 803 7129,,
6,"New Zealand Honorary Consulate Florida, United...",PO Box 98 West Palm Beach FL 33402,West Palm Beach,nzconsulateflorida@gmail.com,+1 (561) 899 9719,,
7,"New Zealand Honorary Consulate Vermont, United...",211 Ordway Shore Rd Shelburne VT 05482,Shelburne,georgeburrill@me.com,+1 802 489 5677,,
8,"New Zealand Honorary Consulate Houston, United...",3300 N Sam Houston Pkwy E Houston TX 77032,Houston,connelly@nzhonoraryconsul.org,+1 713 501 5418,,
9,"New Zealand Honorary Consulate Boston, United ...",57 N Main St Concord NH 03302,Boston,nzconsulate@preti.com,+603 226 263 7,+603 225 822 8,


In [9]:
#exports the above dataframe as a CSV file in the local directory.
df.to_csv('EmbassyData.csv')