In [1]:
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from collections import Counter

In [2]:
# Root of the wiki; relative hrefs scraped from the list page are appended to it.
base_url = 'https://en.wikipedia.org'
url = base_url + '/wiki/List_of_international_airports_by_country'

# Fetch and parse the list page once. The context manager closes the HTTP
# response as soon as BeautifulSoup has consumed it.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, 'lxml')

# Only rows whose size cell holds one of these labels are counted.
airport_size_criteria = {'Small', 'Medium', 'Large'}

# Per-region airport tallies, filled in by the scraping cells.
canada_data = Counter()
us_data = Counter()

## Pulling Canadian Data

In [3]:
# Rows of the table that follows the element with id "Canada" on the list page.
canadian_rows = soup.find(id='Canada').parent.find_next_sibling('table').contents[1].find_all('tr')

# Start from an empty tally so re-running this cell does not double-count.
canada_data.clear()

# Row 0 is skipped (presumably the table header).
for row in canadian_rows[1:]:
    # Child index 7 of the row is read as the airport-size cell.
    airport_size = row.contents[7].getText().strip()
    if airport_size not in airport_size_criteria:
        continue

    # Follow the first link in the row and parse the linked article.
    follow_page = row.find_all('a')[0]['href']
    url_follow = base_url + follow_page
    # Name the parser explicitly (the same one used for the list page) and
    # close the response once it has been parsed.
    with urllib.request.urlopen(url_follow) as response:
        soup_follow = BeautifulSoup(response, 'lxml')

    # The "Province" label is either a link (take its parent) or the text of
    # a <th> cell itself.
    province_link = soup_follow.find('a', string='Province')
    if province_link:
        province_exists = province_link.parent
    else:
        province_exists = soup_follow.find('th', string='Province')

    row_data_contents = province_exists.next_sibling.contents
    # A single child is the province element itself; otherwise child 0 is a
    # flag image and the province element follows it.
    if len(row_data_contents) == 1:
        prov_element = row_data_contents[0]
    else:
        prov_element = row_data_contents[1]
    canada_data[prov_element.getText()] += 1

In [4]:
# Show the per-province tally built by the scraping loop.
canada_data

Counter({'Alberta': 2,
         'British Columbia': 3,
         'Manitoba': 1,
         'New Brunswick': 1,
         'Newfoundland and Labrador': 1,
         'Nova Scotia': 1,
         'Ontario': 2,
         'Quebec': 2,
         'Saskatchewan': 2})

## United States

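In each row of the Wikipedia table, the first column (index 1) is the location served; its first link is followed to look up the state from that page's infobox. The airport-size cell is at index 7. Kona and Washington, D.C. are special-cased and counted directly as Hawaii and the District of Columbia.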

In [5]:
# Rows of the table that follows the element with id "United_States" on the list page.
us_rows = soup.find(id='United_States').parent.find_next_sibling('table').contents[1].find_all('tr')

# Start from an empty tally so re-running this cell does not double-count.
us_data.clear()

# Row 0 is skipped (presumably the table header).
for row in us_rows[1:]:
    # Child index 7 of the row is read as the airport-size cell.
    airport_size = row.contents[7].getText().strip()
    if airport_size not in airport_size_criteria:
        continue

    # Look the first link up once; it supplies both the special-case text
    # and the href to follow.
    first_link = row.find_all('a')[0]
    link_text = first_link.get_text()

    # These two rows are tallied directly instead of following their link.
    if link_text == 'Kona':
        us_data['Hawaii'] += 1
        continue
    if link_text == 'Washington, D.C.':
        us_data['District of Columbia'] += 1
        continue

    follow_page = first_link['href']
    url_follow = base_url + follow_page
    # Name the parser explicitly (the same one used for the list page) and
    # close the response once it has been parsed.
    with urllib.request.urlopen(url_follow) as response:
        soup_follow = BeautifulSoup(response, 'lxml')

    # Find the "State" label inside the geography infobox and take its enclosing tag.
    province_exists = soup_follow.find('table', class_='infobox geography vcard').contents[0].find(string='State').parent
    # If the label sits directly in a <th>, the value is its next sibling;
    # otherwise the label is wrapped, so step up one level first.
    if province_exists.name == 'th':
        row_data_contents = province_exists.next_sibling
    else:
        row_data_contents = province_exists.parent.next_sibling

    if len(row_data_contents.contents) == 1:
        if row_data_contents.name == 'td':
            # if only a td tag
            state_name = row_data_contents.getText()
        else:
            state_name = row_data_contents.contents[0].getText()
    else:
        # has a flag image
        state_name = row_data_contents.contents[0].find_next_sibling('a').getText()
    us_data[state_name.strip()] += 1

In [6]:
# Discard any tally recorded for Hawaii; a missing key is not an error.
us_data.pop('Hawaii', None)

In [7]:
# Combine both tallies into one Counter; Counter addition sums the counts key by key.
comb_data = us_data + canada_data

In [8]:
# One row per region: its name and the number of airports tallied for it.
df_values = pd.DataFrame(list(comb_data.items()), columns=['location', 'numAirports'])

## Adding population data

In [9]:
# Canadian population table; later cells join it on 'Geography' and read its 'Q1 2020' column.
canada_pop_data = pd.read_csv('external_data/canada_pop.csv')

In [10]:
# US table; a later cell appends it to the Canadian frame and filters it on 'Geography'.
# NOTE(review): presumably it already carries a 'Density' column — confirm against the CSV.
us_pop_data = pd.read_csv('external_data/us_pop.csv')

In [11]:
# Canada Land Area
# Parse the Wikipedia area page with an explicit parser and close the HTTP
# response once it has been consumed.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_Canadian_provinces_and_territories_by_area') as response:
    soup_c_area = BeautifulSoup(response, 'lxml')

# Children of the second <tbody> on the page.
land_area_table = soup_c_area.find_all('tbody')[1].contents

province_name = []
canada_land_area = []

# Visit every second child, starting at index 2 and stopping before the last one.
for itr in range(2, len(land_area_table) - 1, 2):
    table_cols = land_area_table[itr].contents
    # Child 3 holds the name (its own child 1 carries the text).
    province_name.append(table_cols[3].contents[1].get_text().strip())
    # Child 7 holds the land area; strip thousands separators before converting.
    canada_land_area.append(int(table_cols[7].get_text().strip().replace(',', '')))
        

In [12]:
# Pair each scraped province name with its land area, using the same
# 'Geography' key as the population table so the two can be joined.
canada_land_df = pd.DataFrame(
    {
        'Geography': province_name,
        'Land': canada_land_area,
    }
)

In [13]:
# Left-join land area onto the population table, then derive density as the
# 'Q1 2020' population divided by land area.
merged_data = (
    canada_pop_data
    .merge(canada_land_df, how='left', on='Geography')
    .assign(Density=lambda frame: frame['Q1 2020'] / frame['Land'])
)

In [14]:
# Keep only the region name and its density. Dropping by label instead of by
# position keeps this cell safe to re-run and independent of column order.
# NOTE(review): assumes positions 1 and 2 were 'Q1 2020' and 'Land', as the
# displayed final table implies — confirm against canada_pop.csv.
merged_data = merged_data.drop(columns=['Q1 2020', 'Land'], errors='ignore')

In [17]:
# Stack the Canadian and US rows into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
dens_data = pd.concat([merged_data, us_pop_data], ignore_index=True)
# Leave Hawaii out of the combined table.
dens_data = dens_data[dens_data['Geography'] != 'Hawaii']

In [20]:
# Attach the airport counts to every region; regions with no tally keep NaN
# in 'numAirports' because this is a left join.
full_data = dens_data.merge(
    df_values,
    how='left',
    left_on='Geography',
    right_on='location',
)

In [26]:
# 'location' duplicates 'Geography' after the join. errors='ignore' lets the
# cell be re-run without raising once the column is already gone.
full_data = full_data.drop(columns='location', errors='ignore')

In [29]:
# Regions with no matching airport tally count as zero airports. Only
# 'numAirports' is filled, so a missing 'Density' stays visible as NaN
# instead of being silently turned into 0.
full_data = full_data.fillna({'numAirports': 0})

In [30]:
# Final table: one row per region with its density and airport count.
full_data

Unnamed: 0,Geography,Density,numAirports
0,Newfoundland and Labrador,3.611737,1.0
1,Prince Edward Island,72.383524,0.0
2,Nova Scotia,47.463193,1.0
3,New Brunswick,28.273933,1.0
4,Quebec,16.198092,2.0
5,Ontario,41.518722,2.0
6,Manitoba,6.445157,1.0
7,Saskatchewan,5.172650,2.0
8,Alberta,17.794944,2.0
9,British Columbia,14.307637,3.0


In [None]:
# to_csv is a DataFrame method; pandas has no module-level pd.to_csv.
# index=False keeps the row index out of the file.
# NOTE(review): the output path is a suggestion — adjust to the project's layout.
full_data.to_csv('external_data/airport_density.csv', index=False)