## HTML Country List Maker

- This Notebook serves to create 2 html documents that contain the same list of countries based on our original data, only one is ordered by continent and the other isn't.

- Importance: Have a standardized list of countries for the website

In [200]:
import pandas as pd
import numpy as np

In [201]:
# Load the raw origin data; later cells read its "Country Name" and "Country Code" columns.
df = pd.read_csv('input/origin_data.csv')

In [202]:
# For the purpose of DoMAD, we show data for Special Administrative Regions (like Hong Kong)
# and other territories (like Greenland) as their own countries.
# The names below are regional / income-group aggregates, not countries, so their rows are removed.
non_countries = ['Caucasian and Central Asia', 'Caucasus and Central Asia', 'Eastern Asia (including Japan)',
                 'Eastern Asia (not including Japan)', 'Eastern Europe', 'Europe', 'High income', 'High income: OECD',
                 'High income: nonOECD', 'Latin America and Caribbean', 'Low & middle income', 'Low income',
                 'Lower middle income', 'Middle income', 'Not classified', 'Nothern America', 'Northern America',
                 'Oceania', 'Oceania (not including Australia and New Zealand)', 'Southern Asia', 'South Eastern Asia',
                 'Sub-Saharan Africa', 'Upper middle income', 'Western Asia', 'World']

# Keep only the rows whose "Country Name" is not one of the aggregates.
# A single vectorized membership test replaces the per-name scan + in-place drop;
# the surviving rows keep their original index labels.
df = df.loc[~df["Country Name"].isin(non_countries)]

In [227]:
# Take names and codes from the SAME de-duplicated rows (first row seen per country name).
# Calling .unique() on each column separately only lines up position-by-position when
# names and codes map strictly one-to-one; pairing them row-wise keeps them aligned regardless.
name_code_pairs = df[["Country Name", "Country Code"]].drop_duplicates(subset="Country Name")
country_list = name_code_pairs["Country Name"].values
abbrv_list = name_code_pairs["Country Code"].values

# Abbreviation dictionary: given the country name, it gives the abbreviation for that country.
# This is used later in the notebook.
abbrv_dict = dict(zip(country_list, abbrv_list))

In [245]:
def write_country_list(abbrv_list, country_list):
    """Rewrite the template option list with our own country codes and names.

    Reads ``input/country_list.html`` line by line. For line number ``i`` the
    double-quoted attribute value is replaced with ``abbrv_list[i]`` (kept in
    double quotes) and the text between ``">`` and the last ``<`` is replaced
    with ``country_list[i]``. The result is written to
    ``output/country_list.html``.

    Lines whose index has no entry in the lists, or whose entry is not a
    string, are left out of the output.

    Parameters
    ----------
    abbrv_list : sequence of str
        Country codes, indexed by template line number.
    country_list : sequence of str
        Country names, indexed by template line number.
    """
    rewritten = []
    with open("input/country_list.html", 'r') as f:
        for i, line in enumerate(f):
            try:
                code, name = abbrv_list[i], country_list[i]
                # optval spans the attribute value INCLUDING its surrounding double
                # quotes, so the quotes have to be put back around the new code;
                # otherwise the output degrades to an unquoted `value=XYZ`.
                optval = line[line.find('="')+1:line.rfind('">')+1]
                temp_line = line.replace(optval, '"' + code + '"')
                option_text = line[line.find('">')+2:line.rfind('<')]
                rewritten.append(temp_line.replace(option_text, name))
            except (IndexError, KeyError, TypeError):
                # Only the expected failures are skipped: more template lines than
                # countries (lookup fails) or a non-string entry such as NaN.
                continue
    with open("output/country_list.html", "w") as f:
        f.write("".join(rewritten))

Now that we have the normal list of countries, we need to make the continentally ordered list.

In [206]:
# norm_set is the standardized set of country names taken from our own data.
# Holding it as a set lets us use set difference to see which names do not overlap
# between the countries we already use and the countries that are already
# grouped by continent.
norm_set = set(country_list)

Let's get sets of the countries, grouped by continent.

In [207]:
def read_adapt_existing_country_list(change_function = None):
    """Parse the continent-grouped option list into ``{continent: set of names}``.

    Reads ``input/countries_by_continents.html``, ignoring its first three and
    last two lines. A line containing ``optgroup label`` opens a new continent
    (named by the text between the first and last double quote). Every other
    line contributes the text between ``">`` and the last ``<`` to the
    continent opened most recently.

    Parameters
    ----------
    change_function : callable, optional
        When given, applied to each extracted name before it is stored.

    Returns
    -------
    dict
        Continent name mapped to a set of names. The final entry collected for
        each continent is dropped before the set is built.
    """
    with open('input/countries_by_continents.html', 'r') as f:
        body_lines = f.readlines()[3:-2]

    names_by_continent = {}
    for raw in body_lines:
        if "optgroup label" in raw:
            # Header line: the continent name sits between the outermost quotes.
            continent_name = raw[raw.find('"') + 1:raw.rfind('"')]
            names_by_continent[continent_name] = []
            continue

        entry = raw[raw.find('">') + 2:raw.rfind('<')]
        if change_function:
            # Rename on the way in so the stored names already follow the standard.
            entry = change_function(entry)
        names_by_continent[continent_name].append(entry)

    # The last collected entry of each group is discarded (it comes from the
    # group's closing line rather than from a country).
    return {continent: set(entries[:-1])
            for continent, entries in names_by_continent.items()}

# First pass: read the continent-grouped names unchanged (no change_function).
countries_by_continent = read_adapt_existing_country_list()

In [208]:
def diff_checker(continentally_sorted_countries, norm_set):
    """Print, per continent, the names that are missing from ``norm_set``.

    Parameters
    ----------
    continentally_sorted_countries : dict
        Continent name mapped to a set of country names.
    norm_set : set
        The standardized country names to compare against.
    """
    for continent, names in continentally_sorted_countries.items():
        print("================================")
        print(f"Differences for: {continent}")
        print("Pre-existing countries NOT in personal data list")
        print(names.difference(norm_set))
    
# Show which names in the unadapted continent-grouped list are absent from norm_set.
diff_checker(countries_by_continent, norm_set)

Differences for: North America
Pre-existing countries NOT in personal data list
{'Saint Lucia', 'Bahamas', 'US Virgin Islands', 'Saint Kitts and Nevis', 'Saint Vincent and the Grenadines', 'United States Minor Outlying Islands'}
Differences for: South America
Pre-existing countries NOT in personal data list
{'Falkland Islands (Malvinas)', 'French Guiana', 'Venezuela'}
Differences for: Europe
Pre-existing countries NOT in personal data list
{'Croatia (Hrvatska)', 'Svalbard and Jan Mayen Islands', 'Holy See (Vatican City State)', 'Macedonia', 'Slovakia'}
Differences for: Asia
Pre-existing countries NOT in personal data list
{'Cocos (Keeling) Islands', 'Hong Kong', 'Syria', 'Korea, Republic of', 'Myanmar (Burma)', 'Taiwan', 'Yemen', 'Lao', 'British Indian Ocean Territory', 'Christmas Island', 'Kyrgyzstan', 'East Timor', "Korea, Democratic People's Republic of", 'Iran'}
Differences for: Australia / Oceania
Pre-existing countries NOT in personal data list
{'Pitcairn', 'French Polynesia (Tah

Now I have to go through by hand and make the pre-existing names adhere to the naming used in the personal data list. Looking at the original data (i.e. `norm_set`), we see that names containing 'Saint' generally have it abbreviated to 'St.'. So a good first step is to replace every 'Saint' with 'St.' in the pre-existing names. The rest of the mismatches I'm just going to hard-code.

In [209]:
# Hard-coded renames: name as spelled in the pre-existing list -> name used in our data.
_COUNTRY_RENAMES = {
    'Bahamas': 'Bahamas, The',
    'US Virgin Islands': 'Virgin Islands (U.S.)',
    'Falkland Islands (Malvinas)': 'Falkland Islands',
    'French Guiana': 'French Guyana',
    'Venezuela': 'Venezuela, RB',
    'Croatia (Hrvatska)': 'Croatia',
    'Macedonia': 'Macedonia, FYR',
    'Slovakia': 'Slovak Republic',
    'Hong Kong': 'Hong Kong SAR, China',
    'Syria': 'Syrian Arab Republic',
    'Korea, Republic of': 'Korea, Rep.',
    'Myanmar (Burma)': 'Myanmar',
    'Taiwan': 'Taiwan, China',
    'Yemen': 'Yemen, Rep.',
    'Lao': 'Lao PDR',
    'Kyrgyzstan': 'Kyrgyz Republic',
    'East Timor': 'Timor-Leste',
    "Korea, Democratic People's Republic of": 'Korea, Dem. Rep.',
    'Iran': 'Iran, Islamic Rep.',
    'French Polynesia (Tahiti)': 'French Polynesia',
    'Micronesia, Federated States of': 'Micronesia, Fed. Sts.',
    'Wallis and Futuna Islands': 'Wallis and Futuna',
    'Congo, the Democratic Republic of the': 'Congo, Dem. Rep.',
    'Congo': 'Congo, Rep.',
    "Cote d'Ivoire (Ivory Coast)": "Cote d'Ivoire",
    'Dijibouti': 'Djibouti',
    'Egypt': 'Egypt, Arab Rep.',
    'Gambia': 'Gambia, The',
}


def country_name_adapter(cname):
    """Convert a pre-existing country name ``cname`` to the standard spelling.

    Names containing 'Saint' get every 'Saint' replaced by 'St.' and are
    returned without consulting the rename table. Any other name is looked up
    in the hard-coded table and returned unchanged when it has no entry.
    """
    if 'Saint' in cname:
        return cname.replace('Saint', 'St.')
    return _COUNTRY_RENAMES.get(cname, cname)

# Re-read the continent-grouped list, this time renaming each entry through
# country_name_adapter, then re-run the comparison against norm_set.
cbc = read_adapt_existing_country_list(country_name_adapter)
diff_checker(cbc, norm_set)

Differences for: North America
Pre-existing countries NOT in personal data list
{'United States Minor Outlying Islands'}
Differences for: South America
Pre-existing countries NOT in personal data list
set()
Differences for: Europe
Pre-existing countries NOT in personal data list
{'Holy See (Vatican City State)', 'Svalbard and Jan Mayen Islands'}
Differences for: Asia
Pre-existing countries NOT in personal data list
{'British Indian Ocean Territory', 'Christmas Island', 'Cocos (Keeling) Islands'}
Differences for: Australia / Oceania
Pre-existing countries NOT in personal data list
{'Tokelau', 'Pitcairn'}
Differences for: Africa
Pre-existing countries NOT in personal data list
set()


So we've gotten the results we wanted: the ones that are left are countries/territories that are not in the original data given. Since these leftover entries do not appear in our own list of countries, let's remove them from the continent sets.

In [210]:
# Entries of the continent-grouped list with no counterpart in our data,
# keyed by the continent set they are taken out of.
remove_cbc = {
    'North America': {'United States Minor Outlying Islands'},
    'South America': set(),
    'Europe': {'Holy See (Vatican City State)', 'Svalbard and Jan Mayen Islands'},
    'Asia': {'British Indian Ocean Territory', 'Christmas Island', 'Cocos (Keeling) Islands'},
    'Australia / Oceania': {'Tokelau', 'Pitcairn'},
    'Africa': set(),
}

# Replace each continent's set with a copy that lacks the unwanted entries.
for continent, members in cbc.items():
    cbc[continent] = members.difference(remove_cbc[continent])

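Next we check the other direction: which countries in our own data are missing from the continental sets? For that we're going to need to make one big set from all of the continental sets.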

In [211]:
#cbc is a dictionary of sets containing the countries in the continent
def combine_cbc(cbc):
    """Return one new set holding every country from all continent sets in ``cbc``.

    Works for any set of keys (no particular continent has to be present) and
    returns an empty set for an empty dictionary. The sets stored in ``cbc``
    are not modified.
    """
    # Start from a fresh empty set so no specific key is needed as a seed.
    return set().union(*cbc.values())

# Flatten the per-continent sets into a single set.
combined_cbc = combine_cbc(cbc)

In [212]:
# Names present in our data (norm_set) that no continent set contains yet.
print(norm_set.difference(combined_cbc))

{'Kosovo', 'West Bank and Gaza', 'Sint Maarten (Dutch part)', 'St. Martin (French part)', 'Macao SAR, China', 'Channel Islands', 'Curacao', 'Saint Pierre et Miquelon', 'Isle of Man', 'BES Islands'}


Now I'm just going through manually and looking up the continent of each and then adding them to the appropriate continent set. 

In [213]:
# Countries from our data that were missing from the continent-grouped list,
# keyed by the continent set they are added to.
add_cbc = {
    'North America': {'Sint Maarten (Dutch part)', 'St. Martin (French part)', 'Saint Pierre et Miquelon'},
    'South America': {'Curacao', 'BES Islands'},
    'Europe': {'Kosovo', 'Channel Islands', 'Isle of Man'},
    'Asia': {'West Bank and Gaza', 'Macao SAR, China'},
    'Australia / Oceania': set(),
    'Africa': set(),
}

# Replace each continent's set with a copy that also holds the additions.
for continent, members in cbc.items():
    cbc[continent] = members.union(add_cbc[continent])

In [214]:
# Sanity check: both differences print set() when the continent sets and norm_set hold exactly the same names.
final_combined_cbc = combine_cbc(cbc)
print(norm_set.difference(final_combined_cbc))
print(final_combined_cbc.difference(norm_set))

set()
set()


As we can see there is no difference between the sets. This means that all of the countries are the same, but in the cbc dictionary we now have them ordered by continent. Now all that is left is to write it out in html.

In [247]:
def write_cbc_list(cbc):
    """Write the continent-grouped country dropdown to ``output/countries_by_continents.html``.

    Each key of ``cbc`` becomes an ``<optgroup>``; the countries in its set are
    emitted in sorted order as ``<option>`` elements whose ``value`` is looked
    up in the notebook-level ``abbrv_dict``.

    Parameters
    ----------
    cbc : dict
        Continent name mapped to a set of country names.
    """
    # Opening markup, including the empty placeholder option.
    pieces = ["<label for='addressCountry'>Country</label>\n<select name='addressCountry'>\n  <option></option>\n"]

    for continent, members in cbc.items():
        pieces.append("  <optgroup label='" + continent + "'>\n")
        pieces.extend(
            "    <option value='" + abbrv_dict[name] + "'>" + name + '</option>\n'
            for name in sorted(members)
        )
        pieces.append("  </optgroup>\n")
    pieces.append("</select>")

    # The whole document is assembled before the file is opened.
    with open("output/countries_by_continents.html", 'w') as out_file:
        out_file.write("".join(pieces))

In [248]:
# Write both HTML files: the flat list and the continent-grouped list.
write_country_list(abbrv_list, country_list)
write_cbc_list(cbc)

And we're done! We got it!