# Extracting Wikipedia Data
We'll extract some information about border lenghts and neighbours, wars, basic infrastructure, and general info like Top religions, area in KM, and country position.

### Preparations
Let's write some functions and lists to make our jobs easier for the next tasks

In [1]:
import pandas as pd

import requests
import json
from bs4 import BeautifulSoup

from fuzzywuzzy import process
import re

In [2]:
# I'll use a processed Data Set from an earlier notebook as the stardard for Country Names
df = pd.read_csv("..\Data_Sets\processed\economicData_1960-2022_noNaN-drops.csv") 
refNames = pd.DataFrame({    
    'Standard Names': df['Country Name'].unique(),
    'targetNames': df['Country Name'].unique()
})

In [3]:
def getSoupFromWiki(
        link,
        BASE_URL = "https://en.wikipedia.org/w/api.php",
        action = "parse",
        format = "json"
    ):

    '''
        Given a title, it searches a link and returns a soup.
    '''

    params = {
        "action": action,
        "page": link,
        "format": format
    }

    response = requests.get(BASE_URL, params=params)

    def isReponseOK(response=response):
        if response.status_code//100 != 2: # if status code isn't in the 200s
            display(f'Something unexpected happened. Status Code returned: {response.status_code}')
            return False
        else:
            # display(f'Successful Response: {response.status_code}')
            # display(f'Processing current link: {link}')
            return True

    if not isReponseOK():
        return []

    def htmlParser_getSoup(response):
        data = response.json()

        # The main content of the page is in ['parse']['text']['*']
        page_html = data['parse']['text']['*']

        # We can now use BeautifulSoup to parse this HTML
        soup = BeautifulSoup(page_html, 'html.parser')

        return soup

    soup = htmlParser_getSoup(response)

    # Check for redirect indication, and open the first valid link
    if "This is a redirect from a title" in soup.text or "Redirect to:" in soup.text:
        newTitle = soup.find("a").text
        if newTitle:
            params['page'] = newTitle

            response = requests.get(BASE_URL, params=params)
            #display(f'Redirecting from {link} to {newTitle}')
            #display('----------------------------------------')
            return htmlParser_getSoup(response)         
    
    #display('----------------------------------------')
    return soup
    

In [4]:
def fuzzySearchName(name, refNames=refNames, score_threshold = 45):
    '''
        Given a country name, returns the closest match from refNames using fuzzy search.
        If no close match is found, it returns "PLEASE FILL MANUALLY".
    '''
    
    # If the name is empty or null, return "UNKNOWN"
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    match, score, _ = process.extractOne(name, refNames['Standard Names'])
    
    # Hardcoding exceptions, due to sharing common words (South, North)
    dubiousFuzzyNames = [
        'North Korea', 'South Korea', 'South Africa', 'North Macedonia',
        'Democratic Republic of Congo', 'Republic of Congo', 'British Empire'
    ]

    if match in dubiousFuzzyNames:
        score_threshold = max(89, score_threshold)
        

    # If a close match is found, return the match
    if score > score_threshold:
        return match
    
    # If no close match is found, ask for manual input
    return "PLEASE FILL MANUALLY"


In [5]:
# Mapping relevant names to a dataFrame, so our fuzzySearch will work properly
expRefNames = pd.DataFrame({
    'Standard Names': ['Ottoman Empire', 'Weimar Republic', 'Qing Dynasty', 'Bitterenders', "Ha'il", "Ikhwan", "Najran", "British Empire",
                       "England", 'Kurdistan', "French", "Soviet Union", "Kurdish", "Ararat", "Khan", "Saqqawists", "Khanty", "Muhammad Umar",
                       "Spanish", "Polish", "Palestine", "Czechoslovakia", "Rhodesia", "Zaire", "Turkistan", "hamas", 'Dutch Empire', 'Portuguese',
                       "Czechia", "Kyrgyzstan", "Slovakia"],
    'targetNames': ['Turkey', 'Germany', 'China', 'South Africa', "Saudi Arabia", "Saudi Arabia", "Saudi Arabia", "United Kingdom",
                    "United Kingdom", 'Iraq', "France", "Russia", "Turkey", "Turkey", "Afghanistan", "Afghanistan", "Russia", "Kazakhstan",
                    "Spain", "Poland", "Israel", "Czech Republic", "Zimbabwe", "Democratic Republic of Congo", "Pakistan", "Israel", 'Netherlands', 'Portugal',
                    "Czech Republic", "Kyrgyz Republic", "Slovak Republic"]
})

# Concatenate the dataframes along columns
merged_refNames = pd.concat([refNames, expRefNames], axis=0).reset_index(drop=True)

### Country Border information
We're extracting:
- For every country, who their neighbouring countries are
- What is the length of the borders in KM (not meaning the circumference of the country, it counts just the parts that touches other countries by Land)
    - This means that, at least for our purposes, this data could be more complete. As what (could) actually matter for our model is if a country is close to another, not necessarily touching eachother.
    - That is why we'll later join this info with country position, so we'll add more countries to the list, with some sort of distance threshold.

In [6]:
soup = getSoupFromWiki("List_of_countries_and_territories_by_number_of_land_borders")

# Find the table with neighbouring countries info (which is the first)
table = soup.find('table')

In [7]:
countriesBordersList = {}
for tr in table.findAll('tr')[2:]: # Skip the 2 lines-header row by using slicing
    tds = tr.findAll('td')
    
    countryName = tds[0].find('b').find('a').text 
    
    neighboursBorders_inKM = tds[1].text.strip()    
    neighbouring_countries = []
    
    # Extracting neighbouring countries
    links_in_td = tds[5].findAll('a')
    if links_in_td:
        for a in links_in_td:
            if '[' not in a.text: # Excluding reference links
                currentName = a.text.strip()
                currentName = fuzzySearchName(currentName) # Correcting names with fuzzySearch
                
                if currentName in refNames['Standard Names'].values: # Discarding mismatchs, only interested in one of the 185 Countries
                    neighbouring_countries.append(currentName)

    neighbouring_countries = list(set(neighbouring_countries))
    # Adding countryName as Key and neighbouring_countries as Value in countriesBordersList dict
    countriesBordersList[countryName] = {
        'Borders Length (in KM)': neighboursBorders_inKM,
        'Neighbouring Countries': neighbouring_countries
    }

In [8]:
# Pivoting the dictionary, before creating the dataFrame
data_list = [{'Country Name': country, **values} for country, values in countriesBordersList.items()]

# Create a DataFrame from the list of dictionaries
df_CountryNeigh = pd.DataFrame(data_list)

In [9]:
# Fixing Country Name with the standard i'm using in other Data Sets
df_CountryNeigh['New Country Name'] = df_CountryNeigh['Country Name'].apply(fuzzySearchName)

df_CountryNeigh.loc[df_CountryNeigh['Country Name'] == 'Kyrgyzstan', 'New Country Name'] = 'Kyrgyz Republic'
df_CountryNeigh.loc[df_CountryNeigh['Country Name'] == 'Slovakia', 'New Country Name'] = 'Slovak Republic'

In [10]:
# Cleaning Countries in the dataFrame:
# - Removing duplicates (where the original Country Name isn't in refNames)
# - The only exception is Bahamas, otherwise all other countries work.
mask_oldToActual = df_CountryNeigh['Country Name'].isin(refNames['Standard Names'])
mask_duplicatedEntries = df_CountryNeigh['New Country Name'].duplicated(keep=False)
condition = (~mask_oldToActual) & (mask_duplicatedEntries) & (df_CountryNeigh['Country Name'] != 'Bahamas')

df_CountryNeigh = df_CountryNeigh[~condition]


In [11]:
# Removing empty row
df_CountryNeigh = df_CountryNeigh[~(df_CountryNeigh['New Country Name'] == 'UNKNOWN')]

# Removing old names, and renaming New Country Name column
df_CountryNeigh.drop('Country Name', axis = 1, inplace=True)
df_CountryNeigh.rename(columns = {'New Country Name': 'Country Name'}, inplace=True)


# Manually adding Kosovo, which is missing from the wikipedia page
kosovo = {
    'Country Name': 'Kosovo',
    'Borders Length (in KM)': 743.556,
    'Neighbouring Countries': ['Albania', 'Montenegro', 'North Macedonia', 'Serbia']
}
df_CountryNeigh = pd.concat([df_CountryNeigh, pd.DataFrame([kosovo])], ignore_index=True)

In [12]:
df_CountryNeigh.head(3)

Unnamed: 0,Borders Length (in KM),Neighbouring Countries,Country Name
0,5529,"[Iran, Turkmenistan, Tajikistan, Pakistan, Uzb...",Afghanistan
1,720,"[Montenegro, Serbia, Greece, North Macedonia]",Albania
2,6470,"[Tunisia, Niger, Mauritania, Mali, Libya, Moro...",Algeria


### War information
We'll gather all wars between countries (and also significant Civil Wars/Rebellions) from 1900s onward. We'll need to clean them while collecting, as only Wars with recognizable states will be accepted. We'll discard all other instances.

This means this probably won't be as thorough/accurate as manual data insertion, but it'll take us 90% there with a fraction of the effort.
With such info, we'll estimate:
- Distinct Count of neighbours each country has warred with (This means we'll need to filter Civil Wars [Wars with self] for this one)
- Count of Wars each country has had with neighbours
- Total Count of Wars each country has had

In [13]:
soup = getSoupFromWiki("Lists_of_wars")

In [14]:
# Getting all past wars links
pastWars_links = soup.findAll('a', string=re.compile("List of wars:"))
pastWars_linkList = []

# Inserting links from 1900s onward into list
for i, link in enumerate(pastWars_links):
    if i > 3:
        pastWars_linkList.append(link.get('href')[6:])


We'll now iterate over all tables from the given links

In [15]:
print(merged_refNames.columns)

Index(['Standard Names', 'targetNames'], dtype='object')


In [16]:
# Fixing URLs format
pastWars_linkList = [link.replace(f"%E2%80%93", '–') for link in pastWars_linkList]

# These dicts will contain Lists as value, storing every Country that warred against the 'key' Country.
opponentsByCountry = {}

for link in pastWars_linkList:
    # For each link, find all tables
    soup = getSoupFromWiki(link)
    tables = soup.findAll('table')    
    
    # ------------------------------------- Iterating over each 
    for table in tables:
        # For each table, fetch all rows for parties involved (A vs B)
        for tr in table.findAll('tr')[2:]: # Skip the 2 lines-header row by using slicing
            tds = tr.findAll('td')

            # local store, to permutate later
            countriesA = []
            countriesB = []
            
            try:
                sideACountries = tds[3].findAll('a')
                sideBCountries = tds[4].findAll('a')

                # ---------------- Side A
                for link in sideACountries:
                    countryA = fuzzySearchName(link.text, merged_refNames, score_threshold = 80) # 80 is a good balance, empirically tested on this dataSet

                    # If name is valid, change it's 'wikiName' to the targetName I've set before, then append it
                    if countryA not in ['PLEASE FILL MANUALLY', 'UNKNOWN']:
                        countryA = merged_refNames.loc[merged_refNames['Standard Names'] == countryA]['targetNames'].iloc[0]
                        countriesA.append(countryA)

                # ---------------- Side B
                for link in sideBCountries:
                    countryB = fuzzySearchName(link.text, merged_refNames, score_threshold = 85) # I'm more rigorous, since the losing side is more likely to have odd names

                    # If name is valid, change it's 'wikiName' to the targetName I've set before, then append it
                    if countryB not in ['PLEASE FILL MANUALLY', 'UNKNOWN']:
                        countryB = merged_refNames.loc[merged_refNames['Standard Names'] == countryB]['targetNames'].iloc[0]
                        countriesB.append(countryB)

                # Append to the actual dictionary
                for country in countriesA:
                    if country in opponentsByCountry:
                        # If we've seen the country before, only extend the list.
                        opponentsByCountry[country].extend(countriesB)
                    else:
                        # Else, create a new Key with copied Values
                        opponentsByCountry[country] = countriesB.copy()
                        

            except:
                print(f'Current tr has failed. Skipping it this iteration.\nPS: Probably just the last row though')      

Current tr has failed. Skipping it this iteration.
PS: Probably just the last row though
Current tr has failed. Skipping it this iteration.
PS: Probably just the last row though
Current tr has failed. Skipping it this iteration.
PS: Probably just the last row though
Current tr has failed. Skipping it this iteration.
PS: Probably just the last row though


### Landlocked Countries
We'll extract:
- Which countries are landlocked
- Neighbour count who has access to the sea (if landlocked) # I'll default 6, if the country isn't landlocked. (as this is the max value + 1, from landlocked countries)

In [17]:
soup = getSoupFromWiki('Landlocked_country')
rows = soup.find('tbody').findAll('tr')

df_landLockedCountries = pd.DataFrame(columns=['Country Name', 'isLandLocked', 'n_accessToSea'])
set_countries_landLocked = set()

for row in rows[2:]:
    tds = row.findAll('td')
    try:
        countryName = fuzzySearchName(tds[0].find('a').text, refNames=merged_refNames, score_threshold = 75)
        if countryName == "PLEASE FILL MANUALLY" or countryName == "UNKNOWN":
            continue

        # Reverting alternative names back to our naming convention (targetNames)
        countryName = merged_refNames.loc[merged_refNames['Standard Names'] == countryName]['targetNames'].iloc[0]

        # Neighbours with access to the ocean
        n_accessToSea = tds[7].text[0]

        # We'll populate a set, to get countries outside of this list
        set_countries_landLocked.add(countryName)

        # Populating dataFrame
        new_row = pd.DataFrame({
            'Country Name': [countryName],
            'isLandLocked': [True],
            'n_accessToSea': [n_accessToSea]
        })

        df_landLockedCountries = pd.concat([df_landLockedCountries, new_row], ignore_index=True)


        #print(f'Country: {countryName}  |  Neighbours with access to ocean: {n_accessToSea}')
    except Exception as e:
        print(f"Skipping invalid row. Error: {e}.")


Skipping invalid row. Error: list index out of range.
Skipping invalid row. Error: 'NoneType' object has no attribute 'text'.
Skipping invalid row. Error: 'NoneType' object has no attribute 'text'.


In [18]:
#Now let's populate df_landLockedCountries with countries that does have access to the ocean

set_countries_accessOcean = set(df['Country Name'].unique())

odd = set(df_landLockedCountries['Country Name'].unique())

diff = set_countries_accessOcean - set_countries_landLocked

for country in diff:
    new_row = pd.DataFrame({
            'Country Name': [country],
            'isLandLocked': [False],
            'n_accessToSea': [6]
        })

    df_landLockedCountries = pd.concat([df_landLockedCountries, new_row], ignore_index=True)

display(f'Shape of df_landLockedCountries: {df_landLockedCountries.shape}')
display(df_landLockedCountries.head(2))
display(df_landLockedCountries.tail(2))



'Shape of df_landLockedCountries: (185, 3)'

Unnamed: 0,Country Name,isLandLocked,n_accessToSea
0,Afghanistan,True,3
1,Armenia,True,3


Unnamed: 0,Country Name,isLandLocked,n_accessToSea
183,Panama,False,6
184,Equatorial Guinea,False,6


### General Country Info
We'll extract:
- Top 3 religions
    - Cleaning Strategy:
        - Select only the first 3 elements
        - If \n is found, get only the first name in that string [\n means wiki is subdividing religions into sub-groups]
            - To Achieve this, select whatever is between one \n from another \n
            - Then, we'll select only alphabetical letters from the resulting string
        - Standardize names to reduce categories (i.e. Islam = Sunni Islam)
- Country Position in Globe

In [20]:
# Compiled regex patterns
PERCENTAGE_PATTERN = re.compile(r'(\d+(\.\d+)?)%')
TEXT_CLEANUP_PATTERN = re.compile(r'[a-zA-Z]+')

def clean_extracted_religions(data_list):
    religions = []
    
    if isinstance(data_list, str):
        data_list = list([data_list]) # Forcing a list, for consistency

    for li in data_list[:3]: # Get only the top 3 items
        try:
            text = li.get_text()
        except:
            # If there's just one element, this is triggered
            # As single elements don't have .get_text() method
            text = li
        # Extract between first and second \n
        text = text.split('\n')[1] if '\n' in text else text
        
        # Check for excluded strings
        if any(dash in text.lower() for dash in ['—', '-', 'other', 'undeclared']):
            continue
        
        # Check size of Religion. If lower than 4%, skip it
        percentage_match = PERCENTAGE_PATTERN.search(text)
        
        if percentage_match:
            percentage = float(percentage_match.group(1))
            if percentage < 4:
                continue
        
        # Remove text after ( or [
        text = re.split(r'[\(\[]', text)[0].strip()
        
        # Extract only alphabetical characters
        cleaned_text = ' '.join(TEXT_CLEANUP_PATTERN.findall(text))        

        religions.append(cleaned_text.lower())
    
    return religions

def get_religion_from_table(table):
    religion_data = table.select_one('th.infobox-label:-soup-contains("Religion") + td.infobox-data')
    # display('++++++++++++++++++++++++ religion_data from get_religion_from_table function')
    # display(religion_data)

    # If there's no line element, return the first link
    if not religion_data.find('li'):
        allRows = religion_data.find('a').text
        # display('"No Lines" method used')
        # display(allRows)
        return clean_extracted_religions(allRows)
    
    if religion_data:
        try:
            # Get all lines within the same hierarchical level as the first li
            allRows = [religion_data.find('li')] + religion_data.find('li').find_next_siblings('li')
            # display('Line method used')
            # display(allRows)
        except:
            # If the method above fails, we use another approach
            allRows = religion_data.find_all('a')
            # display('"All Links" method used')
            # display(allRows)
        return clean_extracted_religions(allRows)
    return []

def get_coordinates_from_soup(soup):
    # Extract decimal coordinates from the span with class "geo-dec"
    decimal_coords = soup.find('span', class_='geo-dec')
    #display(decimal_coords)
    if decimal_coords:
        lat, lon = decimal_coords.text.split()
        return lat[:-2]  # Remove the last two characters '°N'
    else:
        return None

countriesList = df['Country Name'].copy()
countriesList = countriesList.replace("Georgia", "Georgia_(country)")
countriesList = countriesList.replace("Micronesia", "Federated_States_of_Micronesia")
countriesList = countriesList.unique()

religions_dict = {}
distanceFromEquator = {}

for country in countriesList:
    try:
        #display(f'Processing {country}')
        # get the first table
        soup = getSoupFromWiki(country)
        table = soup.find('table', class_='infobox ib-country vcard') or soup.find('table', class_='infobox ib-pol-div vcard')

        # get the top strip
        coords = get_coordinates_from_soup(soup)
        if coords:
            distanceFromEquator[country] = coords
        else:
            display(f"Coordinates for {country} not found!")
        # #display(soup)
        # display(table)

        if table:
        # ------------------------- RELIGION SCRAPING ---------------------------
            religions = get_religion_from_table(table)
            if religions:
                religions_dict[country] = religions
            else:
                pass
                #display(f'No religion found for {country}')
        else:
            pass
            #display(f'No infobox found for {country}')

    except Exception as e:
        pass
        #display(f'Some error occurred. Skipping {country}.<br>Error:{e}')


In [21]:
# Functions for debugging the dictionaries
def flatten(lst):
    '''
        Recursively flatten an iterable.
    '''
    for item in lst:
        if isinstance(item, (list, tuple, set, dict)):
            yield from flatten(item)
        else:
            yield item

def getUniqueValuesWithFrequency(d):
    '''
        Get unique values and their frequencies from a dictionary.
    '''
    flattened_values = list(flatten(d.values()))
    unique_values = set(flattened_values)
    frequency = {value: flattened_values.count(value) for value in unique_values}
    return frequency


def findMatchingKeys_fromDict(d, target_string):
    '''
        Return a list of Keys that contains the target string
    '''
    matching_keys = []  # List to store keys that contain the target string
    for key, value_list in d.items():
        if target_string in value_list:
            matching_keys.append(key)
    return matching_keys

def getEmptyKeys(d):
    '''
        Return a list of keys with no values or empty lists as values.
    '''
    return [key for key, value in d.items() if not value or value == []]

In [22]:
# ---------- Manual Cleaning of RELIGIONS
# Replacing religion on specific countries (manual wiki extraction)
religions_dict['Azerbaijan'] = ['islam', 'christianity']
religions_dict['Canada'] = ['christianity', 'no religion', 'islam']
religions_dict["Cote d'Ivoire"] = ['islam', 'christianity', 'no religion']
religions_dict['Egypt'] = ['islam', 'christianity']
religions_dict['Eritrea'] = ['christianity', 'islam']
religions_dict['Germany'] = ['christianity', 'no religion']
religions_dict['Hong Kong'] = ['no religion', 'buddhism', 'christianity']
religions_dict['Ireland'] = ['christianity', 'no religion']
religions_dict['Israel'] = ['judaism', 'islam']
religions_dict['Japan'] = ['no religion', 'buddhism']
religions_dict['Macau'] = ['folk', 'buddhism', 'no religion']
religions_dict['Nigeria'] = ['christianity', 'islam']
religions_dict['Turkey'] = ['islam', 'no religion']
religions_dict['United Kingdom'] = ['no religion', 'christianity', 'islam']

# Renaming and deleting some religions

# # I'll merge some quite different religions on purpose.
# # The goal is to have as few significant features as possible
# # Given that we'll do one-hot-encoding on them afterwards

wordsToReplace = {
'animism': 'folk', 'atheist': 'no religion', 'catholicism': 'christianity', 'chondoism': 'folk',
'folk religions': 'folk', 'hanafi sunni': 'islam', 'methodism': 'christianity', 'no religion folk': 'no religion',
'practicing catholic': 'christianity', 'protestant': 'christianity', 'protestantism': 'christianity',
'roman catholic': 'christianity', 'shi a': 'islam', 'shia': 'islam', 'sunni': 'islam', 'sunni islam': 'islam',
'tai folk religion': 'folk', 'traditional faiths': 'folk', 'unaffiliated': 'no religion', 'shamanism': 'folk', 'judaism': 'christianity'
}

wordsToRemove = ['no data', 'no response', 'official']

# Replacing religions and removing unwanted words
for key, value_list in religions_dict.items():
    new_list = []
    for word in value_list:
        if word not in wordsToRemove:
            new_list.append(wordsToReplace.get(word, word))
    religions_dict[key] = new_list

display(len(religions_dict))


185

In [23]:
len(distanceFromEquator)

185

### Country Area and Exclusive Economic Zone data collection

In [24]:
soup = getSoupFromWiki('List_of_countries_and_dependencies_by_area').find('table', class_ = 'wikitable sortable')

rows = soup.findAll('tr')

dict_countryAreaSize = dict()

for row in rows[1:]:
    tds = row.findAll('td')
    #display(f'processing {row}')

    # Storing value | tds[1] is country, tds[2] is total area
    if tds and len(tds) > 2:
        value = tds[2].text
        value_split = value.split('(')[0].strip().replace(',', '')
        dict_countryAreaSize[fuzzySearchName(tds[1].text, score_threshold=90)] = value_split

# Manually adding some countries
dict_countryAreaSize["Cote d'Ivoire"] = 322463
dict_countryAreaSize["Hong Kong"] = 2755
dict_countryAreaSize["Kyrgyz Republic"] = 199951
dict_countryAreaSize["Macau"] = 31
dict_countryAreaSize["Republic of Congo"] = 342000
dict_countryAreaSize["Democratic Republic of Congo"] = 342000
dict_countryAreaSize["Slovak Republic"] = 49037
dict_countryAreaSize["Brunei Darussalam"] = 5765
dict_countryAreaSize["The Bahamas"] = 13943
dict_countryAreaSize["The Gambia"] = 11295
dict_countryAreaSize["Timor-Leste"] = 14919
dict_countryAreaSize["United States"] = 9833517

dict_countryAreaSize.pop("PLEASE FILL MANUALLY", None)

display(len(dict_countryAreaSize))

185

In [25]:
tables = getSoupFromWiki('Exclusive_economic_zone').findAll('table')[11]
rows = tables.findAll('tr')

dict_countryEEZ_area = dict()

for row in rows[1:]:
    tds = row.findAll('td')

    value = tds[2].text.strip().split('[')[0].replace(',', '')
    if not value:
        value = 0

    # populating countries and values
    dict_countryEEZ_area[fuzzySearchName(tds[1].text.split('[')[0].strip(), score_threshold=90)] = value

# Manually adding some countries
dict_countryEEZ_area["Brunei Darussalam"] = 10090
dict_countryEEZ_area["Hong Kong"] = 20
dict_countryEEZ_area["Kyrgyz Republic"] = 0
dict_countryEEZ_area["Macau"] = 20
dict_countryEEZ_area["Micronesia"] = 2996419
dict_countryEEZ_area["Slovak Republic"] = 0
dict_countryEEZ_area["The Bahamas"] = 654715
dict_countryEEZ_area["The Gambia"] = 23112
dict_countryEEZ_area["Timor-Leste"] = 70326

dict_countryEEZ_area.pop("PLEASE FILL MANUALLY", None)

display(len(dict_countryEEZ_area))

185

### Infrastructure data collection
Now let's collect some basic infrastructure stats. We'll use the latest available numbers from wiki.
- Railroads
    - RailLength/Country Area (already calculated by wiki)
    - % of the total electrified
- Ports
    - Number of Ports
    - Container port traffic per country


In [26]:
# RAILS
table = getSoupFromWiki('List_of_countries_by_rail_transport_network_size').find('table')
rows = table.findAll('tr')

dict_countryRails = dict()

for row in rows[1:]:
    tds = row.findAll('td')    

    try:
        lengthPerArea = float(tds[4].text.strip().replace(',', ''))
    
        totalElectrified = float(tds[3].text.strip().split('%')[0].replace(',', '')) / 100

        #populating countries and values
        dict_countryRails[fuzzySearchName(tds[0].text.strip(), score_threshold=90)] = [lengthPerArea, round(totalElectrified, 2)]
    except Exception as e:
        display(e)
        continue

dict_countryRails.pop("PLEASE FILL MANUALLY", None)

set_A = set(dict_countryRails.keys())
set_B = set(refNames['Standard Names'])
missingCountries = set_B - set_A

for country in missingCountries:
    dict_countryRails[country] = [0, 0]

display(len(dict_countryRails))

IndexError('list index out of range')

185

In [27]:
# Ports

links = [
    'List_of_ports_and_harbours_of_the_Atlantic_Ocean',
    'List_of_ports_and_harbours_of_the_Indian_Ocean',
    'List_of_ports_and_harbors_of_the_Pacific_Ocean',
    'Ports_of_the_Baltic_Sea'
]

dict_countryPorts = dict()

for link in links:
    table = getSoupFromWiki(link).findAll('table', class_ = 'wikitable sortable')[0]
    rows = table.findAll('tr')

    colLocation = 1 if any(word in link for word in ['Indian', 'Baltic']) else 2
    #display(link, colLocation)
    for row in rows[1:]:
        tds = row.findAll('td')
        try:
            countryName = fuzzySearchName(tds[colLocation].text.split(',')[0].strip())

            dict_countryPorts[countryName] = dict_countryPorts.get(countryName, 0) + 1
        except Exception as e:
            display(e)
            continue


In [28]:
dict_countryPorts.pop("PLEASE FILL MANUALLY", None)

# Manually adding Arctic Ocean ports and Northern Sea Ports
dict_countryPorts['Canada'] = dict_countryPorts.get('Canada', 0) + 2
dict_countryPorts['United States'] = dict_countryPorts.get('United States', 0) + 1
dict_countryPorts['Iceland'] = dict_countryPorts.get('Iceland', 0) + 1
dict_countryPorts['Russia'] = dict_countryPorts.get('Russia', 0) + 13
dict_countryPorts['Norway'] = dict_countryPorts.get('Norway', 0) + 5
dict_countryPorts['Belgium'] = dict_countryPorts.get('Belgium', 0) + 6
dict_countryPorts['Denmark'] = dict_countryPorts.get('Denmark', 0) + 13
dict_countryPorts['France'] = dict_countryPorts.get('France', 0) + 1
dict_countryPorts['Germany'] = dict_countryPorts.get('Germany', 0) + 11
dict_countryPorts['Netherlands'] = dict_countryPorts.get('Netherlands', 0) + 14
dict_countryPorts['Norway'] = dict_countryPorts.get('Norway', 0) + 13
dict_countryPorts['United Kingdom'] = dict_countryPorts.get('United Kingdom', 0) + 22

# Manually adding port number (basic google research) // or from https://www.searates.com/maritime/
dict_countryPorts['Comoros'] = 3
dict_countryPorts['Dominica'] = 2
dict_countryPorts['Fiji'] = 5
dict_countryPorts['Hong Kong'] = 1
dict_countryPorts['Jordan'] = 1
dict_countryPorts['Kiribati'] = 5
dict_countryPorts['Macau'] = 1
dict_countryPorts['Micronesia'] = 1
dict_countryPorts['Republic of Congo'] = 4
dict_countryPorts['Samoa'] = 1
dict_countryPorts['Sao Tome and Principe'] = 2
dict_countryPorts['Timor-Leste'] = 11
dict_countryPorts['Tonga'] = 2
dict_countryPorts['Vanuatu'] = 2

# Country without a port, but has access to the sea
dict_countryPorts['Bosnia and Herzegovina'] = 0



landLocked = list(df_landLockedCountries[df_landLockedCountries['isLandLocked'] == True]['Country Name'])

# Filling landlocked countries with 
for country in landLocked:
    dict_countryPorts[country] = 0

display(len(dict_countryPorts))


185

### Joining all the Data

In [29]:
df_master = pd.merge(df_CountryNeigh, df_landLockedCountries, on='Country Name', how='outer')

df_rails = pd.DataFrame(list(dict_countryRails.items()), columns=['Country Name', 'Values'])
df_rails[['Rail Density', 'Pctg of Rail Electrified']] = pd.DataFrame(df_rails.Values.tolist(), index=df_rails.index)

df_master = pd.merge(df_master, df_rails, on='Country Name', how='left')

df_master['Warred Against'] = df_master['Country Name'].map(opponentsByCountry)
df_master['Area Size (km2)'] = df_master['Country Name'].map(dict_countryAreaSize)
df_master['Expanded EconZone Area'] = df_master['Country Name'].map(dict_countryEEZ_area)
df_master['Amount of Ports'] = df_master['Country Name'].map(dict_countryPorts)
df_master['Distance from Equator'] = df_master['Country Name'].map(distanceFromEquator)
df_master['Majoritary Religions'] = df_master['Country Name'].map(religions_dict)

In [30]:
df_master.sort_values(by="Country Name", inplace=True)

# Ensure 'Country Name' is the first column
cols = ['Country Name'] + [col for col in df_master.columns if col != 'Country Name']
df_master = df_master[cols]

df_master.to_csv(f'..\Data_Sets\processed\\addData_fromWiki.csv')
