# Extracting Wikipedia Data
We'll extract some information about border lengths and neighbours, wars, basic infrastructure, and general info such as top religions, area in km², type of government, and country position.

### Preparations
Let's write some functions and lists to make our jobs easier for the next tasks

In [1]:
import pandas as pd

import requests
import json
from bs4 import BeautifulSoup

from fuzzywuzzy import process
import re

In [13]:
# I'll use a processed Data Set from an earlier notebook as the standard for Country Names.
# The path is a raw string: it uses Windows-style backslashes, and "\D", "\p" and "\e"
# are invalid escape sequences in a normal string literal (a SyntaxWarning on recent Pythons).
df = pd.read_csv(r"..\Data_Sets\processed\economicData_1960-2022_noNaN-drops.csv")

# Both columns start out identical: 'Standard Names' is what fuzzy matching runs against,
# 'targetNames' is the name a match gets translated to.
countryNames = df['Country Name'].unique()
refNames = pd.DataFrame({
    'Standard Names': countryNames,
    'targetNames': countryNames
})

In [18]:
def getSoupFromWiki(
        link,
        BASE_URL = "https://en.wikipedia.org/w/api.php",
        action = "parse",
        format = "json"
    ):

    '''
        Fetch a page through the MediaWiki API and return its HTML as a soup.

        Parameters:
            link:     page title, sent as the `page` request parameter.
            BASE_URL: API endpoint the request is sent to.
            action:   value of the `action` request parameter.
            format:   value of the `format` request parameter.

        Returns:
            A BeautifulSoup object built from data['parse']['text']['*'].
            If the page text says it is a redirect, the text of its first link
            is requested instead and that page's soup is returned.
            An empty list is returned when a request does not answer with a
            2xx status code or the payload has no 'parse' entry.
    '''

    params = {
        "action": action,
        "page": link,
        "format": format
    }

    def fetchSoup(requestParams):
        '''Run one request; return the parsed soup, or None when it is unusable.'''
        response = requests.get(BASE_URL, params=requestParams)

        if response.status_code//100 != 2: # if status code isn't in the 200s
            display(f'Something unexpected happened. Status Code returned: {response.status_code}')
            return None

        display(f'Success! | {response.status_code}')
        display(f'Processing current link: {requestParams["page"]}')

        data = response.json()

        # A 2xx answer can still carry no page (no 'parse' entry); report it
        # instead of failing with a bare KeyError.
        if 'parse' not in data:
            display(f'No parsed content returned for: {requestParams["page"]} | {data.get("error")}')
            return None

        # The main content of the page is in ['parse']['text']['*']
        return BeautifulSoup(data['parse']['text']['*'], 'html.parser')

    soup = fetchSoup(params)
    if soup is None:
        return []

    # Check for redirect indication, and open the first valid link
    if "This is a redirect from a title" in soup.text:
        firstAnchor = soup.find("a")
        newTitle = firstAnchor.text if firstAnchor is not None else ''
        if newTitle:
            params['page'] = newTitle

            display(f'params: {params}')
            redirectedSoup = fetchSoup(params)
            display('----------------------------------------')
            return redirectedSoup if redirectedSoup is not None else []

    display('----------------------------------------')
    return soup
    

In [4]:
def fuzzySearchName(name, refNames=refNames, score_threshold = 45):
    '''
        Map a country name onto the closest entry of refNames['Standard Names'].

        Returns:
            "UNKNOWN" when the name is null or blank,
            the best fuzzy match when its score is above the threshold,
            "PLEASE FILL MANUALLY" otherwise.
    '''

    # Null or whitespace-only names cannot be matched at all
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    bestMatch, similarity, _ = process.extractOne(name, refNames['Standard Names'])

    # Names sharing common words (South, North, Congo...) need a stricter
    # threshold, never lower than 89.
    ambiguousNames = (
        'North Korea', 'South Korea', 'South Africa', 'North Macedonia',
        'Democratic Republic of Congo', 'Republic of Congo', 'British Empire'
    )
    requiredScore = max(89, score_threshold) if bestMatch in ambiguousNames else score_threshold

    # Only scores strictly above the threshold count as a match
    return bestMatch if similarity > requiredScore else "PLEASE FILL MANUALLY"


In [16]:
# Maps each country of the table to its border length and its recognised neighbours
countriesBordersList = {}
knownCountries = refNames['Standard Names'].values

for row in table.findAll('tr')[2:]: # the first 2 rows are the table header
    cells = row.findAll('td')

    countryName = cells[0].find('b').find('a').text
    borderLength = cells[1].text.strip()

    # Fuzzy-correct every link text of the neighbours cell, skipping reference links ("[1]")
    correctedNames = (
        fuzzySearchName(anchor.text.strip())
        for anchor in cells[5].findAll('a')
        if '[' not in anchor.text
    )

    # Keep only names that are one of the standard countries, without duplicates
    countriesBordersList[countryName] = {
        'Borders Length (in KM)': borderLength,
        'Neighbouring Countries': list({name for name in correctedNames if name in knownCountries})
    }

In [17]:
# Flatten the dictionary into one record per country: the key becomes the
# 'Country Name' field, followed by the fields of its value.
data_list = []
for country, values in countriesBordersList.items():
    record = {'Country Name': country}
    record.update(values)
    data_list.append(record)

# One row per record
df_CountryNeigh = pd.DataFrame(data_list)

In [18]:
# Align Country Name with the standard used in my other Data Sets
df_CountryNeigh['New Country Name'] = df_CountryNeigh['Country Name'].apply(fuzzySearchName)

# Names the fuzzy search does not resolve correctly are set by hand
manualNames = {
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Slovakia': 'Slovak Republic',
}
for wikiName, standardName in manualNames.items():
    df_CountryNeigh.loc[df_CountryNeigh['Country Name'] == wikiName, 'New Country Name'] = standardName

In [19]:
for optionName, optionValue in (
    ('display.max_rows', 250),
    ('display.min_rows', 135),
    ('display.max_columns', None),
):
    pd.set_option(optionName, optionValue)

# Cleaning Countries in the dataFrame:
# - A row is dropped when its original name isn't in refNames AND its new name
#   is shared with another row (i.e. it is a duplicate of a standard country).
# - Bahamas is the only exception and is always kept.
originalNames = df_CountryNeigh['Country Name']
mask_oldToActual = originalNames.isin(refNames['Standard Names'])
mask_duplicatedEntries = df_CountryNeigh['New Country Name'].duplicated(keep=False)
mask_bahamas = originalNames == 'Bahamas'
condition = ~(mask_oldToActual | ~mask_duplicatedEntries | mask_bahamas)

df_CountryNeigh = df_CountryNeigh.loc[~condition]


In [20]:
# Drop the rows flagged 'UNKNOWN', then replace the old name column with the
# standardised one.
df_CountryNeigh = (
    df_CountryNeigh.loc[df_CountryNeigh['New Country Name'] != 'UNKNOWN']
    .drop(columns='Country Name')
    .rename(columns={'New Country Name': 'Country Name'})
)

# Kosovo is missing from the wikipedia page, so its row is added by hand
kosovo = dict([
    ('Country Name', 'Kosovo'),
    ('Borders Length (in KM)', 743.556),
    ('Neighbouring Countries', ['Albania', 'Montenegro', 'North Macedonia', 'Serbia']),
])
kosovoRow = pd.DataFrame([kosovo])
df_CountryNeigh = pd.concat([df_CountryNeigh, kosovoRow], ignore_index=True)

### War information
We'll gather all wars between countries (and also significant Civil Wars/Rebellions) from 1900s onward. We'll need to clean them while collecting, as only Wars with recognizable states will be accepted. We'll discard all other instances.

This means this probably won't be as thorough/accurate as manual data insertion, but it'll take us 90% there with a fraction of the effort.
With such info, we'll estimate:
- Distinct Count of neighbours each country has warred with (This means we'll need to filter Civil Wars [Wars with self] for this one)
- Count of Wars each country has had with neighbours
- Total Count of Wars each country has had

In [None]:
# Fetch and parse the "Lists_of_wars" Wikipedia page through getSoupFromWiki
soup = getSoupFromWiki("Lists_of_wars")

In [None]:
# Every anchor whose text contains "List of wars:"
pastWars_links = soup.findAll('a', string=re.compile("List of wars:"))

# Keep the links from the 1900s onward (the first 4 anchors are skipped) and
# drop the first 6 characters of each href.
pastWars_linkList = [anchor.get('href')[6:] for anchor in pastWars_links[4:]]


We'll now iterate over all tables from the given links

In [None]:
# Debug aid: show which columns merged_refNames has.
# NOTE(review): merged_refNames is not defined in any cell visible here — presumably
# built in another cell; confirm it exists before running this one.
print(merged_refNames.columns)

In [None]:
# Fixing URLs format: the hrefs carry a percent-encoded en dash
pastWars_linkList = [link.replace("%E2%80%93", '–') for link in pastWars_linkList]

# Maps each side-A country to the list of side-B countries it warred against
# (one entry per war, so repeated opponents appear repeatedly).
# NOTE(review): only side-A countries become keys; side-B countries never get
# side A recorded as their opponents — confirm whether that is intended.
opponentsByCountry = {}


def _matchCountries(cell, score_threshold):
    '''
        Return the targetNames of every link in `cell` whose text fuzzy-matches
        one of merged_refNames['Standard Names'] above `score_threshold`.
    '''
    matched = []
    for anchor in cell.findAll('a'):
        standardName = fuzzySearchName(anchor.text, merged_refNames, score_threshold = score_threshold)

        # If name is valid, change its 'wikiName' to the targetName set before, then keep it
        if standardName not in ('PLEASE FILL MANUALLY', 'UNKNOWN'):
            isMatch = merged_refNames['Standard Names'] == standardName
            matched.append(merged_refNames.loc[isMatch, 'targetNames'].iloc[0])
    return matched


for link in pastWars_linkList:
    # For each link, find all tables
    soup = getSoupFromWiki(link)
    if not soup:
        # getSoupFromWiki signals a failed request with an empty list
        print(f'No content for {link}. Skipping it.')
        continue
    tables = soup.findAll('table')

    # ------------------------------------- Iterating over each table
    for table in tables:
        # For each table, fetch all rows for parties involved (A vs B)
        for tr in table.findAll('tr')[2:]: # Skip the 2 lines-header row by using slicing
            tds = tr.findAll('td')

            # Sides A and B are read from the 4th and 5th cells; shorter rows
            # hold no belligerents and are skipped.
            if len(tds) < 5:
                print('Current tr has failed. Skipping it this iteration.\nPS: Probably just the last row though')
                continue

            try:
                # 80 is a good balance, empirically tested on this dataSet
                countriesA = _matchCountries(tds[3], 80)
                # I'm more rigorous, since the losing side is more likely to have odd names
                countriesB = _matchCountries(tds[4], 85)
            except Exception as err:
                # Best effort: one bad row must not abort the whole scrape, but the
                # reason is reported and Ctrl+C (KeyboardInterrupt) still works.
                print(f'Current tr has failed ({err!r}). Skipping it this iteration.')
                continue

            # Append to the actual dictionary: a new key starts with an empty
            # list, a known key only gets its list extended.
            for country in countriesA:
                opponentsByCountry.setdefault(country, []).extend(countriesB)

### General Country Info
We'll extract:
- Top 3 religions
- Country Position in Globe
- Type of Government

In [31]:
# insert code

### Infrastructure data collection
Now let's collect some basic infrastructure stats. We'll use the latest available numbers from wiki.
- Railroads
    - RailLength/Country Area (already calculated by wiki)
    - % of the total electrified
- Road Network
    - Density (km/100 km²)
- Ports
    - Number of Ports
    - Container port traffic per country


In [29]:
#opponentsByCountry['Brazil']