# Extracting Wikipedia Data
We'll extract information about border lengths and neighbours, wars, basic infrastructure, and general info such as top religions, area in km, type of government and country position.
### Country Border information


In [1]:
import json
import re
from urllib.parse import unquote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fuzzywuzzy import process


In [2]:
# I'll use a processed Data Set from an earlier notebook as the standard for Country Names.
# The path uses forward slashes: they work on Windows as well as POSIX systems, and they
# avoid backslash sequences such as "\D" or "\p", which are invalid escapes in a normal string.
df = pd.read_csv("../Data_Sets/processed/economicData_1960-2022_noNaN-drops.csv")

# One row per distinct country name; this frame is the reference for name matching
refNames = pd.DataFrame({
    'Standard Names': df['Country Name'].unique()
})

In [3]:
# Ask the Wikipedia API (action=parse) for the rendered HTML of the land-borders page
BASE_URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "parse",
    "page": "List_of_countries_and_territories_by_number_of_land_borders",
    "format": "json"
}

# The timeout keeps the cell from hanging forever if the server never answers;
# raise_for_status() reports an HTTP error before we try to decode the body.
response = requests.get(BASE_URL, params=PARAMS, timeout=30)
response.raise_for_status()
data = response.json()

# The main content of the page is in ['parse']['text']['*']
page_html = data['parse']['text']['*']

# We can now use BeautifulSoup to parse this HTML
soup = BeautifulSoup(page_html, 'html.parser')

# Find the table with neighbouring countries info (which is the first)
table = soup.find('table')

# Last expression of the cell, so the notebook displays the HTTP status
response.status_code

200

In [104]:
def fuzzySearchName(name, refNames=refNames, score_threshold = 45):
    '''
        Map a free-form country name onto the closest entry of refNames['Standard Names'].

        Parameters:
            name: the raw name to standardise; null values and blank strings are accepted.
            refNames: DataFrame whose 'Standard Names' column holds the accepted names.
            score_threshold: the fuzzy score must be strictly greater than this value
                for the best match to be accepted.

        Returns:
            - "UNKNOWN" when name is null or blank;
            - the best matching standard name when its score beats the threshold;
            - "PLEASE FILL MANUALLY" when no candidate scores high enough
              (or when there is no candidate at all).
    '''

    # If the name is empty or null, return "UNKNOWN"
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    best = process.extractOne(name, refNames['Standard Names'])

    # extractOne yields None when there is nothing to compare against;
    # unpacking it directly would raise a TypeError
    if best is None:
        return "PLEASE FILL MANUALLY"

    # Only the matched name and its score are needed; the third element is ignored
    match, score, _ = best

    # Hardcoding exceptions, due to sharing common words (South, North):
    # these names only count as a match with a much higher score
    if match in ['North Korea', 'South Korea', 'South Africa', 'North Macedonia']:
        score_threshold = max(89, score_threshold)

    # If a close match is found, return the match
    if score > score_threshold:
        return match

    # No close match: return a marker so the row can be fixed by hand later
    return "PLEASE FILL MANUALLY"


In [5]:
# Maps each scraped country name to its border length and its list of neighbours
countriesBordersList = {}

# Build the lookup once: membership tests against a set are O(1) per name
standardNames = set(refNames['Standard Names'])

for tr in table.find_all('tr')[2:]: # Skip the 2 lines-header row by using slicing
    tds = tr.find_all('td')

    # Columns 0 (name), 1 (border length) and 5 (neighbours) are read below;
    # rows without them are not data rows, so skip them instead of crashing
    if len(tds) < 6:
        continue

    boldTag = tds[0].find('b')
    nameLink = boldTag.find('a') if boldTag is not None else None
    if nameLink is None:
        continue

    countryName = nameLink.text

    neighboursBorders_inKM = tds[1].text.strip()
    neighbouring_countries = []

    # Extracting neighbouring countries
    for a in tds[5].find_all('a'):
        if '[' in a.text: # Excluding reference links
            continue

        # Correcting names with fuzzySearch
        currentName = fuzzySearchName(a.text.strip())

        # Discarding mismatchs, only interested in one of the 185 Countries
        if currentName in standardNames:
            neighbouring_countries.append(currentName)

    # De-duplicate; sorting keeps the result identical from one run to the next
    neighbouring_countries = sorted(set(neighbouring_countries))

    # Adding countryName as Key and neighbouring_countries as Value in countriesBordersList dict
    countriesBordersList[countryName] = {
        'Borders Length (in KM)': neighboursBorders_inKM,
        'Neighbouring Countries': neighbouring_countries
    }

In [6]:
# Flatten the nested dictionary into one record per country:
# the dictionary key becomes the 'Country Name' field, next to the stored values
data_list = []
for country, values in countriesBordersList.items():
    record = {'Country Name': country}
    record.update(values)
    data_list.append(record)

# Each record becomes one row of the DataFrame
df_CountryNeigh = pd.DataFrame(data_list)

In [7]:
# Align the scraped names with the standard used in the other Data Sets
scrapedNames = df_CountryNeigh['Country Name']
df_CountryNeigh['New Country Name'] = scrapedNames.apply(fuzzySearchName)

# Names that are set by hand, overriding whatever the fuzzy search returned
manualOverrides = {
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Slovakia': 'Slovak Republic',
}
for scrapedName, standardName in manualOverrides.items():
    rowsToFix = df_CountryNeigh['Country Name'] == scrapedName
    df_CountryNeigh.loc[rowsToFix, 'New Country Name'] = standardName

In [8]:
# Raise the pandas display limits so the frame can be inspected in the notebook
displayOptions = {
    'display.max_rows': 250,
    'display.min_rows': 135,
    'display.max_columns': None,
}
for optionName, optionValue in displayOptions.items():
    pd.set_option(optionName, optionValue)

# Cleaning Countries in the dataFrame.
# A row is dropped when all of the following hold:
# - its standardised name appears more than once,
# - its original name is not one of the standard names,
# - it is not 'Bahamas' (the single exception that must be kept).
mask_duplicatedEntries = df_CountryNeigh['New Country Name'].duplicated(keep=False)
mask_oldToActual = df_CountryNeigh['Country Name'].isin(refNames['Standard Names'])
isBahamas = df_CountryNeigh['Country Name'] == 'Bahamas'
condition = mask_duplicatedEntries & ~mask_oldToActual & ~isBahamas

df_CountryNeigh = df_CountryNeigh[~condition]


In [9]:
# Removing empty row (fuzzySearchName maps blank names to 'UNKNOWN')
df_CountryNeigh = df_CountryNeigh[df_CountryNeigh['New Country Name'] != 'UNKNOWN']

# Removing old names, and renaming New Country Name column.
# This is done without inplace=True: the frame is a filtered selection at this point,
# and mutating such a selection in place can trigger pandas' SettingWithCopyWarning.
df_CountryNeigh = (
    df_CountryNeigh
    .drop(columns='Country Name')
    .rename(columns={'New Country Name': 'Country Name'})
)


# Manually adding Kosovo, which is missing from the wikipedia page
# NOTE(review): the scraped border lengths are stored as text, while this one is a float;
# confirm the column is converted to a single type before any numeric use.
kosovo = {
    'Country Name': 'Kosovo',
    'Borders Length (in KM)': 743.556,
    'Neighbouring Countries': ['Albania', 'Montenegro', 'North Macedonia', 'Serbia']
}
df_CountryNeigh = pd.concat([df_CountryNeigh, pd.DataFrame([kosovo])], ignore_index=True)

### War information
We'll gather all wars between countries (and also significant Civil Wars/Rebellions) from 1900s onward. We'll need to clean them while collecting, as only Wars with recognizable states will be accepted. We'll discard all other instances.

This probably won't be as thorough or accurate as manual data entry, but it will get us about 90% of the way there with a fraction of the effort.
With such info, we'll estimate:
- Distinct Count of neighbours each country has warred with (This means we'll need to filter Civil Wars [Wars with self] for this one)
- Count of Wars each country has had with neighbours
- Total Count of Wars each country has had

### NOTE TO SELF
I'll need to keep both sides (A and B) of each war, regardless of which side won or lost, so that I can calculate the metrics listed above.

In [10]:
# Extra reference names for the fuzzySearch: each row pairs a name under
# 'Standard Names' with the country it stands for under 'targetNames'
expRefNames = pd.DataFrame(
    [
        ('Ottoman', 'Turkey'),
        ('Weimar Republic', 'Germany'),
        ('Qing Dynasty', 'China'),
        ('Bitterenders', 'South Africa'),
        ("Ha'il", 'Saudi Arabia'),
        ('Ikhwan', 'Saudi Arabia'),
        ('Najran', 'Saudi Arabia'),
        ('British Empire', 'United Kingdom'),
        ('England', 'United Kingdom'),
        ('Kurdistan', 'Iraq'),
        ('French', 'France'),
        ('Soviet Union', 'Russia'),
        ('Kurdish', 'Turkey'),
        ('Ararat', 'Turkey'),
        ('Khan', 'Afghanistan'),
        ('Saqqawists', 'Afghanistan'),
        ('Khanty', 'Russia'),
        ('Muhammad Umar', 'Kazakhstan'),
        ('Spanish', 'Spain'),
        ('Polish', 'Poland'),
        ('Palestine', 'Israel'),
        ('Czechoslovakia', 'Czech Republic'),
        ('Rhodesia', 'Zimbabwe'),
        ('Zaire', 'Democratic Republic of Congo'),
        ('Turkistan', 'Pakistan'),
        ('hamas', 'Israel'),
    ],
    columns=['Standard Names', 'targetNames'],
)

In [11]:
# Ask the Wikipedia API (action=parse) for the rendered HTML of the "Lists of wars" page
PARAMS = {
    "action": "parse",
    "page": "Lists_of_wars",
    "format": "json"
}

# The timeout keeps the cell from hanging forever if the server never answers;
# raise_for_status() reports an HTTP error before we try to decode the body.
response = requests.get(BASE_URL, params=PARAMS, timeout=30)
response.raise_for_status()

data = response.json()
page_html = data['parse']['text']['*']
soup = BeautifulSoup(page_html, 'html.parser')

# Last expression of the cell, so the notebook displays the HTTP status
response.status_code

200

In [54]:
# Collect every anchor whose text contains "List of wars:"
pastWars_links = soup.findAll('a', string=re.compile("List of wars:"))

# Keep the links from the 1900s onward: the first four anchors are dropped.
# The first 6 characters of each href are cut off (presumably the "/wiki/" prefix).
pastWars_linkList = [anchor.get('href')[6:] for anchor in pastWars_links[4:]]


We'll now iterate over all tables from the given links

In [55]:
# Fixing URLs format: the hrefs are percent-encoded (e.g. %E2%80%93 for the en dash).
# requests encodes query parameters itself, so the titles are decoded first;
# unquote handles every escape sequence, not only the en dash.
pastWars_linkList = [unquote(link) for link in pastWars_linkList]

# TEMP: debugging limiter, only the first link is processed for now
TEMP_ITERATOR = 0
for link in pastWars_linkList:
    # For each link, find all tables

    if TEMP_ITERATOR > 0:
        break
    TEMP_ITERATOR += 1

    PARAMS = {
        "action": "parse",
        "page": link,
        "format": "json"
    }
    # The timeout keeps the loop from hanging forever on an unresponsive server
    response = requests.get(BASE_URL, params=PARAMS, timeout=30)

    if response.status_code//100 != 2: # if status code isn't in the 200s
        display(f'Something unexpected happened. Status Code returned: {response.status_code}')
        break

    display(f'Status_Code: {response.status_code}')

    data = response.json()

    # A response without a 'parse' key has no page content: show it and move on
    if 'parse' not in data:
        display('------------------------- ERROR ------------------------- ')
        display(data)
        continue

    page_html = data['parse']['text']['*']
    soup = BeautifulSoup(page_html, 'html.parser')
    tables = soup.find_all('table')
    

'Status_Code: 200'

In [58]:
#for table in tables:
#    print(table)