# Extracting Wikipedia Data
We'll extract some information about border lengths and neighbours, wars, and basic infrastructure.
### Country Border information


In [12]:
import pandas as pd

import requests
import json
from bs4 import BeautifulSoup

from fuzzywuzzy import process

In [2]:
# Processed data set produced by an earlier notebook.
# Raw string: the Windows-style path contains backslash sequences (\D, \p, \e)
# that are invalid escapes in a normal string literal; the runtime value is
# unchanged.
df = pd.read_csv(r"..\Data_Sets\processed\economicData_1960-2022_noNaN-drops.csv")

# Reference list of the country names used as the standard spelling
# throughout the rest of the notebook.
refNames = pd.DataFrame({
    'Standard Names': df['Country Name'].unique()
})

In [13]:
# Fetch the rendered HTML of the land-borders list through the MediaWiki
# "parse" API and locate the first table on the page.
BASE_URL = "https://en.wikipedia.org/w/api.php"
PARAMS = {
    "action": "parse",
    "page": "List_of_countries_and_territories_by_number_of_land_borders",
    "format": "json"
}

# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get(BASE_URL, params=PARAMS, timeout=30)
# Fail here, with the HTTP status in the message, rather than later with a
# confusing JSON/KeyError on an error page.
response.raise_for_status()
data = response.json()

# The main content of the page is in ['parse']['text']['*']
page_html = data['parse']['text']['*']

# Parse the returned HTML fragment
soup = BeautifulSoup(page_html, 'html.parser')

# Find the table with neighbouring countries info (first table on the page)
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the parsed Wikipedia page")

response.status_code

200

In [5]:
def fuzzySearchName(name, refNames=refNames, threshold=45):
    '''
        Map a country name onto the closest entry of refNames['Standard Names']
        using fuzzy string matching.

        Parameters:
            name      -- the country name to look up (string, or a missing value).
            refNames  -- object exposing the reference names under the
                         'Standard Names' key; defaults to the notebook-level
                         refNames DataFrame.
            threshold -- a candidate is accepted only when its score is
                         strictly greater than this value (default 45).

        Returns:
            "UNKNOWN"              if name is missing, empty or whitespace only;
            the best matching name if its score is above the threshold;
            "PLEASE FILL MANUALLY" if no candidate scores above the threshold.
    '''

    # Missing or blank names cannot be matched at all
    if pd.isna(name) or not name.strip():
        return "UNKNOWN"

    best = process.extractOne(name, refNames['Standard Names'])

    # extractOne may return None when there is nothing to compare against
    if best is None:
        return "PLEASE FILL MANUALLY"

    # The result tuple starts with (match, score); any trailing key/index
    # element is not needed here.
    match, score = best[0], best[1]

    if score > threshold:
        return match

    # No candidate was close enough: flag the entry for manual review
    return "PLEASE FILL MANUALLY"


In [6]:
# Scrape the borders table into a dict keyed by the country name shown on
# Wikipedia:
#   {name: {'Borders Length (in KM)': <cell text>,
#           'Neighbouring Countries': [<standardised names>]}}
countriesBordersList = {}

# Build the lookup set once instead of scanning the names array for every link
standardNames = set(refNames['Standard Names'])

for tr in table.find_all('tr')[2:]: # Skip the 2 lines-header row by using slicing
    tds = tr.find_all('td')

    # Rows without the expected cells (e.g. extra header/footer rows) would
    # otherwise abort the whole scrape with an IndexError.
    if len(tds) < 6:
        continue

    # The country name is the bold link in the first cell; skip rows that
    # do not follow that layout instead of failing on a None lookup.
    boldTag = tds[0].find('b')
    nameLink = boldTag.find('a') if boldTag is not None else None
    if nameLink is None:
        continue
    countryName = nameLink.text

    # Kept as the stripped cell text, exactly as it appears in the table
    neighboursBorders_inKM = tds[1].text.strip()
    neighbouring_countries = []

    # Extracting neighbouring countries from the links of the sixth cell
    for a in tds[5].find_all('a'):
        if '[' in a.text: # Excluding reference links
            continue

        # Correcting names with fuzzySearch
        currentName = fuzzySearchName(a.text.strip())

        # Discarding mismatches: only names present in refNames are kept
        if currentName in standardNames:
            neighbouring_countries.append(currentName)

    # Adding countryName as Key and neighbouring_countries as Value in countriesBordersList dict
    countriesBordersList[countryName] = {
        'Borders Length (in KM)': neighboursBorders_inKM,
        'Neighbouring Countries': neighbouring_countries
    }

In [7]:
# Flatten the nested dict into one record per country: the dict key becomes
# the 'Country Name' field and the inner values become the other columns.
data_list = []
for country, values in countriesBordersList.items():
    record = {'Country Name': country}
    record.update(values)
    data_list.append(record)

# Build the DataFrame from the list of records
df_CountryNeigh = pd.DataFrame(data_list)

Unnamed: 0,Country Name,Borders Length (in KM),Neighbouring Countries
0,Afghanistan,5529,"[China, Iran, Pakistan, Tajikistan, Turkmenist..."
1,Albania,720,"[Greece, Serbia, North Macedonia, Montenegro]"
2,Algeria,6470,"[Libya, Mali, Mauritania, Morocco, Niger, Tuni..."
3,Andorra,120,"[France, Spain]"
4,Angola,5198,"[Democratic Republic of Congo, Republic of Con..."
...,...,...,...
196,Vietnam,4639,"[Cambodia, China, Laos]"
197,Western Sahara,2046,"[Algeria, Mauritania, Morocco]"
198,Yemen,1746,"[Oman, Saudi Arabia]"
199,Zambia,5667,"[Angola, Botswana, Democratic Republic of Cong..."


In [8]:
# Align the Wikipedia country names with the standard spelling used in the
# other data sets: fuzzy-match every name first, then apply the manual overrides.
df_CountryNeigh['New Country Name'] = df_CountryNeigh['Country Name'].apply(fuzzySearchName)

manual_overrides = {
    'Kyrgyzstan': 'Kyrgyz Republic',
    'Slovakia': 'Slovak Republic',
}
for wiki_name, standard_name in manual_overrides.items():
    rows_to_fix = df_CountryNeigh['Country Name'] == wiki_name
    df_CountryNeigh.loc[rows_to_fix, 'New Country Name'] = standard_name

In [9]:
# Widen the pandas display limits so the full frame can be inspected
for option_name, option_value in (
    ('display.max_rows', 250),
    ('display.min_rows', 135),
    ('display.max_columns', None),
):
    pd.set_option(option_name, option_value)

# Cleaning Countries in the dataFrame:
# a row is dropped when its original name is not a standard name AND its
# fuzzy-matched name is shared with another row. The 'Bahamas' row is
# always kept.
mask_oldToActual = df_CountryNeigh['Country Name'].isin(refNames['Standard Names'])
mask_duplicatedEntries = df_CountryNeigh['New Country Name'].duplicated(keep=False)
is_bahamas = df_CountryNeigh['Country Name'] == 'Bahamas'

# Same selection written through De Morgan: a row survives if it is a
# standard name, or not duplicated, or Bahamas.
condition = ~(mask_oldToActual | ~mask_duplicatedEntries | is_bahamas)

df_CountryNeigh = df_CountryNeigh[~condition]


In [10]:
# Drop the rows whose name could not be resolved (flagged 'UNKNOWN')
is_unknown = df_CountryNeigh['New Country Name'] == 'UNKNOWN'
df_CountryNeigh = df_CountryNeigh[~is_unknown]

# Replace the original names with the standardised ones
df_CountryNeigh = (
    df_CountryNeigh
    .drop(columns='Country Name')
    .rename(columns={'New Country Name': 'Country Name'})
)

# Kosovo is missing from the wikipedia page, so it is appended by hand.
# NOTE(review): the length here is a float literal -- confirm it is
# consistent with the type of the scraped values in the same column.
kosovo = {
    'Country Name': 'Kosovo',
    'Borders Length (in KM)': 743.556,
    'Neighbouring Countries': ['Albania', 'Montenegro', 'North Macedonia', 'Serbia']
}
kosovo_row = pd.DataFrame([kosovo])
df_CountryNeigh = pd.concat([df_CountryNeigh, kosovo_row], ignore_index=True)

In [11]:
# Display the final (rows, columns) dimensions of the cleaned frame
df_CountryNeigh.shape

(185, 3)