In [37]:
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
import sys

sys.path.append('../src')
from functions import *

# Web scraping

In [44]:
main = 'https://en.wikipedia.org/wiki/List_of_countries_by_system_of_government'

# Fail early on a hung connection or an HTTP error page instead of parsing it.
response = req.get(main, timeout = 30)
response.raise_for_status()

soup = bs(response.content, 'html.parser')

# The sixth <tbody> on the page is used as the per-country table.
# NOTE(review): this is a positional index; confirm it if the page layout changes.
table = soup.find_all('tbody')[5]

# Rows must come from `table` defined just above (`table1` is never defined).
rows = table.find_all('tr')

# Each row's text is split on newlines once; the four fields are read from
# positions 1, 3, 5 and 7. rows[0] is skipped (presumably the header row).
# The last field is cut at the first '[' (presumably footnote markers).
records = []
for row in rows[1:]:
    parts = row.text.split('\n')
    records.append([parts[1].strip(),
                    parts[3],
                    parts[5],
                    parts[7].split('[')[0]])

df = pd.DataFrame(records,
                  columns = ['country_name','government_type','head_of_state_role','government_basis'])

df

Unnamed: 0,country_name,government_type,head_of_state_role,government_basis
0,Afghanistan,Provisional,,No constitutionally-defined basis to current r...
1,Albania,Republic,Ceremonial,Ministry is subject to parliamentary confidence
2,Algeria,Republic,Executive,Presidency independent of legislature; ministr...
3,Andorra,Constitutional monarchy,Ceremonial,Ministry is subject to parliamentary confidence
4,Angola,Republic,Executive,Presidency is independent of legislature
...,...,...,...,...
190,Venezuela,Republic,Executive,Presidency is independent of legislature
191,Vietnam,Republic,Executive,Power constitutionally linked to a single poli...
192,Yemen,Provisional,,No constitutionally-defined basis to current r...
193,Zambia,Republic,Executive,Presidency is independent of legislature


In [46]:
# Three most frequent normalized names; any count above 1 is a collision.
df['country_name'].apply(to_proper_country_name).value_counts().head(3)

Congo               2
Afghanistan         1
Papua New Guinea    1
Name: country_name, dtype: int64

In [24]:
# Rows whose normalized name is 'Congo'.
is_congo = df['country_name'].apply(to_proper_country_name) == 'Congo'
df[is_congo]

Unnamed: 0,country_name,government_type,head_of_state_role,government_basis
38,"Congo, Democratic Republic of the",Republic,Executive,Presidency independent of legislature; ministr...
39,"Congo, Republic of the",Republic,Executive,Presidency independent of legislature; ministr...


In [25]:
# Let's drop one of the Congos, as this is redundant information:
# NOTE(review): the two rows have different source names ("Congo, Democratic
# Republic of the" / "Congo, Republic of the") and only collide after
# to_proper_country_name. Confirm that discarding one of them is intended.

# Select the row through its normalized name instead of the hard-coded
# position 38, so re-running this cell (or a change in the scraped table)
# cannot drop an unrelated country. keep = 'last' removes the first of the
# colliding rows, i.e. the same row the positional drop removed.
proper_names = df.country_name.apply(to_proper_country_name)
df = df[~proper_names.duplicated(keep = 'last')].reset_index(drop = True)

df.country_name.apply(to_proper_country_name).value_counts()[:3]

Afghanistan    1
Saint Lucia    1
Nicaragua      1
Name: country_name, dtype: int64

In [26]:
# Let's correct the names now:

df['country_name'] = df.country_name.apply(to_proper_country_name)

# There are four new country names. Let's update our country.csv file and incorporate them:
# (new countries get region_id 0 as a placeholder until a region is assigned)

cc = pd.read_csv('../clean_data/countries.csv')

# Membership is checked against a set that is built once and kept up to date,
# instead of rebuilding a list of all names for every row of df.
known_names = set(cc.country_name)

# Continue from the highest existing id rather than from the row count, so a
# new id cannot repeat an existing one when the ids are not contiguous.
next_id = int(cc.country_id.max()) + 1 if len(cc) else 1

new_rows = []
for name in df.country_name:
    if name not in known_names:
        new_rows.append({'country_id': next_id, 'region_id': 0, 'country_name': name})
        known_names.add(name)
        next_id += 1

# Append all new countries in one step rather than growing cc row by row.
if new_rows:
    cc = pd.concat([cc, pd.DataFrame(new_rows)], ignore_index = True)

cc

Unnamed: 0,country_id,region_id,country_name
0,1,1,Iceland
1,29,1,Slovenia
2,32,1,Portugal
3,38,1,Serbia
4,40,1,France
...,...,...,...
191,192,8,Yemen
192,193,5,Brunei
193,194,5,Micronesia
194,195,8,Palestine


In [27]:
# We assign them their corresponding region:
# (the region table is loaded and displayed so each region's id can be read off)

rr = pd.read_csv('../clean_data/regions.csv')

rr

Unnamed: 0,region_id,region
0,1,Europe
1,2,North America
2,3,Latinamerica Caribbean
3,4,Eurasia Central Asia
4,5,East Asia Pacific
5,6,Sub Saharan Africa
6,7,Southern Asia
7,8,Middle East North Africa
8,9,Northern Asia


In [28]:
# newcountries = list(cc[cc.region_id == 0].country_name)

# newcountries

## (They don't appear because the countries csv file is already updated.)

[]

In [15]:
# Region id for each country that was added to countries.csv without a region.
# Written as an explicit name -> region_id mapping (5 = East Asia Pacific,
# 8 = Middle East North Africa, 1 = Europe in regions.csv) so that it does not
# depend on the order of a separately computed list and exists on a fresh kernel.
dictio = {'Brunei': 5, 'Micronesia': 5, 'Palestine': 8, 'Vatican City': 1}

dictio

{'Brunei': 5, 'Micronesia': 5, 'Palestine': 8, 'Vatican City': 1}

In [29]:
def ass_reg(c, dictio):
    """Return the value stored for ``c`` in ``dictio``, or None if it has no entry.

    Parameters
    ----------
    c : hashable
        Key to look up (a country name where this notebook calls it).
    dictio : mapping
        Lookup table (country name -> region id where this notebook calls it).

    Returns
    -------
    The mapped value, or None when ``c`` is not a key of ``dictio``.
    """
    try:
        return dictio[c]
    except KeyError:
        # Only a missing entry maps to None; any other error is a real problem
        # and is allowed to surface instead of being silently discarded.
        return None
    
# Keep every region id that is already set; only rows still holding the
# placeholder 0 are looked up by country name.
region_ids = []
for current_region, country in zip(cc.region_id, cc.country_name):
    if current_region != 0:
        region_ids.append(current_region)
    else:
        region_ids.append(ass_reg(country, dictio))
cc.region_id = region_ids

cc

Unnamed: 0,country_id,region_id,country_name
0,1,1,Iceland
1,29,1,Slovenia
2,32,1,Portugal
3,38,1,Serbia
4,40,1,France
...,...,...,...
191,192,8,Yemen
192,193,5,Brunei
193,194,5,Micronesia
194,195,8,Palestine


In [30]:
# Count rows that still lack a region: either the placeholder 0 or a missing
# value. ass_reg returns None for a country that is absent from the mapping,
# which a plain `== 0` comparison would not count.
int(((cc.region_id == 0) | cc.region_id.isna()).sum())

0

In [31]:
# Great! Let's save the countries file:
# (this overwrites the same countries.csv that was read into cc earlier;
# index = False leaves the DataFrame's row index out of the file)

cc.to_csv('../clean_data/countries.csv', index = False)

In [32]:
# Now we will replace the country_name column of df with the country_id:

# Build the name -> id lookup once instead of filtering cc for every row of df.
# drop_duplicates keeps the first match per name, like the per-row .iloc[0] did.
id_by_name = cc.drop_duplicates('country_name').set_index('country_name')['country_id']

# Fail with the offending names rather than an anonymous IndexError (or a
# silent NaN) when a country is not present in cc.
unmatched = df.country_name[~df.country_name.isin(id_by_name.index)].unique().tolist()
if unmatched:
    raise KeyError(f'Countries missing from countries.csv: {unmatched}')

df['country_name'] = df.country_name.map(id_by_name)

df.columns = ['country_id', 'government_type', 'head_of_state_role', 'government_basis']

df

Unnamed: 0,country_id,government_type,head_of_state_role,government_basis
0,146,Provisional,,No constitutionally-defined basis to current r...
1,17,Republic,Ceremonial,Ministry is subject to parliamentary confidence
2,144,Republic,Executive,Presidency independent of legislature; ministr...
3,147,Constitutional monarchy,Ceremonial,Ministry is subject to parliamentary confidence
4,118,Republic,Executive,Presidency is independent of legislature
...,...,...,...,...
189,191,Republic,Executive,Presidency is independent of legislature
190,72,Republic,Executive,Power constitutionally linked to a single poli...
191,192,Provisional,,No constitutionally-defined basis to current r...
192,85,Republic,Executive,Presidency is independent of legislature


In [35]:
# Turn the literal string 'n/a' into a real missing value.
# The dict form is used on purpose: with a scalar to_replace and value None,
# pandas versions before 1.4 ignore the None and forward-fill from the previous
# row instead, whereas {'n/a': None} replaces with None on every version.
df = df.replace({'n/a': None})

df

Unnamed: 0,country_id,government_type,head_of_state_role,government_basis
0,146,Provisional,,No constitutionally-defined basis to current r...
1,17,Republic,Ceremonial,Ministry is subject to parliamentary confidence
2,144,Republic,Executive,Presidency independent of legislature; ministr...
3,147,Constitutional monarchy,Ceremonial,Ministry is subject to parliamentary confidence
4,118,Republic,Executive,Presidency is independent of legislature
...,...,...,...,...
189,191,Republic,Executive,Presidency is independent of legislature
190,72,Republic,Executive,Power constitutionally linked to a single poli...
191,192,Provisional,,No constitutionally-defined basis to current r...
192,85,Republic,Executive,Presidency is independent of legislature


In [36]:
# Nice, all done. Let's save the dataframe to upload it to the SQL database:
# (index = False leaves the DataFrame's row index out of the file)

df.to_csv('../clean_data/government_type.csv', index = False)