In [1]:
import requests as req
from bs4 import BeautifulSoup as bs
import pandas as pd
import sys

sys.path.append('../src')
from functions import *

# Web scraping

In [2]:
# Load the cleaned world footprint table. The first row of its
# 'population_(millions)' column is read further down as the world population.
w = pd.read_csv('../clean_data/world_footprint.csv')

In [3]:
# Scrape the GDP-by-country table from Worldometers into df1.
main1 = 'https://www.worldometers.info/gdp/gdp-by-country/'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the data table.
response1 = req.get(main1, timeout = 30)
response1.raise_for_status()
soup1 = bs(response1.content, 'html.parser')

# Fail with a readable message if the page no longer contains a table body
# (otherwise the row extraction dies on a None object).
tbody1 = soup1.find('tbody')
if tbody1 is None:
    raise ValueError('No <tbody> found at ' + main1)

# One list of cell texts per table row.
rows1 = [[cell.text for cell in row.find_all('td')] for row in tbody1.find_all('tr')]

df1 = pd.DataFrame(rows1)
# Drop the positional columns that are not needed; the three remaining
# columns receive the names assigned just below, in order.
df1 = df1.drop(columns = [0,3,4,5,6])

df1.columns = ['country_name','total_GDP','world_share']

# World-level reference values used by later cells.
# NOTE(review): world_gdp and world_gdp_per_capita_ppp are hard-coded;
# presumably they were copied from the scraped pages - confirm and refresh
# them whenever the data is re-scraped.
world_gdp = 100562000000000
# Population is stored in millions in the footprint table; row 0 is used as
# the world figure.
world_population = w['population_(millions)'].iloc[0]*1e6
world_gdp_per_capita_ppp = 20645

df1

Unnamed: 0,country_name,total_GDP,world_share
0,United States,"$25,462,700,000,000",25.32%
1,China,"$17,963,200,000,000",17.86%
2,Japan,"$4,231,140,000,000",4.21%
3,Germany,"$4,072,190,000,000",4.05%
4,India,"$3,385,090,000,000",3.37%
...,...,...,...
172,Sao Tome & Principe,"$546,680,342",0.00%
173,Micronesia,"$427,094,119",0.00%
174,Marshall Islands,"$279,667,900",0.00%
175,Kiribati,"$223,352,943",0.00%


In [4]:
# Scrape the GDP-per-capita table from Worldometers into df2.
main2 = 'https://www.worldometers.info/gdp/gdp-per-capita/'

# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the data table.
response2 = req.get(main2, timeout = 30)
response2.raise_for_status()
soup2 = bs(response2.content, 'html.parser')

# Fail with a readable message if the page no longer contains a table body
# (otherwise the row extraction dies on a None object).
tbody2 = soup2.find('tbody')
if tbody2 is None:
    raise ValueError('No <tbody> found at ' + main2)

# One list of cell texts per table row.
rows2 = [[cell.text for cell in row.find_all('td')] for row in tbody2.find_all('tr')]

df2 = pd.DataFrame(rows2)
# NOTE(review): the row labels 176-189 are hard-coded for the page as it was
# when scraped; drop() raises KeyError if any of these labels is absent, so
# this line needs revisiting if the table length changes.
df2 = df2.drop(index = range(176,190))  # No data from these countries
df2 = df2.drop(columns = 0)
df2.columns = ['country_name','PPP_GDP_per_capita','absolute_GDP_per_capita','vsWorld_PPP_GDP_per_capita($20645)']

df2

Unnamed: 0,country_name,PPP_GDP_per_capita,absolute_GDP_per_capita,vsWorld_PPP_GDP_per_capita($20645)
0,Luxembourg,"$142,214","$127,046",689%
1,Singapore,"$127,565","$78,115",618%
2,Ireland,"$126,905","$105,362",615%
3,Norway,"$114,899","$106,594",557%
4,Qatar,"$114,648","$88,046",555%
...,...,...,...,...
171,Niger,"$1,505",$533,7%
172,Mozambique,"$1,468",$541,7%
173,DR Congo,"$1,337",$586,6%
174,Central African Republic,$967,$427,5%


In [5]:
# Data cleaning: turn the scraped strings of df1 into numbers.

# Strip the '$' sign and thousands separators. str() makes the parse tolerant
# of values that are already integers, so the cell can be re-run without
# failing on int.replace.
df1['total_GDP'] = [int(str(value).replace('$','').replace(',','')) for value in df1['total_GDP']]

# The scraped percentage string is not parsed: the share is recomputed from
# total_GDP and the world_gdp constant, as a fraction rounded to 4 decimals.
df1['world_share'] = (df1['total_GDP'] / world_gdp).round(4)
df1 = df1.reset_index(drop = True)

df1

Unnamed: 0,country_name,total_GDP,world_share
0,United States,25462700000000,0.2532
1,China,17963200000000,0.1786
2,Japan,4231140000000,0.0421
3,Germany,4072190000000,0.0405
4,India,3385090000000,0.0337
...,...,...,...
172,Sao Tome & Principe,546680342,0.0000
173,Micronesia,427094119,0.0000
174,Marshall Islands,279667900,0.0000
175,Kiribati,223352943,0.0000


In [6]:
# Turn the scraped dollar strings of df2 into integers. str() makes the parse
# tolerant of values that are already integers, so the cell can be re-run
# without failing on int.replace.
df2['PPP_GDP_per_capita'] = [int(str(value).replace('$','').replace(',','')) for value in df2['PPP_GDP_per_capita']]
df2['absolute_GDP_per_capita'] = [int(str(value).replace('$','').replace(',','')) for value in df2['absolute_GDP_per_capita']]

# Replace the scraped percentage with the ratio of each country's PPP GDP per
# capita to the world_gdp_per_capita_ppp constant (1.0 == world average).
df2['vsWorld_PPP_GDP_per_capita($20645)'] = df2['PPP_GDP_per_capita'] / world_gdp_per_capita_ppp
df2 = df2.reset_index(drop = True)

df2

Unnamed: 0,country_name,PPP_GDP_per_capita,absolute_GDP_per_capita,vsWorld_PPP_GDP_per_capita($20645)
0,Luxembourg,142214,127046,6.888544
1,Singapore,127565,78115,6.178978
2,Ireland,126905,105362,6.147009
3,Norway,114899,106594,5.565464
4,Qatar,114648,88046,5.553306
...,...,...,...,...
171,Niger,1505,533,0.072899
172,Mozambique,1468,541,0.071107
173,DR Congo,1337,586,0.064761
174,Central African Republic,967,427,0.046839


In [7]:
# Before replacing names with the country_id from countries.csv, normalise the
# scraped names with to_proper_country_name and count the results: a count
# above 1 means two scraped rows collapse onto the same country name.

df1['country_name'].apply(to_proper_country_name).value_counts()

143 Hong Kong Congo max sim:57


Congo                       2
United States of America    1
Benin                       1
Haiti                       1
Armenia                     1
                           ..
Slovakia                    1
Ecuador                     1
Oman                        1
Dominican Republic          1
Tuvalu                      1
Name: country_name, Length: 176, dtype: int64

In [8]:
# Show every row whose normalised name comes out as 'Congo'; the smaller of
# the two Congos is dropped afterwards.
is_congo = df1['country_name'].apply(to_proper_country_name).eq('Congo')
df1[is_congo]

143 Hong Kong Congo max sim:57


Unnamed: 0,country_name,total_GDP,world_share
84,DR Congo,58065953573,0.0006
129,Congo,14615532210,0.0001


In [9]:
# Hong Kong is removed because it is a city and not a country. The row named
# 'Congo' is removed as well, because it and 'DR Congo' both normalise to
# 'Congo'.
# Both rows are selected by name instead of by a hard-coded index label, so
# the cell keeps working if the scraped ranking changes and does not raise
# when it is re-run.

df1 = df1.drop(index = df1[df1.country_name == 'Hong Kong'].index)
df1 = df1.drop(index = df1[df1.country_name == 'Congo'].index)

In [10]:
# Re-run the collision check after the drops: every normalised name should
# now be counted exactly once.
df1['country_name'].apply(to_proper_country_name).value_counts()

United States of America    1
Benin                       1
Haiti                       1
Armenia                     1
Palestine                   1
                           ..
Ecuador                     1
Oman                        1
Dominican Republic          1
Kenya                       1
Tuvalu                      1
Name: country_name, Length: 175, dtype: int64

In [12]:
# Apply the change: normalise each name, then translate it to a country id
# and rename the column accordingly.

proper_names = df1['country_name'].apply(to_proper_country_name)

# Countries lookup table; also used later for the region lookup.
cc = pd.read_csv('../clean_data/countries.csv')

df1['country_name'] = proper_names.apply(change_to_country_id)
df1 = df1.rename(columns = {'country_name': 'country_id'})

df1

Unnamed: 0,country_id,total_GDP,world_share
0,43,25462700000000,0.2532
1,107,17963200000000,0.1786
2,125,4231140000000,0.0421
3,6,4072190000000,0.0405
4,127,3385090000000,0.0337
...,...,...,...
172,179,546680342,0.0000
173,194,427094119,0.0000
174,166,279667900,0.0000
175,163,223352943,0.0000


In [13]:
# Same collision check for df2: count the normalised names to spot scraped
# rows that collapse onto the same country name.

df2_proper_names = df2['country_name'].apply(to_proper_country_name)
df2_proper_names.value_counts()

143 Hong Kong Congo max sim:57


Congo                  2
Luxembourg             1
Samoa                  1
Eswatini               1
Philippines            1
                      ..
Montenegro             1
Mauritius              1
Argentina              1
Antigua and Barbuda    1
Burundi                1
Name: country_name, Length: 175, dtype: int64

In [14]:
# Hong Kong is removed because it is a city and not a country; the row named
# 'Congo' is removed together with it.

rows_to_remove = df2.index[df2['country_name'].isin(['Hong Kong', 'Congo'])]
df2 = df2.drop(index = rows_to_remove)

# Re-run the collision check on the remaining rows.
df2['country_name'].apply(to_proper_country_name).value_counts()

Luxembourg             1
Bangladesh             1
Eswatini               1
Philippines            1
Bolivia                1
                      ..
Montenegro             1
Mauritius              1
Argentina              1
Antigua and Barbuda    1
Burundi                1
Name: country_name, Length: 174, dtype: int64

In [15]:
# Apply the change: normalise each name, translate it to a country id, and
# rename the column accordingly.

df2_proper_names = df2['country_name'].apply(to_proper_country_name)
df2['country_name'] = df2_proper_names.apply(change_to_country_id)
df2 = df2.rename(columns = {'country_name': 'country_id'})

df2

Unnamed: 0,country_id,PPP_GDP_per_capita,absolute_GDP_per_capita,vsWorld_PPP_GDP_per_capita($20645)
0,44,142214,127046,6.888544
1,49,127565,78115,6.178978
2,11,126905,105362,6.147009
3,2,114899,106594,5.565464
4,133,114648,88046,5.553306
...,...,...,...,...
171,135,1505,533,0.072899
172,26,1468,541,0.071107
173,140,1337,586,0.064761
174,150,967,427,0.046839


In [19]:
# Combine both tables into a single frame, aligned on country_id.
# NOTE(review): pd.concat along the columns keeps every country_id present in
# either frame, so a country missing from one table gets NaN in that table's
# columns - confirm that this is the intended behaviour.

df1 = df1.set_index('country_id')
df2 = df2.set_index('country_id')
df = pd.concat([df1, df2], axis = 'columns').reset_index()

df

Unnamed: 0,country_id,total_GDP,world_share,PPP_GDP_per_capita,absolute_GDP_per_capita,vsWorld_PPP_GDP_per_capita($20645)
0,43,25462700000000,0.2532,76399.0,75269.0,3.700605
1,107,17963200000000,0.1786,21476.0,12598.0,1.040252
2,125,4231140000000,0.0421,45573.0,34135.0,2.207459
3,6,4072190000000,0.0405,63150.0,48845.0,3.058852
4,127,3385090000000,0.0337,8379.0,2389.0,0.405861
...,...,...,...,...,...,...
170,179,546680342,0.0000,4738.0,2404.0,0.229499
171,194,427094119,0.0000,3855.0,792.0,0.186728
172,166,279667900,0.0000,7228.0,6728.0,0.350109
173,163,223352943,0.0000,2365.0,1702.0,0.114556


In [23]:
# Save the merged per-country GDP frame as CSV (index column omitted) so it
# can be uploaded to the SQL database:

df.to_csv('../clean_data/gross_domestic_product_GDP.csv', index = False)

# Regional calculations

In [48]:
# Aggregate the per-country GDP figures by region.
reg = df.copy()

# Look up each country's region with a single vectorised map instead of
# filtering cc once per row. drop_duplicates keeps the first match per
# country_id, which is the row the per-row lookup used to pick.
region_by_country = cc.drop_duplicates('country_id').set_index('country_id')['region_id']
reg['region_id'] = reg['country_id'].map(region_by_country)

# Fail loudly when a country has no region, rather than letting groupby
# silently leave its rows out of the regional totals.
unmatched = reg.loc[reg['region_id'].isna(), 'country_id'].tolist()
if unmatched:
    raise KeyError('No region_id found in cc for country_id values: {}'.format(unmatched))

# Select the needed columns before summing so the other columns are not
# aggregated for nothing. The summed absolute_GDP_per_capita is only a
# placeholder: it is recomputed from regional population in a later cell.
reg = reg.groupby('region_id')[['total_GDP', 'world_share', 'absolute_GDP_per_capita']].sum()

reg = reg.reset_index()

reg

Unnamed: 0,region_id,total_GDP,world_share,absolute_GDP_per_capita
0,1,21278782551248,0.2116,1446951.0
1,2,27602540000000,0.2745,130915.0
2,3,5672289212560,0.0563,343579.0
3,4,1598972245892,0.0158,64282.0
4,5,29464546127553,0.293,381279.0
5,6,2006119005407,0.0195,108537.0
6,7,4731789691073,0.0471,27625.0
7,8,3955010966006,0.0392,367057.0
8,9,2240420000000,0.0223,15482.0


In [49]:
# Load the per-region footprint table; its 'population_(millions)' column is
# needed to recompute the regional GDP per capita.

regreg = pd.read_csv('../clean_data/regions_footprint.csv') # Other data from regions

regreg

Unnamed: 0,region,population_(millions),cropland_footprint,grazing_footprint,forest_product_footprint,carbon_footprint,fish_footprint,total_ecological_footprint_(consumption),cropland,grazing_land,forest_land,fishing_ground,built_up_land,total_biocapacity,ecological_(deficit)_or_reserve,number_of_earths_required
0,1,545.543,0.89381,0.214662,0.536844,2.179877,0.157055,4.307397,0.730784,0.125723,0.8414,0.237052,0.124689,2.096067,-2.21133,2.852174
1,2,373.193,1.326863,0.294763,0.861377,4.810147,0.113185,7.456518,1.781522,0.254341,2.111809,0.605125,0.050184,4.802981,-2.653537,4.937387
2,3,660.419,0.510517,0.450229,0.295058,0.830127,0.08972,2.380692,0.631172,0.638188,2.891304,0.257478,0.132827,4.713369,2.332677,1.576392
3,4,235.915,0.613708,0.138628,0.192015,1.421231,0.030847,2.831746,0.634826,0.28114,0.304809,0.026341,0.040782,1.870202,-0.961544,1.875061
4,5,2410.928,0.453991,0.110748,0.23172,2.153964,0.115528,3.193282,0.346641,0.128327,0.263611,0.144122,0.105678,1.014181,-2.179101,2.114455
5,6,1185.738,0.309987,0.158229,0.252314,0.245863,0.044974,1.066403,0.299132,0.283568,0.311383,0.06871,0.047847,1.013292,-0.053111,0.706127
6,7,1961.798,0.258601,0.013693,0.098341,0.618609,0.020055,1.061651,0.243261,0.009271,0.021991,0.033593,0.052352,0.360468,-0.701183,0.70298
7,8,385.218,0.39631,0.128151,0.089533,1.349898,0.056293,2.459484,0.174447,0.099164,0.029142,0.094131,0.038878,0.52204,-1.937444,1.628565
8,9,145.806,1.221597,0.082095,0.66766,3.565114,0.238481,5.801172,1.948343,0.338902,4.210829,1.187786,0.026225,7.712084,1.910912,3.841288


In [50]:
# Regional GDP per capita = regional total GDP / regional population.
# Populations are matched on the region id rather than on row position, so the
# result stays correct even if the two tables are ordered differently or one
# of them lacks a region (that region then gets NaN instead of another
# region's population).
# NOTE(review): assumes regreg['region'] uses the same ids as
# reg['region_id'] - both displayed tables run from 1 to 9; confirm.
population_by_region = regreg.set_index('region')['population_(millions)'] * 1e6
reg['absolute_GDP_per_capita'] = reg['total_GDP'] / reg['region_id'].map(population_by_region)

reg # Done

Unnamed: 0,region_id,total_GDP,world_share,absolute_GDP_per_capita
0,1,21278782551248,0.2116,39004.776069
1,2,27602540000000,0.2745,73963.17723
2,3,5672289212560,0.0563,8588.924929
3,4,1598972245892,0.0158,6777.747264
4,5,29464546127553,0.293,12221.246809
5,6,2006119005407,0.0195,1691.873757
6,7,4731789691073,0.0471,2411.965804
7,8,3955010966006,0.0392,10266.942266
8,9,2240420000000,0.0223,15365.759982


In [52]:
# Save the per-region GDP frame as CSV (index column omitted).
reg.to_csv('../clean_data/gross_domestic_product_GDPregions.csv', index = False)