# Data Cleaning
## 01 Import Libraries
## 02 Import Economy and Growth csv
## 03 Extract Relevant Rows for Study
### a) Extract required variables and copy the rows into a new dataframe
### b) Missing data and initial consistency checks
### c) Initial aggregations
### d) Interpolate Data
### e) Tidy Data Principles

## 01 Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## 02 Import Economy and Growth csv

In [3]:
# import dataframe

pathData = r'C:\Users\Michael\Desktop\Career Foundry\02 Data Immersion Course\06 Advanced Analytics and Dashboard Design\00 Data'
WB_econ = pd.read_csv(os.path.join(pathData, 'World Bank Economy and Growth', 'econgrowth.csv'))

## 03 Extract Relevant Rows for Study
### a) The file contains a column called "Indicator Code" and has an entry for each country. There are 254 indicators, but only the following four will be used for the study, as it is currentl understood:

| INDICATOR_CODE    | INDICATOR_NAME                             |
|:------------------|:-------------------------------------------|
| NY.GNP.PCAP.KD.ZG | GNI per capita growth (annual %)           |
| NY.GNP.PCAP.KD    | GNI per capita (constant 2015 US dollars)  |
| NY.GDP.PCAP.KD.ZG | GDP per capita growth (annual %)           |
| NY.GDP.PCAP.KD    | GDP per capita (constant 2015 US dollars)  |


The process to extract thesee will be as follows:
* All indicators other than these are to be dropped.
* The remaing will be saved as a new df (WB_encon_GDP_GNI)

In [4]:
# make list of the required Indicator Codes
econ_indicators = ["NY.GNP.PCAP.KD.ZG", "NY.GNP.PCAP.KD", "NY.GDP.PCAP.KD.ZG", "NY.GDP.PCAP.KD"]

In [5]:
# if the Indicators Variable is not in the econ_indicators list then the rown is to be dropped
# there new df is to be saved as WB_econ_GDP_GNI

WB_econ_GDP_GNI = WB_econ[WB_econ['Indicator Code'].isin(econ_indicators)]

In [6]:
#Check that the correct values have been selected
WB_econ_GDP_GNI.head(100)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
21,Aruba,ABW,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
22,Aruba,ABW,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,27224.83755,,,,,,,
41,Aruba,ABW,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,,,16.263941,18.866278,11.671573,2.195805,4.539317,2.370642,4.092615,4.800352,-0.567202,-1.830024,3.854744,-1.02709,-1.689537,4.918354,2.369193,-2.134006,0.117317,6.318777,-1.379148,-0.049084,1.838731,0.577088,-12.766043,-3.827013,2.400342,-1.833977,5.635318,-0.709567,2.885166,1.515938,4.917139,4.73833,0.181216,-18.698324,17.2253,
42,Aruba,ABW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,15221.85068,17697.52345,21036.38745,23491.66468,24007.49571,25097.27211,25692.23846,26743.7228,28027.51564,27868.54308,27358.54212,28413.14401,28121.31558,27646.19565,29005.93337,29693.13991,29059.48651,29093.57824,30931.9365,30505.33919,30490.36584,31051.00157,31230.19304,27243.3332,26200.7273,26829.63448,26337.58506,27821.7916,27624.3773,28421.38649,28852.23707,30270.9416,31705.27882,31762.73396,25823.63494,30271.83363,
275,Africa Eastern and Southern,AFE,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.363691,-2.443167,-0.576866,0.049662,1.440924,2.598914,1.942857,-0.735913,-0.304226,1.17945,0.470167,1.401799,1.504914,2.88457,3.350672,3.592209,3.219139,0.895101,-2.136842,0.783565,0.804886,-0.375513,1.282105,1.505432,0.405513,0.386531,-0.539126,-0.25241,-0.67991,-5.058131,1.662013,0.555244
276,Africa Eastern and Southern,AFE,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1222.81003,1193.906578,1164.737446,1158.018469,1158.59357,1175.288027,1205.832756,1229.260359,1220.214069,1216.501859,1230.849888,1236.636935,1253.9721,1272.843302,1309.559355,1353.438398,1402.056728,1447.190882,1460.144696,1428.943713,1440.140411,1451.731905,1446.280457,1464.823294,1486.875209,1492.904681,1498.675216,1490.595463,1486.833047,1476.723922,1402.029288,1425.331192,1433.245259
295,Africa Eastern and Southern,AFE,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,,-2.342974,5.093992,2.330796,1.724021,2.436215,1.09159,2.363152,1.097494,2.31429,1.721562,2.415494,-0.68631,1.492383,2.822419,-1.198882,-0.108534,-1.584489,-1.885533,-0.325151,2.362089,1.251224,-2.531377,-2.756441,0.079515,-3.301846,-0.712221,1.139694,1.062716,0.057761,-2.84668,-2.583363,-4.494923,-2.945245,-0.618975,1.664402,2.684745,1.806286,-0.655817,0.091279,0.750784,1.011096,1.26208,0.471842,2.789091,3.356423,3.754323,3.727576,1.505046,-2.035686,2.336313,0.942903,-1.844833,1.400457,1.186759,0.16684,-0.490259,-0.105886,-0.169822,-0.587557,-5.421662,1.683557,0.842507
296,Africa Eastern and Southern,AFE,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1179.058077,1151.433058,1210.086966,1238.291625,1259.640033,1290.327575,1304.412658,1335.237911,1349.89207,1381.132486,1404.90954,1438.845045,1428.970103,1450.295813,1491.229235,1473.351159,1471.752073,1448.432328,1421.121658,1416.500869,1449.959873,1468.102112,1430.938906,1391.49592,1392.602362,1346.620772,1337.029857,1352.267899,1366.638667,1367.428052,1328.501753,1294.181733,1236.009267,1199.605765,1192.180503,1212.023185,1244.562918,1267.043279,1258.73379,1259.882746,1269.341742,1282.176012,1298.358102,1304.484303,1340.867561,1385.872748,1437.902891,1491.50182,1513.949609,1483.130349,1517.780915,1532.092118,1503.827572,1524.888025,1542.984773,1545.559084,1537.98184,1536.353327,1533.744255,1524.732633,1442.06678,1466.344792,1478.698845
529,Afghanistan,AFG,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
530,Afghanistan,AFG,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,597.832321,,,,,,,


In [7]:
WB_econ_GDP_GNI.shape

(1064, 67)

### b) Missing data and initial consistency checks

In [8]:
# Finding 30% of the values for each year, to determine the years in which there is enough data
# Any years with more than 30% missing should be removed
0.3*1064

319.2

In [9]:
WB_econ_GDP_GNI.isnull().sum()

Country Name        0
Country Code        0
Indicator Name      0
Indicator Code      0
1960              927
1961              775
1962              767
1963              767
1964              765
1965              753
1966              735
1967              725
1968              716
1969              708
1970              672
1971              631
1972              629
1973              629
1974              626
1975              618
1976              609
1977              596
1978              587
1979              585
1980              566
1981              532
1982              514
1983              510
1984              507
1985              503
1986              494
1987              485
1988              479
1989              473
1990              446
1991              419
1992              412
1993              404
1994              391
1995              350
1996              315
1997              309
1998              303
1999              302
2000              285
2001      

Data in 2022 and prior to 1996 has too many missing values, these columns are to be removed and the study will cover the years 1996 to 2021
This is dependant on the other dataframes, as they may have other missing values and the reange may be compressed

In [10]:
for n in range(1960, 1996):
    WB_econ_GDP_GNI.drop(columns=[str(n)], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WB_econ_GDP_GNI.drop(columns=[str(n)], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WB_econ_GDP_GNI.drop(columns=[str(n)], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WB_econ_GDP_GNI.drop(columns=[str(n)], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WB_econ_G

In [11]:
WB_econ_GDP_GNI.drop(columns=['2022'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  WB_econ_GDP_GNI.drop(columns=['2022'], inplace=True)


In [12]:
WB_econ_GDP_GNI.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
21,Aruba,ABW,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,
22,Aruba,ABW,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,,,,,,,,,,,,,,,,,,,,27224.83755,,,,,,
41,Aruba,ABW,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,-1.830024,3.854744,-1.02709,-1.689537,4.918354,2.369193,-2.134006,0.117317,6.318777,-1.379148,-0.049084,1.838731,0.577088,-12.766043,-3.827013,2.400342,-1.833977,5.635318,-0.709567,2.885166,1.515938,4.917139,4.73833,0.181216,-18.698324,17.2253
42,Aruba,ABW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,27358.54212,28413.14401,28121.31558,27646.19565,29005.93337,29693.13991,29059.48651,29093.57824,30931.9365,30505.33919,30490.36584,31051.00157,31230.19304,27243.3332,26200.7273,26829.63448,26337.58506,27821.7916,27624.3773,28421.38649,28852.23707,30270.9416,31705.27882,31762.73396,25823.63494,30271.83363
275,Africa Eastern and Southern,AFE,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,2.598914,1.942857,-0.735913,-0.304226,1.17945,0.470167,1.401799,1.504914,2.88457,3.350672,3.592209,3.219139,0.895101,-2.136842,0.783565,0.804886,-0.375513,1.282105,1.505432,0.405513,0.386531,-0.539126,-0.25241,-0.67991,-5.058131,1.662013


In [13]:
# Change display to 2dp floats
pd.options.display.float_format = '{:.2f}'.format

In [14]:
# check the types and the NaN count (should be >744 non-null)
WB_econ_GDP_GNI.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1064 entries, 21 to 67352
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    1064 non-null   object 
 1   Country Code    1064 non-null   object 
 2   Indicator Name  1064 non-null   object 
 3   Indicator Code  1064 non-null   object 
 4   1996            749 non-null    float64
 5   1997            755 non-null    float64
 6   1998            761 non-null    float64
 7   1999            762 non-null    float64
 8   2000            779 non-null    float64
 9   2001            799 non-null    float64
 10  2002            813 non-null    float64
 11  2003            822 non-null    float64
 12  2004            829 non-null    float64
 13  2005            836 non-null    float64
 14  2006            845 non-null    float64
 15  2007            859 non-null    float64
 16  2008            870 non-null    float64
 17  2009            879 non-null    floa

All columns now contain >70% of the required data and can therefore be used in the study

In [15]:
# check for duplicates
WB_econ_GDP_GNI[WB_econ_GDP_GNI.duplicated()]

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021


No duplicates found

### c) Perform aggregations

In [16]:
# making a list of all the years to use later
years = []
for n in range(1996, 2022):
    years.append(str(n))
    
print(years)

['1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']


In [17]:
# find the max value for each type
WB_econ_GDP_GNI.groupby('Indicator Name').max(years)

Unnamed: 0_level_0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
GDP per capita (constant 2015 US$),108553.2,110117.36,113122.26,116188.55,120467.75,123182.95,124670.29,126299.11,129740.49,132590.35,140855.53,162131.63,163013.0,142187.19,141773.29,148269.76,146511.88,157254.65,165362.43,170338.6,174739.58,168744.92,178868.25,191194.47,166910.63,204190.97
GDP per capita growth (annual %),81.36,140.48,30.63,20.81,18.91,55.59,19.56,14.77,49.03,26.66,33.03,23.59,65.39,17.14,21.92,18.7,96.96,18.01,12.63,23.2,30.17,30.5,7.19,21.72,43.76,39.84
GNI per capita (constant 2015 US$),76650.03,83205.56,83609.63,86190.29,89157.16,93556.14,85721.49,78874.91,93640.0,95225.39,86177.26,102844.63,97533.21,80735.18,85565.98,82352.96,83360.08,83816.63,105205.95,138201.22,105507.96,111662.41,110034.92,108260.99,102480.03,107243.42
GNI per capita growth (annual %),11.6,16.42,15.0,12.98,21.27,15.29,19.63,14.74,21.85,18.3,38.24,32.17,44.62,16.03,21.41,19.71,15.61,30.57,46.26,20.64,11.51,9.24,13.0,14.63,5.05,36.7


All max values are consistent with expectation

In [18]:
# find the min value for each type
WB_econ_GDP_GNI.groupby('Indicator Name').min(years)

Unnamed: 0_level_0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
GDP per capita (constant 2015 US$),234.71,254.24,246.39,251.38,258.87,272.29,268.42,255.1,281.52,294.74,299.77,298.94,300.84,296.85,297.79,299.02,301.48,305.29,307.77,289.36,282.98,277.96,274.13,270.14,263.36,264.36
GDP per capita growth (annual %),-18.35,-13.59,-29.41,-11.2,-16.42,-11.61,-14.96,-38.56,-6.47,-14.51,-8.3,-13.92,-18.32,-17.15,-13.15,-47.9,-48.39,-36.78,-24.46,-29.92,-12.89,-9.03,-18.59,-12.43,-55.09,-22.93
GNI per capita (constant 2015 US$),229.4,249.78,266.32,291.5,284.01,304.6,306.92,293.7,294.58,290.04,297.67,297.62,300.02,293.99,296.15,296.67,300.63,305.51,306.9,289.39,282.82,278.13,274.79,271.04,265.09,265.28
GNI per capita growth (annual %),-20.76,-13.26,-9.9,-16.26,-14.74,-14.68,-16.0,-17.24,-8.03,-7.06,-11.2,-12.83,-15.15,-28.55,-26.78,-11.29,-13.69,-36.55,-19.63,-21.22,-21.1,-15.29,-8.37,-10.59,-39.77,-61.95


All min values are consistent with expectation

find the mean values of each year

In [19]:
WB_econ_GDP_GNI.groupby('Indicator Name').mean(years)

Unnamed: 0_level_0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
Indicator Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
GDP per capita (constant 2015 US$),10057.32,10502.37,10729.06,11000.06,11514.52,11639.9,11958.93,12202.62,12648.85,12959.17,13752.47,14236.28,14303.84,13731.5,13987.04,14260.26,14387.63,14441.55,14634.49,15430.5,15058.15,15302.08,15563.91,15780.76,14638.02,15225.43
GDP per capita growth (annual %),3.13,3.4,1.99,1.84,2.9,1.91,1.9,2.52,4.5,3.54,4.15,4.12,2.45,-1.39,3.04,2.35,1.87,1.66,1.9,1.36,1.77,1.96,1.85,1.61,-5.71,4.31
GNI per capita (constant 2015 US$),10052.39,10169.4,10435.66,10695.45,10882.91,11171.04,11149.96,11307.51,11683.07,11971.45,12019.74,12095.95,11858.05,11205.63,11501.74,11619.97,11732.23,11792.26,12360.73,13660.15,12617.01,12962.42,13127.06,13340.89,12640.18,13445.9
GNI per capita growth (annual %),2.73,2.84,1.99,1.48,2.21,2.26,1.88,2.65,4.31,4.14,4.79,4.66,2.44,-1.3,2.82,2.69,2.21,2.18,2.35,1.69,1.58,2.01,1.87,1.88,-4.99,3.59


The mean values are all consistent with the expectations of the variables

### d) Interpolate Data

There are many missing values, to get an estimated value for these data linear interpolation can be used.
Before interpolating a flag will be made to say that the row has had interpolated data.
Interpolation will only happen if there is data with no more than 5 consecutive years of the missing (20% of total timeframe).

In [20]:
WB_econ_GDP_GNI_interpolated = WB_econ_GDP_GNI[WB_econ_GDP_GNI.columns.intersection(years)]
WB_econ_GDP_GNI_interpolated.head()

Unnamed: 0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
21,,,,,,,,,,,,,,,,,,,,,,,,,,
22,,,,,,,,,,,,,,,,,,,,27224.84,,,,,,
41,-1.83,3.85,-1.03,-1.69,4.92,2.37,-2.13,0.12,6.32,-1.38,-0.05,1.84,0.58,-12.77,-3.83,2.4,-1.83,5.64,-0.71,2.89,1.52,4.92,4.74,0.18,-18.7,17.23
42,27358.54,28413.14,28121.32,27646.2,29005.93,29693.14,29059.49,29093.58,30931.94,30505.34,30490.37,31051.0,31230.19,27243.33,26200.73,26829.63,26337.59,27821.79,27624.38,28421.39,28852.24,30270.94,31705.28,31762.73,25823.63,30271.83
275,2.6,1.94,-0.74,-0.3,1.18,0.47,1.4,1.5,2.88,3.35,3.59,3.22,0.9,-2.14,0.78,0.8,-0.38,1.28,1.51,0.41,0.39,-0.54,-0.25,-0.68,-5.06,1.66


In [21]:
WB_econ_GDP_GNI_interpolated = WB_econ_GDP_GNI_interpolated.interpolate(method='linear',
                                                            axis=1,
                                                            inplace=False,
                                                            limit_direction = 'both',
                                                            limit_area=None)

In [22]:
# check the interpolated values
WB_econ_GDP_GNI_interpolated

Unnamed: 0,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
21,,,,,,,,,,,,,,,,,,,,,,,,,,
22,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84
41,-1.83,3.85,-1.03,-1.69,4.92,2.37,-2.13,0.12,6.32,-1.38,-0.05,1.84,0.58,-12.77,-3.83,2.4,-1.83,5.64,-0.71,2.89,1.52,4.92,4.74,0.18,-18.7,17.23
42,27358.54,28413.14,28121.32,27646.2,29005.93,29693.14,29059.49,29093.58,30931.94,30505.34,30490.37,31051.0,31230.19,27243.33,26200.73,26829.63,26337.59,27821.79,27624.38,28421.39,28852.24,30270.94,31705.28,31762.73,25823.63,30271.83
275,2.6,1.94,-0.74,-0.3,1.18,0.47,1.4,1.5,2.88,3.35,3.59,3.22,0.9,-2.14,0.78,0.8,-0.38,1.28,1.51,0.41,0.39,-0.54,-0.25,-0.68,-5.06,1.66
276,1205.83,1229.26,1220.21,1216.5,1230.85,1236.64,1253.97,1272.84,1309.56,1353.44,1402.06,1447.19,1460.14,1428.94,1440.14,1451.73,1446.28,1464.82,1486.88,1492.9,1498.68,1490.6,1486.83,1476.72,1402.03,1425.33
295,2.68,1.81,-0.66,0.09,0.75,1.01,1.26,0.47,2.79,3.36,3.75,3.73,1.51,-2.04,2.34,0.94,-1.84,1.4,1.19,0.17,-0.49,-0.11,-0.17,-0.59,-5.42,1.68
296,1244.56,1267.04,1258.73,1259.88,1269.34,1282.18,1298.36,1304.48,1340.87,1385.87,1437.9,1491.5,1513.95,1483.13,1517.78,1532.09,1503.83,1524.89,1542.98,1545.56,1537.98,1536.35,1533.74,1524.73,1442.07,1466.34
529,,,,,,,,,,,,,,,,,,,,,,,,,,
530,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83


In [23]:
# Re-attach the row headers
WB_econ_GDP_GNI_headers = WB_econ_GDP_GNI[
                                    WB_econ_GDP_GNI.columns.intersection([
                                                'Country Name',
                                                'Country Code',
                                                'Indicator Name',
                                                'Indicator Code'])]
WB_econ_GDP_GNI_headers.head(1064)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code
21,Aruba,ABW,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG
22,Aruba,ABW,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD
41,Aruba,ABW,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG
42,Aruba,ABW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD
275,Africa Eastern and Southern,AFE,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG
276,Africa Eastern and Southern,AFE,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD
295,Africa Eastern and Southern,AFE,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG
296,Africa Eastern and Southern,AFE,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD
529,Afghanistan,AFG,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG
530,Afghanistan,AFG,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD


In [24]:
# merge the index names with the interpolated years data
WB_econ_GDP_GNI_corrected = pd.merge(WB_econ_GDP_GNI_headers, WB_econ_GDP_GNI_interpolated, left_index=True, right_index=True)
WB_econ_GDP_GNI_corrected.head(50)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
21,Aruba,ABW,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,
22,Aruba,ABW,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84,27224.84
41,Aruba,ABW,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,-1.83,3.85,-1.03,-1.69,4.92,2.37,-2.13,0.12,6.32,-1.38,-0.05,1.84,0.58,-12.77,-3.83,2.4,-1.83,5.64,-0.71,2.89,1.52,4.92,4.74,0.18,-18.7,17.23
42,Aruba,ABW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,27358.54,28413.14,28121.32,27646.2,29005.93,29693.14,29059.49,29093.58,30931.94,30505.34,30490.37,31051.0,31230.19,27243.33,26200.73,26829.63,26337.59,27821.79,27624.38,28421.39,28852.24,30270.94,31705.28,31762.73,25823.63,30271.83
275,Africa Eastern and Southern,AFE,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,2.6,1.94,-0.74,-0.3,1.18,0.47,1.4,1.5,2.88,3.35,3.59,3.22,0.9,-2.14,0.78,0.8,-0.38,1.28,1.51,0.41,0.39,-0.54,-0.25,-0.68,-5.06,1.66
276,Africa Eastern and Southern,AFE,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,1205.83,1229.26,1220.21,1216.5,1230.85,1236.64,1253.97,1272.84,1309.56,1353.44,1402.06,1447.19,1460.14,1428.94,1440.14,1451.73,1446.28,1464.82,1486.88,1492.9,1498.68,1490.6,1486.83,1476.72,1402.03,1425.33
295,Africa Eastern and Southern,AFE,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,2.68,1.81,-0.66,0.09,0.75,1.01,1.26,0.47,2.79,3.36,3.75,3.73,1.51,-2.04,2.34,0.94,-1.84,1.4,1.19,0.17,-0.49,-0.11,-0.17,-0.59,-5.42,1.68
296,Africa Eastern and Southern,AFE,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1244.56,1267.04,1258.73,1259.88,1269.34,1282.18,1298.36,1304.48,1340.87,1385.87,1437.9,1491.5,1513.95,1483.13,1517.78,1532.09,1503.83,1524.89,1542.98,1545.56,1537.98,1536.35,1533.74,1524.73,1442.07,1466.34
529,Afghanistan,AFG,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,,,,,,,,,,,,,,,,,,,,,,,,,,
530,Afghanistan,AFG,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83,597.83


Dataframe has been successfully interpolated with a maximum 5 year limit for the economic data required for the study.

### e) Tidy Data Principles

MELT: Each year is an observation and should be in a column called "Year".

PIVOT: The four indicators (currently in 'Indicator Name') are separate variables, and should ben four columns, with the numerical values being the observation in these columns.

Country and Country Code are correctly column headers

The dataframe should eb presented in the following way:

(All values are per capita)

| Country | Year | GNI growth per cap (%) | GNI (2015 USD) per cap | GDP (%) per cap | GDP (2015 USD) per cap |
|:--------|:-----|:-----------------------|:-----------------------|:----------------|:-----------------------|
| Aruba   | 1996 | XXX                    | XXX                    | XXX             | XXX                    |
| Aruba   | 1997 | XXX                    | XXX                    | XXX             | XXX                    |
| Aruba   | 1998 | XXX                    | XXX                    | XXX             | XXX                    |
| Aruba   | 1999 | XXX                    | XXX                    | XXX             | XXX                    |

This will also make merging with other dataframes easier, and give greater versatility to the visualisations made in the future.


In [25]:
# First use a MELT to turn the years into a variable, the recorded value of the indicator will temporarirly be called "value":
WB_econ_GDP_GNI_melt = WB_econ_GDP_GNI_corrected.melt(['Country Name','Country Code','Indicator Name','Indicator Code'], var_name = "Year", value_name = "value")

In [26]:
WB_econ_GDP_GNI_melt.head(50)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,Year,value
0,Aruba,ABW,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,1996,
1,Aruba,ABW,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,1996,27224.84
2,Aruba,ABW,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,1996,-1.83
3,Aruba,ABW,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1996,27358.54
4,Africa Eastern and Southern,AFE,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,1996,2.6
5,Africa Eastern and Southern,AFE,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,1996,1205.83
6,Africa Eastern and Southern,AFE,GDP per capita growth (annual %),NY.GDP.PCAP.KD.ZG,1996,2.68
7,Africa Eastern and Southern,AFE,GDP per capita (constant 2015 US$),NY.GDP.PCAP.KD,1996,1244.56
8,Afghanistan,AFG,GNI per capita growth (annual %),NY.GNP.PCAP.KD.ZG,1996,
9,Afghanistan,AFG,GNI per capita (constant 2015 US$),NY.GNP.PCAP.KD,1996,597.83


The dataframe has had the years conrrectly converted to variables, and each year is an observation.

In [27]:
# Second use PIVOT to convert the Indicators into separate variables, and not observations
WB_econ_GDP_GNI_melt_pivot = WB_econ_GDP_GNI_melt.pivot_table(
                                index = ['Country Name', 'Year'], 
                                columns = 'Indicator Name', 
                                values = 'value').reset_index()

WB_econ_GDP_GNI_melt_pivot.head(500)

Indicator Name,Country Name,Year,GDP per capita (constant 2015 US$),GDP per capita growth (annual %),GNI per capita (constant 2015 US$),GNI per capita growth (annual %)
0,Afghanistan,1996,359.77,0.93,597.83,
1,Afghanistan,1997,359.77,0.93,597.83,
2,Afghanistan,1998,359.77,0.93,597.83,
3,Afghanistan,1999,359.77,0.93,597.83,
4,Afghanistan,2000,359.77,0.93,597.83,
5,Afghanistan,2001,359.77,0.93,597.83,
6,Afghanistan,2002,359.77,0.93,597.83,
7,Afghanistan,2003,363.1,0.93,597.83,
8,Afghanistan,2004,354.03,-2.5,597.83,
9,Afghanistan,2005,379.96,7.32,597.83,


The pivoting has correctly converted the data to the standardised Tidy Data norms.

In [28]:
# After the changes the dataframe haas been converted to a list of Tuples.
# This needs to be converted back to a dataframe.
WB_econ_GDP_GNI_melt_pivot = pd.DataFrame(WB_econ_GDP_GNI_melt_pivot)

In [29]:
WB_econ_GDP_GNI_melt_pivot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6786 entries, 0 to 6785
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Country Name                        6786 non-null   object 
 1   Year                                6786 non-null   object 
 2   GDP per capita (constant 2015 US$)  6734 non-null   float64
 3   GDP per capita growth (annual %)    6786 non-null   float64
 4   GNI per capita (constant 2015 US$)  6448 non-null   float64
 5   GNI per capita growth (annual %)    5356 non-null   float64
dtypes: float64(4), object(2)
memory usage: 318.2+ KB


## 04 Save data to a cleaned data folder

In [30]:
WB_econ_GDP_GNI_melt_pivot.to_csv(os.path.join(pathData, 'World Bank Cleaned', 'econgrowth_clean.csv'))