In [1]:
import pandas as pd
import numpy as np
import difflib
# Raise the display limits so long and wide frames are shown without truncation.
for option_name, option_value in (('display.max_rows', 300), ('display.max_columns', 100)):
    pd.set_option(option_name, option_value)

In [2]:
# Load the raw GDP table; header=1 takes the column names from the second line of the file.
df = pd.read_csv('../data/gdp.csv', header=1)

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,Time,2013,2014,2015,2016,2017,2018,2019
0,Country,,,,,,,
1,Afghanistan,637.16504,613.85633,578.46635,547.22811,556.30214,520.8966,..
2,Albania,4413.08289,4578.66793,3952.83078,4124.10854,4532.8892,5253.63006,..
3,Algeria,5499.58776,5493.05677,4177.88897,3948.8119,4048.28526,4278.85006,..
4,American Samoa,11505.39371,11525.15639,11843.33118,11714.89568,11398.77742,..,..


In [4]:
# The name column arrives labelled 'Time', and the row with index label 0 only
# carries the 'Country' caption, so rename the column and discard that row.
df = df.rename(columns={'Time': 'Country'}).drop(index=0)
df.head()

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,2019
1,Afghanistan,637.16504,613.85633,578.46635,547.22811,556.30214,520.8966,..
2,Albania,4413.08289,4578.66793,3952.83078,4124.10854,4532.8892,5253.63006,..
3,Algeria,5499.58776,5493.05677,4177.88897,3948.8119,4048.28526,4278.85006,..
4,American Samoa,11505.39371,11525.15639,11843.33118,11714.89568,11398.77742,..,..
5,Andorra,40626.75163,42300.33413,36039.6535,37224.10892,39134.39337,42029.76274,..


In [5]:
import difflib
# Reference table of country names and codes; the cells below read its
# 'official_name_en' and 'ISO3166-1-Alpha-3' columns.
countries = pd.read_csv('../data/country-codes_csv.csv')
def get_country_iso(name, cutoff=0.5, lookup=None):
    """Return the ISO 3166-1 alpha-3 code of the closest official country name.

    ``name`` is fuzzy-matched with ``difflib.get_close_matches`` against the
    non-null ``official_name_en`` values of the lookup table, and the code of
    the single best match is returned.

    Parameters
    ----------
    name : str
        Country name to resolve. Non-string or blank values (e.g. NaN from an
        empty CSV cell) are treated as unmatched instead of raising in difflib.
    cutoff : float, default 0.5
        Minimum similarity ratio (0..1) a candidate must reach. The default is
        permissive, so different countries can be matched to the same entry;
        always review the resulting codes.
    lookup : pandas.DataFrame, optional
        Table with ``official_name_en`` and ``ISO3166-1-Alpha-3`` columns.
        Defaults to the notebook-level ``countries`` frame.

    Returns
    -------
    str
        The alpha-3 code, or ``""`` when nothing matches or the matched name
        has no code.
    """
    table = countries if lookup is None else lookup

    # NaN / None / blank cells cannot be fuzzy-matched; report them as unmatched.
    if not isinstance(name, str) or not name.strip():
        return ""

    candidates = table['official_name_en'].dropna().unique()
    best = difflib.get_close_matches(name, candidates, n=1, cutoff=cutoff)
    if not best:
        return ""

    # First non-null code among the rows carrying the matched official name.
    codes = table.loc[table['official_name_en'] == best[0], 'ISO3166-1-Alpha-3'].dropna()
    return codes.iloc[0] if len(codes) else ""

In [6]:
# Resolve every country name to an ISO alpha-3 code, then list the rows that
# stayed unmatched (get_country_iso returns '' when it finds no match).
df['country_iso'] = df['Country'].map(get_country_iso)
df.loc[df['country_iso'].eq('')]

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,2019,country_iso


In [7]:
# Map each resolved ISO code back to its official name so the fuzzy matches can
# be checked by eye. A keyed lookup replaces the per-row scan of `countries` and
# yields NaN (instead of raising IndexError) for codes that are '' or unknown.
iso_to_official = (
    countries
    .dropna(subset=['ISO3166-1-Alpha-3'])
    .drop_duplicates(subset='ISO3166-1-Alpha-3')  # keep the first row per code
    .set_index('ISO3166-1-Alpha-3')['official_name_en']
)
df['country_val'] = df['country_iso'].map(iso_to_official)

In [8]:
# Side-by-side view of the source name, the assigned code and the matched official name.
df[['Country', 'country_iso', 'country_val']]

Unnamed: 0,Country,country_iso,country_val
1,Afghanistan,AFG,Afghanistan
2,Albania,ALB,Albania
3,Algeria,DZA,Algeria
4,American Samoa,ASM,American Samoa
5,Andorra,AND,Andorra
6,Angola,AGO,Angola
7,Anguilla,AIA,Anguilla
8,Antigua and Barbuda,ATG,Antigua and Barbuda
9,Argentina,ARG,Argentina
10,Armenia,ARM,Armenia


North Macedonia (row label 153 in `df`, the row patched in the next cell) was incorrectly matched to New Caledonia by the fuzzy name lookup. Assign it the correct ISO code:

In [9]:
# Correct the fuzzy-match error for North Macedonia. The row is selected by
# country name rather than by the hard-coded row label 153, which would silently
# patch a different country if the input file gained or lost rows.
is_north_macedonia = df['Country'].str.contains('Macedonia', na=False)
# Fail loudly rather than skip the fix if the source spelling is not what we expect.
assert is_north_macedonia.sum() == 1, "expected exactly one row whose Country contains 'Macedonia'"
df.loc[is_north_macedonia, 'country_iso'] = 'MKD'

In [16]:
# Count rows per ISO code; a code used more than once means two source countries
# were fuzzy-matched to the same entry and one of them needs a manual fix.
# NOTE(review): the saved output of this cell is empty while the next cell still
# lists EST twice; the execution counts suggest it was re-run after the manual
# fixes further down. Confirm the intended cell order.
iso_count = df.groupby(['country_iso'])['country_iso'].count()
wrong_isos = iso_count.loc[iso_count.gt(1)]
wrong_isos.index.tolist()

[]

In [14]:
# List the source countries whose assigned ISO code is shared with another country.
group_countries = df.groupby('Country')[['Country', 'country_iso']].max()
shares_iso = group_countries['country_iso'].isin(wrong_isos.index.to_list())
group_countries.loc[shares_iso].sort_values('country_iso')

Unnamed: 0_level_0,Country,country_iso
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Estonia,Estonia,EST
Eswatini,Eswatini,EST


In [12]:
# Remove the 'Channel Islands' row from the table.
# NOTE(review): the reason is not recorded here; presumably it has no ISO code of its own. Confirm.
df.drop(index=df.index[df['Country'].eq('Channel Islands')], inplace=True)

In [15]:
# Eswatini was fuzzy-matched to Estonia's code (EST); give it its own code.
df.loc[df['Country'].eq('Eswatini'), 'country_iso'] = 'SWZ'

In [17]:
# From here on the ISO code alone identifies a country; drop the name columns.
df = df.drop(columns=['Country', 'country_val'])

In [18]:
# Confirm that only the year columns and the ISO code remain.
df.columns

Index(['2013', '2014', '2015', '2016', '2017', '2018', '2019', 'country_iso'], dtype='object')

In [None]:
# Preview the wide table; a bare `df` would render every row into the notebook.
df.head()

In [19]:
# Reshape from one column per year (wide) to one row per country/year pair (long).
# NOTE(review): 'capta' looks like a typo for 'capita'; kept because the exported CSV uses this column name.
df_new = pd.melt(df, id_vars=['country_iso'], var_name='year', value_name='gdp_per_capta_usd')

In [20]:
pd.set_option('display.max_rows', None)
# Order by country and year, turn the '..' missing-value marker into NaN and drop
# those rows. The value column holds text because of the '..' markers, so convert
# it to real numbers; pd.to_numeric raises if any other non-numeric text is left,
# instead of letting it reach the exported file.
df_new = (
    df_new
    .sort_values(['country_iso', 'year'])
    .replace('..', np.nan)
    .dropna()
    .assign(gdp_per_capta_usd=lambda frame: pd.to_numeric(frame['gdp_per_capta_usd']))
)
df_new

Unnamed: 0,country_iso,year,gdp_per_capta_usd
10,ABW,2013,25025.09956
242,ABW,2014,25533.56978
474,ABW,2015,25796.38025
706,ABW,2016,25239.60041
938,ABW,2017,25630.26649
0,AFG,2013,637.16504
232,AFG,2014,613.85633
464,AFG,2015,578.46635
696,AFG,2016,547.22811
928,AFG,2017,556.30214


In [21]:
# Sanity check: every (country, year) pair must appear at most once.
# `duplicated` tests the key columns directly instead of going through a count of
# the value column, and also evaluates to True for an empty table.
not df_new.duplicated(subset=['country_iso', 'year']).any()

True

In [22]:
# Write the tidy table to disk; index=False leaves the row labels out of the file.
df_new.to_csv('../data/converted/gdp_per_capta.csv', index=False)

In [23]:
# Read the exported file back and display it to check the written result.
df_saved = pd.read_csv('../data/converted/gdp_per_capta.csv')
df_saved

Unnamed: 0,country_iso,year,gdp_per_capta_usd
0,ABW,2013,25025.09956
1,ABW,2014,25533.56978
2,ABW,2015,25796.38025
3,ABW,2016,25239.60041
4,ABW,2017,25630.26649
5,AFG,2013,637.16504
6,AFG,2014,613.85633
7,AFG,2015,578.46635
8,AFG,2016,547.22811
9,AFG,2017,556.30214
