In [18]:
import pandas as pd
import difflib
# Raise pandas' display limits so wide/long tables are not truncated in cell output.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [19]:
# Load the raw rural-population table; header=1 takes the column names from the
# second line of the file (line index 1), skipping the line above it.
# NOTE(review): missing values show up as the literal '..' in the preview below, so the
# year columns presumably load as strings rather than floats — confirm before doing math on them.
df = pd.read_csv('../data/rural_population.csv', header=1)

In [20]:
# Preview the first rows of the raw frame to inspect its layout.
df.head()

Unnamed: 0,Time,Unnamed: 1,2013,2014,2015,2016,2017,2018,2019
0,Country,,,,,,,,
1,Afghanistan,,75.627,75.413,75.197,74.98,74.75,74.505,..
2,Albania,,44.613,43.577,42.566,41.579,40.617,39.681,..
3,Algeria,,30.424,29.779,29.152,28.541,27.948,27.371,..
4,American Samoa,,12.652,12.712,12.762,12.802,12.83,12.847,..


In [21]:
# Tidy the raw frame in one chained pass:
#   * the 'Time' column actually holds country names, so call it 'Country';
#   * row 0 only carries the 'Country' sub-header and no data;
#   * 'Unnamed: 1' and '2019' are removed from the output.
df = (
    df
    .rename(columns={'Time': 'Country'})
    .drop(index=0)
    .drop(columns=['Unnamed: 1', '2019'])
)
df.head()

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018
1,Afghanistan,75.627,75.413,75.197,74.98,74.75,74.505
2,Albania,44.613,43.577,42.566,41.579,40.617,39.681
3,Algeria,30.424,29.779,29.152,28.541,27.948,27.371
4,American Samoa,12.652,12.712,12.762,12.802,12.83,12.847
5,Andorra,11.463,11.559,11.655,11.752,11.85,11.938


In [22]:
# Reference table of country codes; the cells below read its
# 'official_name_en' and 'ISO3166-1-Alpha-3' columns.
countries = pd.read_csv('../data/country-codes_csv.csv')
def get_country_iso(name, cutoff=0.5, table=None):
    """Return the ISO 3166-1 alpha-3 code of the country whose official name best matches ``name``.

    The match is fuzzy (``difflib.get_close_matches``), so the result can be a
    *different* country with a similar spelling; callers should review the
    matches before trusting them.

    Parameters
    ----------
    name : str
        Country name to look up. Non-string values (e.g. NaN from an empty
        CSV cell) are treated as "no match" instead of crashing inside difflib.
    cutoff : float, default 0.5
        Minimum similarity ratio in [0, 1] a candidate must reach.
    table : pandas.DataFrame, optional
        Lookup table with 'official_name_en' and 'ISO3166-1-Alpha-3' columns.
        Defaults to the notebook-level ``countries`` frame.

    Returns
    -------
    str
        The alpha-3 code of the best match, or "" when nothing reaches ``cutoff``.
    """
    if table is None:
        table = countries

    # difflib compares character sequences; a float NaN would raise TypeError.
    if not isinstance(name, str):
        return ""

    # Rows without an official name cannot be matched, so leave them out.
    official_names = table['official_name_en'].dropna().unique()
    best = difflib.get_close_matches(name, official_names, n=1, cutoff=cutoff)
    if not best:
        return ""

    # First row carrying the winning name supplies the code.
    return table.loc[table['official_name_en'] == best[0], 'ISO3166-1-Alpha-3'].iloc[0]

In [23]:
# Fuzzy-match every country name to an ISO code, then show the rows that got
# no match at all (get_country_iso returns an empty string for those).
df['country_iso'] = df['Country'].map(get_country_iso)
df.loc[df['country_iso'].eq('')]

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,country_iso


In [24]:
# Translate each assigned ISO code back to its official name so the fuzzy match
# can be checked against the original 'Country' text.
# The code -> name mapping is built once instead of filtering `countries` for
# every row; codes that are missing from the table (e.g. the empty string for an
# unmatched country) become NaN rather than raising IndexError, and therefore
# still show up when 'Country' is compared with 'country_val'.
iso_to_name = (
    countries
    .dropna(subset=['ISO3166-1-Alpha-3'])
    .drop_duplicates(subset='ISO3166-1-Alpha-3')  # keep the first row per code, as before
    .set_index('ISO3166-1-Alpha-3')['official_name_en']
)
df['country_val'] = df['country_iso'].map(iso_to_name)

In [25]:
# Show the original name next to the matched ISO code and the matched official name.
df[['Country', 'country_iso', 'country_val']]

Unnamed: 0,Country,country_iso,country_val
1,Afghanistan,AFG,Afghanistan
2,Albania,ALB,Albania
3,Algeria,DZA,Algeria
4,American Samoa,ASM,American Samoa
5,Andorra,AND,Andorra
6,Angola,AGO,Angola
7,Anguilla,AIA,Anguilla
8,Antigua and Barbuda,ATG,Antigua and Barbuda
9,Argentina,ARG,Argentina
10,Armenia,ARM,Armenia


In [26]:
# Rows whose source name differs from the matched official name: these are the
# fuzzy matches that need a manual check.
df[df['Country'].ne(df['country_val'])]

Unnamed: 0,Country,2013,2014,2015,2016,2017,2018,country_iso,country_val
41,Channel Islands,68.973,69.005,69.038,69.07,69.086,69.086,CYM,Cayman Islands
69,Eswatini,77.001,76.851,76.7,76.541,76.375,76.201,EST,Estonia
71,Faeroe Islands,58.637,58.499,58.362,58.223,58.086,57.936,FRO,Faroe Islands
153,North Macedonia,42.811,42.716,42.592,42.437,42.252,42.037,NCL,New Caledonia
159,Palestine,25.137,24.887,24.632,24.372,24.106,23.836,PSE,State of Palestine
180,Saint-Barthélemy,..,..,..,..,..,..,BLM,Saint Barthélemy
181,Saint-Martin (French part),..,..,..,..,..,..,MAF,Saint Martin (French Part)


In [27]:
# Manual corrections for the wrong fuzzy matches listed in the mismatch table.
# Rows are selected by country name instead of hard-coded index labels, so the
# fixes keep hitting the intended rows if the source file's row order changes,
# and the cell can be re-run without a KeyError.
df.loc[df['Country'] == 'North Macedonia', 'country_iso'] = 'MKD'  # was matched to New Caledonia (NCL)

# Channel Islands and Eswatini were matched to other countries (CYM, EST); remove them
# so they do not collide with the real entries for those codes.
# NOTE(review): Eswatini's ISO 3166-1 alpha-3 code is SWZ — consider remapping it
# instead of dropping its data.
wrongly_matched = df['Country'].isin(['Channel Islands', 'Eswatini'])
df.drop(index=df.index[wrongly_matched], inplace=True)

In [28]:
# The name columns were only needed to verify the matching; keep just the
# year columns and the ISO code.
df = df.drop(columns=['Country', 'country_val'])

In [29]:
# Check which columns remain after dropping the name columns.
df.columns

Index(['2013', '2014', '2015', '2016', '2017', '2018', 'country_iso'], dtype='object')

In [30]:
# Display the wide frame (one row per country, one column per year) before reshaping.
df

Unnamed: 0,2013,2014,2015,2016,2017,2018,country_iso
1,75.627,75.413,75.197,74.98,74.75,74.505,AFG
2,44.613,43.577,42.566,41.579,40.617,39.681,ALB
3,30.424,29.779,29.152,28.541,27.948,27.371,DZA
4,12.652,12.712,12.762,12.802,12.83,12.847,ASM
5,11.463,11.559,11.655,11.752,11.85,11.938,AND
6,37.998,37.269,36.554,35.851,35.161,34.486,AGO
7,..,..,..,..,..,..,AIA
8,74.65,74.825,75,75.154,75.287,75.401,ATG
9,8.751,8.623,8.497,8.373,8.251,8.13,ARG
10,36.836,36.888,36.915,36.918,36.897,36.851,ARM


In [31]:
# Reshape wide -> long: one row per (country_iso, year) with the value in
# 'rural_population'.
df_new = pd.melt(
    df,
    id_vars=['country_iso'],
    var_name='year',
    value_name='rural_population',
)

In [32]:
# Lift the row cap so the complete long-format table is rendered below.
pd.options.display.max_rows = None
# Order rows by country code, then by year within each country.
df_new = df_new.sort_values(by=['country_iso', 'year'])
df_new

Unnamed: 0,country_iso,year,rural_population
10,ABW,2013,57.01
241,ABW,2014,56.959
472,ABW,2015,56.892
703,ABW,2016,56.808
934,ABW,2017,56.707
1165,ABW,2018,56.589
0,AFG,2013,75.627
231,AFG,2014,75.413
462,AFG,2015,75.197
693,AFG,2016,74.98


In [33]:
# '..' is the source's placeholder for "no data"; keep only rows with a real value.
df_new = df_new.loc[df_new['rural_population'].ne('..')]

In [34]:
# Display the long-format table after the '..' rows were removed.
df_new

Unnamed: 0,country_iso,year,rural_population
10,ABW,2013,57.01
241,ABW,2014,56.959
472,ABW,2015,56.892
703,ABW,2016,56.808
934,ABW,2017,56.707
1165,ABW,2018,56.589
0,AFG,2013,75.627
231,AFG,2014,75.413
462,AFG,2015,75.197
693,AFG,2016,74.98


In [35]:
# Write the converted table; index=False leaves the frame's row labels out of the file.
df_new.to_csv('../data/converted/rural_population.csv', index=False)

In [36]:
# Read the written file back and display it as a check of what was saved.
df_saved = pd.read_csv('../data/converted/rural_population.csv')
df_saved

Unnamed: 0,country_iso,year,rural_population
0,ABW,2013,57.01
1,ABW,2014,56.959
2,ABW,2015,56.892
3,ABW,2016,56.808
4,ABW,2017,56.707
5,ABW,2018,56.589
6,AFG,2013,75.627
7,AFG,2014,75.413
8,AFG,2015,75.197
9,AFG,2016,74.98
