In [2]:
import pandas as pd
import difflib
# Raise the display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [60]:
# Load the raw export; header=1 takes the column names from the file's second line.
df = pd.read_csv(
    filepath_or_buffer='../data/cultural_occupation.csv',
    header=1,
)

In [61]:
# Preview the first rows to check how the header and columns were parsed.
df.head()

Unnamed: 0,Time,Unnamed: 1,2010,2011,2012,2013,2014,2015,2016
0,Country,,,,,,,,
1,Armenia,,..,..,..,..,0.94,0.5,..
2,Austria,,..,..,..,..,1.34,1.41,..
3,Belgium,,..,..,..,..,0.97,1.12,..
4,Bermuda,,0.74,..,..,..,..,..,..


In [63]:
# Tidy the raw header layout:
#   * the 'Time' column actually holds the country names -> rename to 'Country'
#   * row 0 only carries the literal "Country" label       -> drop it
#   * 'Unnamed: 1' appears empty in the preview            -> drop it
# Written as one chained reassignment with errors='ignore' so that re-running
# this cell is a no-op instead of raising KeyError on the already-removed
# row/column.
df = (
    df.rename(columns={'Time': 'Country'})
      .drop(index=0, errors='ignore')
      .drop(columns=['Unnamed: 1'], errors='ignore')
)
df.head()

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016
1,Armenia,..,..,..,..,0.94,0.5,..
2,Austria,..,..,..,..,1.34,1.41,..
3,Belgium,..,..,..,..,0.97,1.12,..
4,Bermuda,0.74,..,..,..,..,..,..
5,Bolivia (Plurinational State of),..,1.03,..,..,..,1.53,..


In [64]:
import difflib
# Reference table used below to translate country names into ISO codes.
countries = pd.read_csv(filepath_or_buffer='../data/country-codes_csv.csv')
def get_country_iso(name, country_table=None, cutoff=0.5):
    """Return the ISO 3166-1 alpha-3 code whose official English name best matches ``name``.

    Matching is fuzzy (``difflib.get_close_matches``), so a permissive
    ``cutoff`` can pair a name with the wrong country; review the results and
    correct mismatches by hand.

    Parameters
    ----------
    name : str
        Country name to look up.
    country_table : pandas.DataFrame, optional
        Table with ``official_name_en`` and ``ISO3166-1-Alpha-3`` columns.
        Defaults to the notebook-level ``countries`` frame.
    cutoff : float, optional
        Minimum similarity ratio (0-1) a candidate must reach. Default 0.5.

    Returns
    -------
    str
        The alpha-3 code of the best match, or ``""`` when ``name`` is not a
        string or no official name reaches ``cutoff``.
    """
    if country_table is None:
        country_table = countries
    # NaN / None cells would make difflib raise TypeError; treat them as "no match".
    if not isinstance(name, str):
        return ""
    official_names = country_table['official_name_en'].dropna().unique()
    matches = difflib.get_close_matches(name, official_names, n=1, cutoff=cutoff)
    if not matches:
        return ""
    best_name = matches[0]
    # If several rows share the matched name, the first one wins.
    codes = country_table.loc[
        country_table['official_name_en'] == best_name, 'ISO3166-1-Alpha-3'
    ]
    return codes.to_list()[0]

In [65]:
# Resolve every country name to an ISO code, then list the rows that got no match
# (an empty result means every name was matched to something).
df['country_iso'] = df['Country'].map(get_country_iso)
df.loc[df['country_iso'].eq('')]

Unnamed: 0,Country,2010,2011,2012,2013,2014,2015,2016,country_iso


In [66]:
# Map each matched ISO code back to its official name so the fuzzy matches can
# be checked against the original 'Country' column.
# The code -> name lookup is built once (first row wins for a repeated code)
# instead of rescanning `countries` for every row, and a code with no entry
# (e.g. the '' placeholder for an unmatched country) becomes NaN rather than
# raising IndexError.
iso_to_name = (
    countries.dropna(subset=['ISO3166-1-Alpha-3'])
             .drop_duplicates(subset='ISO3166-1-Alpha-3')
             .set_index('ISO3166-1-Alpha-3')['official_name_en']
)
df['country_val'] = df['country_iso'].map(iso_to_name)

In [67]:
# Show the source name next to the matched ISO code and the official name it
# resolved to, so that bad fuzzy matches can be spotted by eye.
df[['Country', 'country_iso', 'country_val']]

Unnamed: 0,Country,country_iso,country_val
1,Armenia,ARM,Armenia
2,Austria,AUT,Austria
3,Belgium,BEL,Belgium
4,Bermuda,BMU,Bermuda
5,Bolivia (Plurinational State of),BOL,Bolivia (Plurinational State of)
6,Bosnia and Herzegovina,BIH,Bosnia and Herzegovina
7,Brazil,BRA,Brazil
8,Brunei Darussalam,BRN,Brunei Darussalam
9,Bulgaria,BGR,Bulgaria
10,Canada,CAN,Canada


Row 44 (North Macedonia) was incorrectly matched to New Caledonia by the fuzzy name lookup. Setting it to the correct ISO code manually:

In [68]:
# Manual override for a bad fuzzy match: North Macedonia was resolved to New
# Caledonia. The row is located by name rather than by the hard-coded label 44,
# because assigning through .loc with a label that does not exist would
# silently append a new row instead of fixing the intended one.
is_macedonia = df['Country'].str.contains('Macedonia', na=False)
assert is_macedonia.sum() == 1, "expected exactly one Macedonia row to correct"
df.loc[is_macedonia, 'country_iso'] = 'MKD'

In [69]:
# The name columns were only needed to verify the matches; keep the ISO code.
helper_columns = ['Country', 'country_val']
df = df.drop(columns=helper_columns)

In [70]:
# Confirm which columns remain after dropping the name columns.
df.columns

Index(['2010', '2011', '2012', '2013', '2014', '2015', '2016', 'country_iso'], dtype='object')

In [71]:
# Display the wide table (one column per year) as it stands before reshaping.
df

Unnamed: 0,2010,2011,2012,2013,2014,2015,2016,country_iso
1,..,..,..,..,0.94,0.5,..,ARM
2,..,..,..,..,1.34,1.41,..,AUT
3,..,..,..,..,0.97,1.12,..,BEL
4,0.74,..,..,..,..,..,..,BMU
5,..,1.03,..,..,..,1.53,..,BOL
6,..,..,..,..,1.1,0.89,..,BIH
7,..,..,..,0.83,..,..,..,BRA
8,..,..,..,..,0.72,..,..,BRN
9,..,..,..,..,0.99,0.99,..,BGR
10,..,..,..,..,..,1.22,..,CAN


In [79]:
# Reshape wide -> long: one row per (country_iso, year) pair.
df_new = df.melt(
    id_vars=['country_iso'],
    var_name='year',
    value_name='cultural_occupation',
)

In [80]:
# Lift the row cap so the whole long table is shown, then order it by
# country and year for easier reading.
pd.set_option('display.max_rows', None)
sort_keys = ['country_iso', 'year']
df_new = df_new.sort_values(by=sort_keys)
df_new

Unnamed: 0,country_iso,year,cultural_occupation
0,ARM,2010,..
70,ARM,2011,..
140,ARM,2012,..
210,ARM,2013,..
280,ARM,2014,0.94
350,ARM,2015,0.5
420,ARM,2016,..
1,AUT,2010,..
71,AUT,2011,..
141,AUT,2012,..


In [82]:
# Keep only the rows that carry a value; '..' entries (no figure reported) are removed.
has_value = df_new['cultural_occupation'] != '..'
df_new = df_new[has_value]

In [88]:
# Display the long-format table after the '..' rows were removed.
df_new

Unnamed: 0,country_iso,year,cultural_occupation
280,ARM,2014,0.94
350,ARM,2015,0.5
281,AUT,2014,1.34
351,AUT,2015,1.41
282,BEL,2014,0.97
352,BEL,2015,1.12
288,BGR,2014,0.99
358,BGR,2015,0.99
285,BIH,2014,1.1
355,BIH,2015,0.89


In [89]:
# Persist the long-format table; the DataFrame index is not written.
converted_csv_path = '../data/converted/cultural_occupation.csv'
df_new.to_csv(converted_csv_path, index=False)

In [90]:
# Round-trip check: reload the file that was just written and display it.
df_saved = pd.read_csv(filepath_or_buffer='../data/converted/cultural_occupation.csv')
df_saved

Unnamed: 0,country_iso,year,cultural_occupation
0,ARM,2014,0.94
1,ARM,2015,0.5
2,AUT,2014,1.34
3,AUT,2015,1.41
4,BEL,2014,0.97
5,BEL,2015,1.12
6,BGR,2014,0.99
7,BGR,2015,0.99
8,BIH,2014,1.1
9,BIH,2015,0.89
