In [26]:
import pandas as pd
import difflib
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

In [27]:
df = pd.read_csv('../data/creative_services.csv', skiprows=4)

In [28]:
df.head()

Unnamed: 0.1,Unnamed: 0,YEAR,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012
0,ECONOMY,CATEGORY,,,,,,,,,,
1,Afghanistan,"Advertising, market research and public opinio...",..,..,..,..,..,..,0.005589347,0.023603115,..,..
2,Afghanistan,"Architectural, engineering and other technical...",..,..,..,..,..,..,0.123334731,0.083703934,..,..
3,Afghanistan,Research and Development,..,..,..,..,..,..,7.15320346,0.174979874,..,..
4,Afghanistan,"Personal, cultural and recreational services",-,-,-,-,-,-,0.925616871,0.616532382,-,-


In [29]:
df.drop(0, inplace=True)

In [30]:
df.rename(columns={'Unnamed: 0': 'Country Name', 'YEAR': 'Category'}, inplace=True)

In [31]:
import difflib
countries = pd.read_csv('../data/country-codes_csv.csv')
def get_country_iso(name):
    p = difflib.get_close_matches(name, countries['official_name_en'].dropna().unique(), n=1, cutoff=0.5)
    if len(p)>0:
        return countries[countries['official_name_en']==p[0]]['ISO3166-1-Alpha-3'].to_list()[0]
    else:
        return ""

In [32]:
df['country_iso'] = [get_country_iso(x) for x in df['Country Name']]
df[df['country_iso']=='']

Unnamed: 0,Country Name,Category,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,country_iso
961,"Panama, excluding Canal Zone","Advertising, market research and public opinio...",_,_,_,_,_,_,_,_,_,_,
962,"Panama, excluding Canal Zone","Architectural, engineering and other technical...",_,_,_,_,_,_,_,_,_,_,
963,"Panama, excluding Canal Zone",Research and Development,_,_,_,_,_,_,_,_,_,_,
964,"Panama, excluding Canal Zone","Personal, cultural and recreational services",_,_,_,_,_,_,_,_,_,_,
965,"Panama, excluding Canal Zone",Audiovisual and related services,_,_,_,_,_,_,_,_,_,_,
966,"Panama, excluding Canal Zone","Other other personal, cultural and recreatio...",_,_,_,_,_,_,_,_,_,_,


For most datasets, we are considering only Panama. So we will drop its subdivisions:

In [33]:
df[df['Country Name'].str.contains('Panama')].groupby('Country Name')['country_iso'].last()

Country Name
Panama                          PAN
Panama, Canal Zone              PAN
Panama, excluding Canal Zone       
Name: country_iso, dtype: object

In [34]:
df.count()

Country Name    1422
Category        1422
2003            1422
2004            1422
2005            1422
2006            1422
2007            1422
2008            1422
2009            1422
2010            1422
2011            1422
2012            1422
country_iso     1422
dtype: int64

In [35]:
df.drop(df[df['Country Name'].str.contains('Panama,')].index, inplace=True)

In [36]:
df.count()

Country Name    1410
Category        1410
2003            1410
2004            1410
2005            1410
2006            1410
2007            1410
2008            1410
2009            1410
2010            1410
2011            1410
2012            1410
country_iso     1410
dtype: int64

In [37]:
df.columns[2:-1]

Index(['2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011',
       '2012'],
      dtype='object')

In [38]:
df_new = df[['Country Name', 'country_iso', 'Category', *df.columns[2:-1]]]

In [39]:
df_new_2 = df_new.melt(id_vars=["Country Name", "country_iso", "Category"], var_name="year", value_name="value")

In [40]:
df_new_2['value'].value_counts()

..             8048
_               960
-               851
0                90
0.265565063       3
               ... 
2.218395857       1
0.00508001        1
0.537042242       1
0.174790612       1
0.359301665       1
Name: value, Length: 3805, dtype: int64

In [41]:
import numpy as np
df_new_2['value'].replace('..', np.nan, inplace=True)
df_new_2['value'].replace('-', np.nan, inplace=True)
df_new_2['value'].replace('_', np.nan, inplace=True)
df_new_2['value'].value_counts()

0              90
0.018355191     3
0.265565063     3
0.527443547     2
0.264330129     2
               ..
0.00508001      1
0.537042242     1
0.174790612     1
0.41306658      1
0.359301665     1
Name: value, Length: 3802, dtype: int64

In [98]:
iso_count = df_new_2.groupby(['country_iso', 'year'])['year'].count()
wrong_isos = iso_count[iso_count>6]
wrong_isos.index.get_level_values(0).unique().to_list()

[]

In [99]:
group_countries = df_new_2.groupby('Country Name')[['Country Name', 'country_iso']].max()
group_countries[group_countries['country_iso'].isin(wrong_isos.index.get_level_values(0).unique().to_list())].sort_values('country_iso')

Unnamed: 0_level_0,Country Name,country_iso
Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1


In [95]:
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Germany, Democratic Republic of'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Germany, Federal Republic of'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Sudan (…2011)'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Indonesia (…2002)'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Ethiopia (…1991)'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Socialist Federative Republic of Yugoslavia'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Serbia and Montenegro'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Union of Soviet Socialist Republics'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Korea, Republic of'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Yemen, Arab Republic'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Czechoslovakia'].index, inplace=True)
df_new_2.drop(df_new_2[df_new_2['Country Name']=='Yemen, Democratic'].index, inplace=True)

In [96]:
df_new_2.loc[df_new_2['Country Name']=='Czech Republic','country_iso'] = 'CZE'
df_new_2.loc[df_new_2['Country Name']=='United Kingdom','country_iso'] = 'GBR'
df_new_2.loc[df_new_2['Country Name']=='Netherlands Antilles','country_iso'] = 'ANT'
df_new_2.loc[df_new_2['Country Name']=='China, Taiwan Province of','country_iso'] = 'TWN'

In [100]:
df_new_2[df_new_2['country_iso'].isin(wrong_isos.index.get_level_values(0).unique().to_list())]

Unnamed: 0,Country Name,country_iso,Category,year,value


In [101]:
df_new_3 = df_new_2[['country_iso', 'year', 'Category', 'value']]

In [102]:
df_new_3['tmp_idx'] = df_new_3.index

In [103]:
df_new_3.dropna(inplace=True)

In [104]:
df_new_3

Unnamed: 0,country_iso,year,Category,value,tmp_idx
6,ALB,2003,"Advertising, market research and public opinio...",0.008614593,6
7,ALB,2003,"Architectural, engineering and other technical...",0.052799117,7
9,ALB,2003,"Personal, cultural and recreational services",0.628587387,9
10,ALB,2003,Audiovisual and related services,0.009448263,10
11,ALB,2003,"Other other personal, cultural and recreatio...",0.619278069,11
...,...,...,...,...,...
14031,URY,2012,"Personal, cultural and recreational services",0.047325702,14031
14047,VEN,2012,"Architectural, engineering and other technical...",3.673469388,14047
14049,VEN,2012,"Personal, cultural and recreational services",0.362811791,14049
14050,VEN,2012,Audiovisual and related services,0.317460318,14050


In [105]:
df_new_4 = df_new_3.pivot(index='tmp_idx', columns='Category', values='value')

In [106]:
df_new_4

Category,Audiovisual and related services,"Other other personal, cultural and recreational services","Advertising, market research and public opinion polling","Architectural, engineering and other technical services","Personal, cultural and recreational services",Research and Development
tmp_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6,,,0.008614593,,,
7,,,,0.052799117,,
9,,,,,0.628587387,
10,0.009448263,,,,,
11,,0.619278069,,,,
...,...,...,...,...,...,...
14031,,,,,0.047325702,
14047,,,,3.673469388,,
14049,,,,,0.362811791,
14050,0.317460318,,,,,


In [110]:
df_new_4['country_iso'] = df_new_3['country_iso']
df_new_4['year'] = df_new_3['year']
df_new_4

Category,creative_svc_audiovisual,creative_svc_other_personal_cultural_recreational,creative_svc_advertising_mktresearch_polling,creative_svc_architectural_engineering_technical,creative_svc_personal_cultural_recreational,creative_svc_research,country_iso,year
tmp_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,,,0.008614593,,,,ALB,2003
7,,,,0.052799117,,,ALB,2003
9,,,,,0.628587387,,ALB,2003
10,0.009448263,,,,,,ALB,2003
11,,0.619278069,,,,,ALB,2003
...,...,...,...,...,...,...,...,...
14031,,,,,0.047325702,,URY,2012
14047,,,,3.673469388,,,VEN,2012
14049,,,,,0.362811791,,VEN,2012
14050,0.317460318,,,,,,VEN,2012


In [111]:
df_new_4.columns

Index(['creative_svc_audiovisual',
       'creative_svc_other_personal_cultural_recreational',
       'creative_svc_advertising_mktresearch_polling',
       'creative_svc_architectural_engineering_technical',
       'creative_svc_personal_cultural_recreational', 'creative_svc_research',
       'country_iso', 'year'],
      dtype='object', name='Category')

In [112]:
df_new_4.rename(columns={
    '  Audiovisual and related services': 'creative_svc_audiovisual',
    '  Other other personal, cultural and recreational services': 'creative_svc_other_personal_cultural_recreational',
    'Advertising, market research and public opinion polling': 'creative_svc_advertising_mktresearch_polling',
    'Architectural, engineering and other technical services': 'creative_svc_architectural_engineering_technical',
    'Personal, cultural and recreational services': 'creative_svc_personal_cultural_recreational',
    'Research and Development': 'creative_svc_research'
}, inplace=True)

In [113]:
df_new_4.columns

Index(['creative_svc_audiovisual',
       'creative_svc_other_personal_cultural_recreational',
       'creative_svc_advertising_mktresearch_polling',
       'creative_svc_architectural_engineering_technical',
       'creative_svc_personal_cultural_recreational', 'creative_svc_research',
       'country_iso', 'year'],
      dtype='object', name='Category')

In [114]:
df_new_5 = df_new_4[['country_iso', 'year', *df_new_4.columns[0:6]]]
df_new_5

Category,country_iso,year,creative_svc_audiovisual,creative_svc_other_personal_cultural_recreational,creative_svc_advertising_mktresearch_polling,creative_svc_architectural_engineering_technical,creative_svc_personal_cultural_recreational,creative_svc_research
tmp_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6,ALB,2003,,,0.008614593,,,
7,ALB,2003,,,,0.052799117,,
9,ALB,2003,,,,,0.628587387,
10,ALB,2003,0.009448263,,,,,
11,ALB,2003,,0.619278069,,,,
...,...,...,...,...,...,...,...,...
14031,URY,2012,,,,,0.047325702,
14047,VEN,2012,,,,3.673469388,,
14049,VEN,2012,,,,,0.362811791,
14050,VEN,2012,0.317460318,,,,,


In [117]:
df_new_5.groupby(['country_iso', 'year']).count().max()

Category
creative_svc_audiovisual                             1
creative_svc_other_personal_cultural_recreational    1
creative_svc_advertising_mktresearch_polling         1
creative_svc_architectural_engineering_technical     1
creative_svc_personal_cultural_recreational          1
creative_svc_research                                1
dtype: int64

In [118]:
df_new_5.to_csv('../data/converted/creative_services.csv', index=False)

In [119]:
df_saved = pd.read_csv('../data/converted/creative_services.csv')
df_saved

Unnamed: 0,country_iso,year,creative_svc_audiovisual,creative_svc_other_personal_cultural_recreational,creative_svc_advertising_mktresearch_polling,creative_svc_architectural_engineering_technical,creative_svc_personal_cultural_recreational,creative_svc_research
0,ALB,2003,,,0.008615,,,
1,ALB,2003,,,,0.052799,,
2,ALB,2003,,,,,0.628587,
3,ALB,2003,0.009448,,,,,
4,ALB,2003,,0.619278,,,,
...,...,...,...,...,...,...,...,...
4171,URY,2012,,,,,0.047326,
4172,VEN,2012,,,,3.673469,,
4173,VEN,2012,,,,,0.362812,
4174,VEN,2012,0.317460,,,,,
