#### CSE 6242 Data for Visual Analytics Innovation TeamTransparency group project 
#### Script to cleanse & convert Global Innovation Index (GII) CSV datasets
#### 2020-03-22 by Marc Boulet
#### Based on Ricardo Stamato's *book_conversion* Jupyter notebook

In [203]:
import pandas as pd
import difflib
# widen the notebook display limits so larger frames are shown in full
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [217]:
# name of the GII dataset to convert; also used later as the value column
# label and as the output file name
dataset = 'gii_creative_goods'
gii = pd.read_csv('../data/{}.csv'.format(dataset))

In [218]:
# preview the first rows of the raw dataset
gii.head()

Unnamed: 0,rank,Economy,Income Group(Strength/Weakness),Strength / Weakness,Value,Score
0,1,China,Strength,Strength,11.9,100.0
1,1,Czech Republic (the),Strength,Strength,10.1,100.0
2,1,"Hong Kong, China",Strength,Strength,9.9,100.0
3,1,Malaysia,Strength,Strength,9.8,100.0
4,1,Mexico,Strength,Strength,9.6,100.0


In [219]:
# work on an independent (deep) copy so the raw frame stays untouched
gii_new = gii.copy(deep=True)

In [220]:
# get country codes: lookup table read by get_country_iso, which uses its
# 'official_name_en' and 'ISO3166-1-Alpha-3' columns
countries = pd.read_csv('../data/country-codes_csv.csv')

In [221]:
def get_country_iso(name):
    """Return the ISO 3166-1 alpha-3 code best matching an economy name.

    The name is fuzzy-matched (difflib, single best candidate, cutoff 0.5)
    against the ``official_name_en`` column of the module-level ``countries``
    table, and the ``ISO3166-1-Alpha-3`` value of the matched row is returned.

    NOTE: the 0.5 cutoff is loose, so a confident-looking but wrong code can
    be returned (e.g. 'Czech Republic (the)' was matched to 'COD'). Results
    should be reviewed and corrected manually where needed.

    Args:
        name: economy/country name to look up.

    Returns:
        The three-letter ISO code, or "" when ``name`` is not a string
        (e.g. a missing value), no candidate reaches the cutoff, or the
        matched row has no ISO code.
    """
    # difflib needs a sequence; NaN/None from missing cells would raise TypeError
    if not isinstance(name, str):
        return ""

    official_names = countries['official_name_en'].dropna().unique()
    matches = difflib.get_close_matches(name, official_names, n=1, cutoff=0.5)
    if not matches:
        return ""

    # skip missing codes so callers can rely on "" as the only "not found" marker
    is_match = countries['official_name_en'] == matches[0]
    codes = countries.loc[is_match, 'ISO3166-1-Alpha-3'].dropna()
    if codes.empty:
        return ""
    return codes.iloc[0]

In [222]:
# drop excess columns, one at a time
for surplus_column in ('rank',
                       'Income Group(Strength/Weakness)',
                       'Strength / Weakness',
                       'Score'):
    gii_new.drop(columns=[surplus_column], inplace=True)

# prepend a constant year column (all data reflects 2019 values)
gii_new.insert(loc=0, column='year', value=2019, allow_duplicates=True)

# label the value column with the dataset name
gii_new.rename({'Value': dataset}, axis='columns', inplace=True)

In [223]:
# look up an ISO code for every economy name
gii_new['country_iso'] = gii_new['Economy'].map(get_country_iso)
# spot-check the first rows of the result
gii_new.head(10)

Unnamed: 0,year,Economy,gii_creative_goods,country_iso
0,2019,China,11.9,CHN
1,2019,Czech Republic (the),10.1,COD
2,2019,"Hong Kong, China",9.9,MNG
3,2019,Malaysia,9.8,MYS
4,2019,Mexico,9.6,MEX
5,2019,Thailand,8.7,THA
6,2019,Slovakia,8.5,SVK
7,2019,Philippines,7.0,PHL
8,2019,Hungary,6.1,HUN
9,2019,Viet Nam,5.9,VNM


In [225]:
# list the rows for which no country code could be matched
gii_new.loc[gii_new['country_iso'].eq('')]

Unnamed: 0,year,Economy,gii_creative_goods,country_iso


In [226]:
# manual overrides for economies whose country_iso was assigned improperly
iso_overrides = {
    'Czech Republic (the)': 'CZE',
    'Hong Kong, China': 'HKG',
    'North Macedonia': 'MKD',
}
for economy_name, iso_code in iso_overrides.items():
    gii_new.loc[gii_new['Economy'] == economy_name, 'country_iso'] = iso_code

In [227]:
# display the frame to QC the country_iso values after the manual corrections
gii_new

Unnamed: 0,year,Economy,gii_creative_goods,country_iso
0,2019,China,11.9,CHN
1,2019,Czech Republic (the),10.1,CZE
2,2019,"Hong Kong, China",9.9,HKG
3,2019,Malaysia,9.8,MYS
4,2019,Mexico,9.6,MEX
5,2019,Thailand,8.7,THA
6,2019,Slovakia,8.5,SVK
7,2019,Philippines,7.0,PHL
8,2019,Hungary,6.1,HUN
9,2019,Viet Nam,5.9,VNM


In [228]:
# QC is done, so the Economy names are no longer needed
del gii_new['Economy']

In [229]:
# export the converted dataset without the index column
gii_new.to_csv('../data/converted/{}.csv'.format(dataset), index=False)

In [230]:
# reload the exported file to confirm what was written
gii_saved = pd.read_csv('../data/converted/{}.csv'.format(dataset))
gii_saved

Unnamed: 0,year,gii_creative_goods,country_iso
0,2019,11.9,CHN
1,2019,10.1,CZE
2,2019,9.9,HKG
3,2019,9.8,MYS
4,2019,9.6,MEX
5,2019,8.7,THA
6,2019,8.5,SVK
7,2019,7.0,PHL
8,2019,6.1,HUN
9,2019,5.9,VNM
