In [39]:
import pandas as pd
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', 100)

In [40]:
import glob

def check_required_columns(df):
    return df.columns.contains('year') and df.columns.contains('country_iso')

def check_unicity(df):
    return df.groupby(['country_iso', 'year']).count().max().to_list()[0] <= 1

def sanity_check(df, filename):
    if not check_required_columns(df):
        print(f'{filename} does not have required columns: {df.columns}')
        return
    if not check_unicity(df):
        print(f'{filename} has duplicate country|year rows')
        return
    print(f'{filename} OK')

In [41]:
path = r'../data/converted' # use your path
all_files = glob.glob(path + "/*.csv")
dfs = []
for filename in all_files:
    dfs.append(pd.read_csv(filename))

In [42]:
for i, df in enumerate(dfs):
    sanity_check(df, all_files[i])

../data/converted/gii_human_capital.csv OK
../data/converted/gii_domestic_credit.csv OK
../data/converted/mobile_subscriptions.csv OK
../data/converted/broadband_subscriptions.csv OK
../data/converted/gii_ict_services_imports.csv OK
../data/converted/diversity_published.csv OK
../data/converted/literacy_rate.csv OK
../data/converted/electrification.csv OK
../data/converted/gii_scientific_publications.csv OK
../data/converted/books_published.csv OK
../data/converted/rd_in_gdp.csv OK
../data/converted/creative_services.csv OK
../data/converted/rural_population.csv OK
../data/converted/school_enrollment_tertiary.csv OK
../data/converted/gii_creative_services.csv OK
../data/converted/gii_rule_of_law.csv OK
../data/converted/population.csv OK
../data/converted/mortality_rate.csv OK
../data/converted/secure_internet_servers.csv OK
../data/converted/gii_institutions.csv OK
../data/converted/gii_top_level_domains.csv OK
../data/converted/gii_patent_applications.csv OK
../data/converted/feature

  after removing the cwd from sys.path.


In [43]:
dfs = []

merged = pd.read_csv(all_files[0], index_col=None)

for filename in all_files[1:]:
    df = pd.read_csv(filename, index_col=None)
    merged = merged.merge(df, on=['country_iso', 'year'], how='outer', sort=True)
    print('merged ok for ' + filename)

merged ok for ../data/converted/gii_domestic_credit.csv
merged ok for ../data/converted/mobile_subscriptions.csv
merged ok for ../data/converted/broadband_subscriptions.csv
merged ok for ../data/converted/gii_ict_services_imports.csv
merged ok for ../data/converted/diversity_published.csv
merged ok for ../data/converted/literacy_rate.csv
merged ok for ../data/converted/electrification.csv
merged ok for ../data/converted/gii_scientific_publications.csv
merged ok for ../data/converted/books_published.csv
merged ok for ../data/converted/rd_in_gdp.csv
merged ok for ../data/converted/creative_services.csv
merged ok for ../data/converted/rural_population.csv
merged ok for ../data/converted/school_enrollment_tertiary.csv
merged ok for ../data/converted/gii_creative_services.csv
merged ok for ../data/converted/gii_rule_of_law.csv
merged ok for ../data/converted/population.csv
merged ok for ../data/converted/mortality_rate.csv
merged ok for ../data/converted/secure_internet_servers.csv
merged o

In [45]:
merged.groupby(['country_iso', 'year'])['country_iso'].count().sort_values(ascending=False)

country_iso  year
ZWE          2020    1
GRC          1967    1
             1965    1
             1964    1
             1963    1
                    ..
NOR          1971    1
             1970    1
             1969    1
             1968    1
ABW          1960    1
Name: country_iso, Length: 15833, dtype: int64

In [53]:
merged.shape

(15833, 49)

In [58]:
merged['country_iso'].unique().size

271

In [64]:
merged[merged.isna().all(axis=1)].index.size

0

In [54]:
merged.to_csv('../data/converted/merged.csv', index=False)

In [55]:
df_saved = pd.read_csv('../data/converted/merged.csv')
df_saved

Unnamed: 0,year,gii_human_capital,country_iso,gii_domestic_credit,mobile_subscriptions,broadband_subscriptions_per100,gii_ict_services_imports,diversity_ethnicFractionalization,diversity_linguisticFractionalization,diversity_religiousFractionalization,literacy_rate,electrification,gii_scientific_publications,book_titles,rd_in_gdp,creative_svc_audiovisual,creative_svc_other_personal_cultural_recreational,creative_svc_advertising_mktresearch_polling,creative_svc_architectural_engineering_technical,creative_svc_personal_cultural_recreational,creative_svc_research,rural_population,school_enrollment_tertiary,gii_creative_services,gii_rule_of_law,population,mortality_rate,secure_internet_servers,gii_institutions,gii_top_level_domains,gii_patent_applications,feature_films_produced,gii_patent_families,gii_mobile_apps,gii_research_talent_in_business,cultural_occupation,gii_creative_goods,life_expectancy,ease_of_business,gii_wikipedia_edits,gii_ict_access,gii_stem_assessment,poverty_ratio,foreign_investment,total_hospital_density_per_100k,gii_university_industry,gii_innovation_output,gdp_per_capta_usd,global_innovation_index
0,1960,,ABW,,0.000000,,,,,,,,,,,,,,,,,,,,,54211.0,,,,,,,,,,,,,,,,,,,,,,,
1,1961,,ABW,,,,,,,,,,,,,,,,,,,,,,,55438.0,,,,,,,,,,,,,,,,,,,,,,,
2,1962,,ABW,,,,,,,,,,,,,,,,,,,,,,,56225.0,,,,,,,,,,,,,,,,,,,,,,,
3,1963,,ABW,,,,,,,,,,,,,,,,,,,,,,,56695.0,,,,,,,,,,,,,,,,,,,,,,,
4,1964,,ABW,,,,,,,,,,,,,,,,,,,,,,,57032.0,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15828,2016,,ZWE,,91.793458,1.217633,,,,,,39.892345,,,,,,,,,,67.704,,,,14030390.0,36.3,13.827128,,,,,,,,,,60.294,,,,,,1.669274,,,,1464.58353,
15829,2017,,ZWE,,98.985073,1.315694,,,,,,40.421368,,,,,,,,,,67.763,,,,14236745.0,35.4,30.554737,,,,,,,,,,60.812,,,,,,1.083538,,,,1602.40351,
15830,2018,,ZWE,,89.404869,1.406322,,,,,,,,,,,,,,,,67.791,,,,14439018.0,33.9,46.609818,,,,,,,,,,,,,,,,2.402015,,,,2146.99638,
15831,2019,27.8,ZWE,,,,0.9,,,,,,7.0,,,,,,,,,,,0.2,-1.4,,,,37.6,1.1,0.2,,0.0,,,,0.3,,140.0,0.3,3.6,,,,,2.6,15.4,,22.3
