In [2]:
import pandas as pd
# Raise the display limits so long and wide frames are shown in full rather
# than being elided with "..." in cell outputs.
pd.options.display.max_rows = 300
pd.options.display.max_columns = 100

In [3]:
import glob

def check_required_columns(df):
    """Tell whether ``df`` carries both key columns, ``year`` and ``country_iso``.

    Membership is tested with the ``in`` operator on the column index.
    ``Index.contains`` is deprecated and no longer exists in current pandas
    releases, so calling it raises ``AttributeError`` there.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    bool
        True only when both ``year`` and ``country_iso`` are column labels.
    """
    return 'year' in df.columns and 'country_iso' in df.columns

def check_unicity(df):
    """Tell whether every (``country_iso``, ``year``) pair occurs at most once.

    The check looks at the key columns only, via ``DataFrame.duplicated``.
    A count-based check on the value columns is not reliable here:
    ``groupby(...).count()`` counts non-null values, so repeated keys whose
    value cell is NaN go unnoticed, and a frame that has no value columns at
    all cannot be checked that way.

    Note that ``duplicated`` treats missing key values as equal to each
    other, so two rows with the same year and a missing ``country_iso`` are
    reported as duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``country_iso`` and ``year`` columns.

    Returns
    -------
    bool
        True when no key pair is repeated (including for an empty frame).
    """
    has_repeated_key = df.duplicated(subset=['country_iso', 'year']).any()
    # `not` converts the numpy boolean into a plain Python bool.
    return not has_repeated_key

def sanity_check(df, filename):
    """Print a one-line verdict for ``df``, labelled with ``filename``.

    The column check runs first; the uniqueness check is only evaluated when
    the required columns are present. Exactly one line is printed and nothing
    is returned.
    """
    if check_required_columns(df):
        if check_unicity(df):
            verdict = f'{filename} OK'
        else:
            verdict = f'{filename} has duplicate country|year rows'
    else:
        verdict = f'{filename} does not have required columns: {df.columns}'
    print(verdict)

In [4]:
# Directory holding the converted CSV files.
path = r'../data/converted' # use your path

# glob returns the matches in no guaranteed order; `dfs` is kept in the same
# order as `all_files` so that position i in one matches position i in the other.
all_files = glob.glob(f"{path}/*.csv")
dfs = [pd.read_csv(csv_file) for csv_file in all_files]

In [5]:
# Check every loaded frame, pairing it with the file it was read from.
for source_file, df in zip(all_files, dfs):
    sanity_check(df, source_file)

../data/converted/gii_human_capital.csv OK
../data/converted/gii_domestic_credit.csv OK
../data/converted/mobile_subscriptions.csv OK
../data/converted/broadband_subscriptions.csv OK
../data/converted/gii_ict_services_imports.csv OK
../data/converted/diversity_published.csv OK
../data/converted/literacy_rate.csv OK
../data/converted/electrification.csv OK
../data/converted/gii_scientific_publications.csv OK
../data/converted/books_published.csv OK
../data/converted/rd_in_gdp.csv OK
../data/converted/creative_services.csv OK
../data/converted/rural_population.csv OK
../data/converted/school_enrollment_tertiary.csv OK
../data/converted/gii_creative_services.csv OK
../data/converted/gii_rule_of_law.csv OK
../data/converted/population.csv OK
../data/converted/mortality_rate.csv OK
../data/converted/secure_internet_servers.csv OK
../data/converted/gii_institutions.csv OK
../data/converted/gii_top_level_domains.csv OK
../data/converted/gii_patent_applications.csv OK
../data/converted/feature

  after removing the cwd from sys.path.


In [120]:
# Count rows per (country_iso, year) in the frame at position 3 of `dfs`,
# then keep only the key pairs that occur more than once.
key_counts = dfs[3].groupby(['country_iso', 'year'])[['country_iso', 'year']].count()
key_counts[key_counts['year'] > 1]


Unnamed: 0_level_0,Unnamed: 1_level_0,country_iso,year
country_iso,year,Unnamed: 2_level_1,Unnamed: 3_level_1
CPV,2004,2,2
CPV,2005,2,2
CPV,2006,2,2
CPV,2007,2,2
CPV,2008,2,2
CPV,2009,2,2
CPV,2010,2,2
CPV,2011,2,2
CPV,2012,2,2
CPV,2013,2,2


In [125]:
# Show every row of the frame at position 3 of `dfs` whose country code is ZAF.
dd = dfs[3]
is_zaf = dd['country_iso'] == 'ZAF'
dd.loc[is_zaf]

Unnamed: 0,country_iso,year,broadband_subscriptions_per100
3291,ZAF,1998,0.277165
3292,ZAF,1999,1.080226
3293,ZAF,2000,2.715602
3294,ZAF,2001,4.953852
3295,ZAF,2001,0.004601
3296,ZAF,2001,0.004601
3297,ZAF,2002,0.172038
3298,ZAF,2002,7.346834
3299,ZAF,2002,0.007477
3300,ZAF,2002,0.007477


In [129]:
# Read the broadband file from ../data (not ../data/converted) so it can be
# compared with the converted frame. skiprows=4 drops the first four lines
# before the header row is parsed -- presumably a metadata preamble; TODO confirm.
orig = pd.read_csv('../data/broadband_subscriptions.csv', skiprows=4)

In [133]:
# Non-null cell count per column for each 'Country Code'. A value above 1
# would mean that code has more than one row with data in that column.
orig.groupby('Country Code').count()

Unnamed: 0_level_0,Country Name,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1
ABW,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,1,1,1,0,0,0,0
AFG,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0
AGO,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0
ALB,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0
AND,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
ARB,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
ARE,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
ARG,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
ARM,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0
ASM,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# NOTE(review): this empties the list of frames loaded earlier; cells that
# index into `dfs` will fail if run after this one. Presumably done to free
# memory -- confirm it is intentional.
dfs = []

# Columns every file is joined on.
merge_keys = ['country_iso', 'year']

# Start the wide table from the first file, then outer-join each remaining
# file onto it so that no (country_iso, year) pair from any file is lost.
# NOTE(review): if a file repeats a key pair, the join produces one output row
# per matching combination, so duplicates multiply rows in `merged`.
merged = pd.read_csv(all_files[0], index_col=None)

for filename in all_files[1:]:
    df = pd.read_csv(filename, index_col=None)
    merged = pd.merge(merged, df, on=merge_keys, how='outer', sort=True)
    print(f'merged ok for {filename}')

In [37]:
# Show the merged rows for country code POL.
is_pol = merged['country_iso'] == 'POL'
merged.loc[is_pol]

Unnamed: 0,year,gii_human_capital,country_iso,gii_domestic_credit,mobile_subscriptions,broadband_subscriptions_per100,gii_ict_services_imports,diversity_ethnicFractionalization,diversity_linguisticFractionalization,diversity_religiousFractionalization,literacy_rate,electrification,gii_scientific_publications,book_titles,rd_in_gdp,creative_svc_audiovisual,creative_svc_other_personal_cultural_recreational,creative_svc_advertising_mktresearch_polling,creative_svc_architectural_engineering_technical,creative_svc_personal_cultural_recreational,creative_svc_research,rural_population,school_enrollment_tertiary,gii_creative_services,gii_rule_of_law,population,mortality_rate,secure_internet_servers,gii_institutions,gii_top_level_domains,gii_patent_applications,feature_films_produced,gii_patent_families,gii_mobile_apps,gii_research_talent_in_business,cultural_occupation,gii_creative_goods,life_expectancy,ease_of_business,gii_wikipedia_edits,gii_ict_access,gii_stem_assessment,poverty_ratio,foreign_investment,total_hospital_density_per_100k,gii_university_industry,gii_innovation_output,gdp_per_capta_usd,global_innovation_index
13842,1960,,POL,,0.0,,,,,,,,,,,,,,,,,,,,,29637450.0,,,,,,,,,,,,,,,,,,,,,,,
13843,1961,,POL,,,,,,,,,,,,,,,,,,,,,,,29964000.0,,,,,,,,,,,,,,,,,,,,,,,
13844,1962,,POL,,,,,,,,,,,,,,,,,,,,,,,30308500.0,,,,,,,,,,,,,,,,,,,,,,,
13845,1963,,POL,,,,,,,,,,,,,,,,,,,,,,,30712000.0,,,,,,,,,,,,,,,,,,,,,,,
13846,1964,,POL,,,,,,,,,,,,,,,,,,,,,,,31139450.0,,,,,,,,,,,,,,,,,,,,,,,
13847,1965,,POL,,0.0,,,,,,,,,,,,,,,,,,,,,31444950.0,,,,,,,,,,,,,,,,,,,,,,,
13848,1966,,POL,,,,,,,,,,,,,,,,,,,,,,,31681000.0,,,,,,,,,,,,,,,,,,,,,,,
13849,1967,,POL,,,,,,,,,,,,,,,,,,,,,,,31987155.0,,,,,,,,,,,,,,,,,,,,,,,
13850,1968,,POL,,,,,,,,,,,,,,,,,,,,,,,32294655.0,,,,,,,,,,,,,,,,,,,,,,,
13851,1969,,POL,,,,,,,,,,,,,,,,,,,,,,,32548300.0,,,,,,,,,,,,,,,,,,,,,,,
