In [35]:
import json
import pandas as pd
import os

# Read every JSON file in the "raw_data" folder and collect the parsed contents in `data`.
# os.listdir() returns names in arbitrary order, so they are sorted to make the load
# order (and anything derived from it, e.g. column order) the same on every run.
json_files = sorted(name for name in os.listdir('raw_data') if name.endswith('.json'))
data = []
for file in json_files:
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which differs between systems and can garble non-ASCII labels.
    with open(os.path.join('raw_data', file), encoding='utf-8') as f:
        data.append(json.load(f))

In [60]:
def normalize(string):
    """Return a canonical form of a format label.

    The label is stripped of surrounding whitespace and lower-cased, then at
    most one leading and one trailing "." are removed (so ".CSV" and "csv."
    both become "csv"). Whitespace exposed by removing the dots is stripped
    as well.

    Args:
        string: Raw label text.

    Returns:
        The normalized label. It may be the empty string (e.g. for ".").
    """
    # Strip *before* the dot checks: otherwise leading/trailing whitespace
    # hides a dot from startswith()/endswith() and the dot survives.
    ret = string.strip().lower()
    if ret.startswith("."):
        ret = ret[1:]
    if ret.endswith("."):
        ret = ret[:-1]
    return ret.strip()

def _merge_format_counts(portal):
    """Collapse one loaded file's entries into a single {label: value} mapping.

    Entries whose value is exactly an ``int`` are stored under the normalized
    label, and values landing on the same normalized label are added together.
    Any other entry is copied under its original key, except "country" and
    "source", which are left out.
    """
    merged = {}
    for label, amount in portal.items():
        if type(amount) == int:
            canonical = normalize(label)
            merged[canonical] = merged.get(canonical, 0) + amount
            continue
        if label in ("country", "source"):
            continue
        merged[label] = amount
    return merged


# Assemble the table: one column per loaded file (named after its "country"
# entry), one row per label.
df = pd.DataFrame()
for dict_formats in data:
    norm_dict = _merge_format_counts(dict_formats)
    country_counts = pd.Series(norm_dict).astype(int)
    country_df = pd.DataFrame(country_counts, columns=[dict_formats["country"]])
    # Outer join keeps every label seen in any file.
    df = df.join(country_df, how='outer')

# Labels absent from a file come out of the join as NaN; count them as 0.
df = df.fillna(0)
# "total" adds up every row except the "num_datasets" row.
format_rows = df.drop(index=["num_datasets"])
df.loc["total"] = format_rows.sum()
print(df)

                       Argentina  Australia   Brazil   Canada   Chile  \
                             3.0        0.0      0.0      0.0     0.0   
/ csv                        0.0        0.0      0.0      0.0     0.0   
/ pdf                        0.0        0.0      0.0      0.0     0.0   
00                           0.0        0.0      0.0      0.0     0.0   
01 / 2015 / csv              0.0        0.0      0.0      0.0     0.0   
...                          ...        ...      ...      ...     ...   
zip, contiene archi..        0.0        0.0      0.0      0.0     4.0   
zip-kml                      0.0        0.0      0.0      0.0     0.0   
zip/csv                      0.0        0.0      0.0      0.0     0.0   
zip/spreadsheet              0.0        0.0      0.0      0.0     0.0   
total                     5583.0   132979.0  39662.0  81506.0  4951.0   

                       Germany  Ghana    Japan   Mexico  Morocco  New Zealand  \
                       10686.0    0.0     

§Results of analysis: please note that format labels are in general not exclusive: a dataset can be marked "csv, json" and therefore contain both "csv" and "json" files.
The labels therefore indicate how many datasets contain a given format, but not the number of files in each format.

In [66]:
# Overall analysis: sum each row of `df` across all columns, then express every
# row as a ratio of the "num_datasets" row.
overall = df.sum(axis=1).astype(int)
new_df = overall.to_frame(name="Overall")
new_df["Percentage"] = new_df["Overall"] / new_df.loc["num_datasets", "Overall"]

# Note: an "all_others" row (the sum of every row except "num_datasets", "total"
# and the six largest formats: html, csv, pdf, xml, json, zip) was explored
# here earlier but is not computed.

# Show both columns, largest overall count first.
ranked = new_df[["Overall", "Percentage"]].sort_values(by="Overall", ascending=False)
print(ranked)

                 Overall  Percentage
total            1651038    2.105749
num_datasets      784062    1.000000
html              326446    0.416352
csv               245594    0.313233
pdf               151053    0.192654
...                  ...         ...
esri geoservice        1    0.000001
esri shape             1    0.000001
excell                 1    0.000001
feb 2015 / csv         1    0.000001
04 / 2015 / csv        1    0.000001

[426 rows x 2 columns]


Results of the analysis:
 We surveyed 17 open data portals from all five continents.
 In our survey, out of 784,062 publicly available datasets, the five most common formats are:
  - html: 326,446 (41.63\%)
  - csv: 245,594 (31.32\%)
  - pdf: 151,053 (19.26\%)
  - xml: 128,452 (16.38\%)
  - json: 65,008 (8.29\%)

