## Notebook for statistics of availability of file types:

In this notebook we sample the availability of CSV files from 17 different countries from the 5 different continents.

In [1]:
import requests
from bs4 import BeautifulSoup
import json
import re

import os
if not os.path.exists("./raw_data/"):
    os.makedirs("./raw_data/")

### North America: USA, Canada, Mexico

- Usa statistics from catalog.data.gov
- Canadas statistics from https://search.open.canada.ca
- Mexico statistics from https://datos.gob.mx

<div>
    <div>
        <p>Statistics are in a tiny table on the left of the data.gov webpage. To obtain the total amount of datasets we query for the empty string "". Even though the page only shows the first 20 or so, we will crawl the information we need from the left table. To make sure the table contains all formats, we pass the parameter ``&_res_format_limit=0`` to the API.
</p>
    </div>
    <div>
        <img src="usa_formats.png" align:right>
    </div>

In [75]:
query = "https://catalog.data.gov/dataset/?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("div", {"class": "new-results"})
num_datasets = int(num_text.text.split()[0].replace(",", ""))
print("Total number of datasets", num_datasets)
# parse the list of formats with their  number from the <nav> with aria-label "Formats"


Total number of datasets 321550


In [76]:
formats = {"source": "data.gov", "country": "USA", "num_datasets": num_datasets}
for li in soup.find("nav", {"aria-label": "Formats"}).find_all("li"):
    format = li.find("span", {"class": "item-label"}).text
    formats[format] = int(li.find("span", {"class": "item-count badge"}).text.replace(",",""))

print(formats)
with open("./raw_data/usa_formats.json", "w") as f:
    json.dump(formats, f)

{'source': 'data.gov', 'country': 'USA', 'num_datasets': 321550, 'HTML': 167118, 'XML': 98080, 'PDF': 56947, 'CSV': 36146, 'TIFF': 35423, 'ZIP': 30243, 'TEXT': 29626, 'XYZ': 23161, 'KML': 20614, 'ArcGIS GeoServices REST API': 20285, 'JSON': 16842, 'GeoJSON': 16293, 'SID': 12835, 'JPEG': 11967, 'WMS': 11843, 'RDF': 11663, 'sos': 9335, 'EXCEL': 7948, 'Esri REST': 5911, 'NETCDF': 5308, 'WCS': 4880, 'application/unknown': 4388, 'PNG': 4381, 'WFS': 2998, 'QGIS': 2403, 'gml': 2371, 'API': 1617, 'application/vnd.geo+json': 1331, 'EXE': 1228, 'DOC': 1213, 'ArcGIS Online Map': 927, 'CDF': 896, 'application/html': 503, 'BIN': 249, 'GIF': 220, 'TAR': 219, 'Undefined': 209, 'GZ': 208, 'nc ': 197, 'chemical/x-mdl-sdfile': 192, '00': 174, 'POWERPOINT': 153, 'ArcGIS Map Preview': 149, 'ArcGIS Map Service': 149, 'ACCESS': 138, 'cdfnc ': 91, 'b0': 89, 'application/txt': 72, 'image/x-3ds': 65, 'Data Explorer': 63}


In [77]:
#repeat for canada
query = "https://search.open.canada.ca/en/od"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("p", text=lambda text: "Found" in text)
num_datasets = int(''.join(filter(str.isdigit, num_text.text)))
print("Total number of datasets", num_datasets)

Total number of datasets 34075


In [79]:
formats = {"source": "open.canada.ca", "country": "Canada", "num_datasets": num_datasets}
#find all <summary> items with the class "panel-heading"
for summary in soup.find_all("summary", {"class": "panel-heading"}):
    x = summary.find("h5", {"class": "panel-title"})
    if x.text == "Formats":
        # get the parent in the soup of this element
        panel = summary.parent.find("ul", {"class": "list-group"})
        for row in panel.find_all("div", {"class": "row"}):
            format = row.find("label").text
            num = row.find("span", {"class": "badge"}).text
                formats[format] = formats.get(format, 0) + int(num)

print(formats)
with open("./raw_data/canada_formats.json", "w") as f:
    json.dump(formats, f)

{'source': 'open.canada.ca', 'country': 'Canada', 'num_datasets': 34075, ' ASCII Grid ': 262, ' AVI ': 2, ' BAG ': 2, ' CDR ': 91, ' CSV ': 12139, ' DBF ': 1, ' DOC ': 75, ' DOCX ': 236, ' DXF ': 13, ' E00 ': 5, ' ECW ': 64, ' EDI ': 1881, ' ESRI REST ': 1471, ' EXE ': 3, ' FGDB/GDB ': 661, ' Flat raster binary ': 1, ' GDB ': 9, ' GEOJSON ': 715, ' GIF ': 97, ' GML ': 313, ' GPKG ': 106, ' GRD ': 2, ' GRIB2 ': 10, ' GeoPDF ': 1, ' GeoTIF ': 249, ' HDF ': 3, ' HTML ': 20041, ' IATI ': 1, ' JAR ': 1, ' JP2 ': 1227, ' JPG ': 699, ' JSON ': 292, ' JSONL ': 1, ' KML ': 934, ' KMZ ': 396, ' LAS ': 6, ' LYR ': 5, ' MXD ': 104, ' NetCDF ': 36, ' ODP ': 1, ' ODS ': 19, ' ODT ': 8, ' PDF ': 4490, ' PDF/A-1 ': 22, ' PDF/A-2 ': 2, ' PDF/UA ': 18, ' PNG ': 13, ' PPTX ': 8, ' RDF ': 617, ' RSS ': 861, ' RTF ': 14, ' SAS ': 7, ' SEGY ': 253, ' SHP ': 4335, ' SQL ': 1, ' SQLITE ': 20, ' TAB ': 39, ' TIFF ': 144, ' TXT ': 377, ' VPF ': 1, ' WCS ': 15, ' WFS ': 75, ' WMS ': 1329, ' WMTS ': 10, ' XLS ': 

In [2]:
#repeat for mexico
query = "https://datos.gob.mx/busca/dataset?_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200
#
soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("p", text=lambda text: "Datos" in text)
num_datasets = int(''.join(filter(str.isdigit, num_text.text)))
print("Total number of datasets", num_datasets)

Total number of datasets 9696


In [10]:
formats = {"source": "datos.gob.mx", "country": "Mexico", "num_datasets": num_datasets}
# find the <a> tag with the text "Formatos"
for x in soup.find_all("a"):
    if "Formatos" in x.text:
        if "Mostrar Solamente Populares" in x.text:
            continue
        lst = x.find_next_sibling("div")
        for li in lst.find_all("li"):
            try:
                format_text = li.find("a").text
                if "Mostrar Solamente Populares" in format_text:
                    continue
                format = format_text.split("(")[0].strip()
                num = format_text.split("(")[1].split(")")
                #extract the content of the last parenthesis in a string with a regular expression
                num = re.findall(r'\(([^)]+)', format_text)[-1]
                formats[format] = formats.get(format, 0) + int(num)
            except:
                print("Error parsing", format_text)

print(formats)
with open("./raw_data/mexico_formats.json", "w") as f:
    json.dump(formats, f)

CSV (4949)
4949
CSV (delimitado por comas) (9)
9
{'source': 'datos.gob.mx', 'country': 'Mexico', 'num_datasets': 9696, 'CSV': 4958, 'ZIP': 1580, 'vnd.ms-excel': 979, 'XLS': 877, 'PDF': 820, 'JSON': 816, 'XLSX': 551, 'vnd.google-earth.kml+xml': 159, 'SHP': 140, 'TXT': 109, 'KML': 99, 'vnd.google-earth.kmz': 32, 'XML': 29, 'kmz': 27, 'php': 23, 'DOCX': 22, 'RAR': 20, 'kml': 18, 'KMZ': 16, 'view': 14, 'DOC': 13, 'csv': 12, 'arcGis': 11, 'SQL': 11, 'ArcGIS': 10, 'CSV, EXCEL': 9, 'ODT': 8, 'Word': 8, 'MySQL': 7, 'GeoJSON': 6, 'HTML': 6, 'ODS': 6, 'Oracle': 6, 'ORACLE': 6, 'SHAPE': 6, '.csv': 5, 'oracle': 5, 'SQL Server 2012': 5, 'CSV, XLS, XML': 4, 'Excel': 7, 'Plataforma Joomla, Formato de los archivos .CSV': 4, 'shape': 4, 'ArcGis': 3, 'ARCGIS': 3, 'excel CSV': 3, 'Manejador de Bases de Datos MySQL.': 3, 'shp': 3, 'SQL Server': 3}


### South America: Brazil, Argentina, Chile, Colombia, Ecuador

- Brazil statistics from https://dados.gov.br
- Argentina statistics from https://datos.gob.ar
- Chile statistics from https://datos.gob.cl
- Colombia statistics from https://datos.gov.co

In [11]:
# repeat for brazil
query = "https://dados.gov.br/dataset?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("h3", text=lambda text: "dados encontrado" in text)
num_datasets = int(''.join(filter(str.isdigit, num_text.text)))
print("Total number of datasets", num_datasets)

Total number of datasets 13619


In [12]:
formats = {"source": "dados.gov.br", "country": "Brazil", "num_datasets": num_datasets}

for section in soup.find_all("section"):
    lst = section.find_all("h2")
    for h2 in lst:
        try:
            if "Formatos" not in h2.text:
                continue
            for a in section.find_all("a"):
                format_text = a.text
                if "Mostrar somente Formatos popular" in format_text:
                    continue
                format = format_text.split("(")[0].strip()
                num = format_text.split("(")[1].split(")")
                #extract the content of the last parenthesis in a string with a regular expression
                num = re.findall(r'\(([^)]+)', format_text)[-1]
                formats[format] = formats.get(format, 0) + int(num)
        except:
            print("Error parsing", format_text)

with open("./raw_data/brazil_formats.json", "w") as f:
    json.dump(formats, f)

In [13]:
# repeat for argentina
query = "http://datos.gob.ar/api/3/action/package_list"
r  =requests.get(query)
assert r.status_code == 200
ar_datasets = r.json()["result"]

num_datasets = len(ar_datasets)
print("Total number of datasets", num_datasets)

Total number of datasets 1129


In [14]:
# wrap this function in multiprocessing to speed up the process
def get_format(dataset):
    formats = {}
    query = "http://datos.gob.ar/api/3/action/package_show?id=" + dataset
    r  =requests.get(query)
    assert r.status_code == 200
    dataset = r.json()["result"]
    if "resources" in dataset:
        for resource in dataset["resources"]:
            if "format" in resource:
                format = resource["format"]
                formats[format] = formats.get(format, 0) + 1
    return formats

import joblib
n_jobs = 10
lst_formats = joblib.Parallel(n_jobs=4)(joblib.delayed(get_format)(dataset) for dataset in ar_datasets)

KeyboardInterrupt: 

In [208]:
formats = {}
for d in lst_formats:
    try:
        formats = {k: formats.get(k, 0) + d.get(k, 0) for k in set(formats) | set(d)}
    except:
        print("Error parsing", d)

formats["source"] = "datos.gob.ar"
formats["country"] = "Argentina"
formats["num_datasets"] = num_datasets

with open("./raw_data/argentina_formats.json", "w") as f:
    json.dump(formats, f)

In [18]:
#analysis for chile
query = "https://datos.gob.cl/dataset?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200
#
soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("h2", text=lambda text: "datos encontrados" in text)
num_datasets = int(''.join(filter(str.isdigit, num_text.text)))
print("Total number of datasets", num_datasets)

Total number of datasets 3591


In [20]:
formats = {"source": "datos.gob.cl", "country": "Chile", "num_datasets": num_datasets}
# find the <a> tag with the text "Formatos"
for x in soup.find_all("h3"):
    if "Formatos" not in x.text:
        continue
    nav = x.find_next_sibling("nav")
    for li in nav.find_all("li"):
        try:
            if "populares" in format_text:
                continue
            format_text = li.find("a").text
            format = " ".join(format_text.strip().split()[:-1])
            num = format_text.strip().split()[-1]
            formats[format] = formats.get(format, 0) + int(num)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

with open("./raw_data/chile_formats.json", "w") as f:
    json.dump(formats, f)

### Europe: UK, Germany, Spain

- UK statistics from https://data.gov.uk
- Germany statistics from https://www.govdata.de
- Spain statistics from https://datos.gob.es

In [2]:
# analysis with uk website
query = "https://data.gov.uk/api/3/action/package_list"
r  =requests.get(query)
assert r.status_code == 200
uk_datasets = r.json()["result"]
num_datasets = len(uk_datasets)
print("Total number of datasets", num_datasets)

Total number of datasets 52927


In [79]:
def get_format(dataset):
    formats = {}
    query = "https://data.gov.uk/api/3/action/package_show?id=" + dataset
    r  =requests.get(query)
    assert r.status_code == 200
    dataset = r.json()["result"]
    if "resources" in dataset:
        for resource in dataset["resources"]:
            if "format" in resource:
                format = resource["format"]
                formats[format] = formats.get(format, 0) + 1
    return formats

import joblib
n_jobs = 10
# lst_formats = joblib.Parallel(n_jobs=12)(joblib.delayed(get_format)(dataset) for dataset in uk_datasets)
from pqdm.processes import pqdm
result = pqdm([dataset for dataset in uk_datasets], get_format, n_jobs=n_jobs)

QUEUEING TASKS | :   0%|          | 0/52927 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/52927 [00:00<?, ?it/s]

Process ForkProcess-24:
Process ForkProcess-23:
Process ForkProcess-17:
Process ForkProcess-21:
Process ForkProcess-25:
Process ForkProcess-22:
Process ForkProcess-19:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):


In [98]:
lst_formats = result
formats = {}
for d in lst_formats:
    try:
        formats = {k: formats.get(k, 0) + d.get(k, 0) for k in set(formats) | set(d)}
    except:
        print("Error parsing", d)

formats["source"] = "data.gov.uk"
formats["country"] = "United Kingdom"
formats["num_datasets"] = num_datasets

with open("./raw_data/uk_formats.json", "w") as f:
    json.dump(formats, f)

In [25]:
# repeat for Germany
query = "https://www.govdata.de/web/guest/suchen/-/searchresult/s/relevance_desc"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
num_text = soup.find("h2", {"class": "hitscount"})
num_datasets = int(''.join(filter(str.isdigit, num_text.text)))
print("Total number of datasets", num_datasets)

Total number of datasets 63470


In [29]:
formats = {"source": "www.govdata.de", "country": "Germany", "num_datasets": num_datasets}
# find the <dl> tag that contains the <dt> tag with the class "filtergroup-title" and the text "Dateiformat"
for x in soup.find_all("dl"):
    filtergroups = x.find("dt", {"class": "filtergroup-title"})
    if not filtergroups or "Dateiformat" not in filtergroups.text:
        continue
    for dd in x.find_all("dd"):
        try:
            format_link = dd.find("a")
            # format is the <div> inside the <a> tag with the class "filtername"
            if not "weitere anzeigen" not in format_link.text:
                continue
            format = format_link.find("div", {"class": "filtername"}).text
            num = int(format_link.find("div", {"class": "filterresultcount"}).text)
            formats[format] = formats.get(format, 0) + int(num)
        except Exception as e:
            print(e)
            # print("Error parsing", format_text)

with open("./raw_data/germany_formats.json", "w") as f:
    json.dump(formats, f)

23285
23285
{'source': 'www.govdata.de', 'country': 'Germany', 'num_datasets': 63470, 'csv': 23285, 'pdf': 12461, '': 10686, 'html': 9437, 'wms': 6517, 'karte': 5868, 'webanwendung': 5867, 'zip': 5196, 'xlsx': 5168, 'json': 3145}


In [30]:
# repeat for spain
query = "https://datos.gob.es/es/catalogo?_res_format_label_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h2 in soup.find_all("h2"):
    try:
        if "datos encontrados" in h2.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h2.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)

Total number of datasets 61761


In [31]:
formats = {"source": "datos.gob.es", "country": "Spain", "num_datasets": num_datasets}
# find the <h2> tag that contains "Formato" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Formato" not in x.text:
        continue
    nav = x.parent.find_next_sibling("nav")
    for li in nav.find_all("li"):
        try:
            format_text = li.find("a").text
            if "populares" in format_text:
                continue
            format = " ".join(format_text.strip().split()[:-1])
            # get the number that is inbetween brackets in a string
            num = int(re.search(r'\((\d+)\)', format_text).group(1))
            formats[format] = formats.get(format, 0) + int(num)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

with open("./raw_data/spain_formats.json", "w") as f:
    json.dump(formats, f)

### Asia: Pakistan, Japan

- Pakistan statistics from https://opendata.com.pk
- Japan statistics from https://www.data.go.jp


In [None]:
# analysis for pakistan
query = "https://opendata.com.pk/dataset?_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h2 in soup.find_all("h2"):
    try:
        if "datasets found" in h2.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h2.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)

In [15]:
formats = {"source": "opendata.com.pk", "country": "Pakistan", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Formats" not in x.text:
        continue
    nav = x.parent.find("nav")
    for li in nav.find_all("li"):
        try:
            format = li.find("span", {"class":"item-label"}).text
            if "Popular" in format_text:
                continue
            num_text = li.find("span", {"class":"item-count"}).text
            num = int(re.search(r'\((\d+)\)', num_text).group(1))
            formats[format] = formats.get(format, 0) + int(num)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

with open("./raw_data/pakistan_formats.json", "w") as f:
    json.dump(formats, f)

In [24]:
# analysis for japan
query = "https://www.data.go.jp/data/en/dataset?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h2 in soup.find_all("h2"):
    try:
        if "datasets found" in h2.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h2.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)


Total number of datasets 22944


In [28]:
formats = {"source": "data.go.jp", "country": "Japan", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Formats" not in x.text:
        continue
    nav = x.parent.find("nav")
    for li in nav.find_all("li"):
        try:
            format = li.find("span", {"class":"item-label"}).text
            if "Popular" in format_text:
                continue
            num_text = int(li.find("span", {"class":"item-count"}).text)
            formats[format] = formats.get(format, 0) + int(num_text)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

with open("./raw_data/japan_formats.json", "w") as f:
    json.dump(formats, f)

{'source': 'data.go.jp', 'country': 'Japan', 'num_datasets': 22944, 'PDF': 10566, 'HTML': 6678, 'XLS': 5442, 'XLSX': 1802, 'ZIP': 801, 'CSV': 657, 'html': 523, 'pdf': 507, 'JPEG': 414, 'csv': 365, 'XML': 160, 'xls': 147, 'GIF': 90, 'KMZ': 73, 'PNG': 47, 'EXE': 36, 'TXT': 35, 'DOC': 30, 'DOCX': 24, 'xlsx': 17, 'PPTX': 12, 'lzh': 11, 'epub': 10, 'KML': 10, 'PPT': 7, 'asx': 6, 'mp3': 6, 'jtd': 5, 'php': 5, 'jsp': 1, 'ODT': 1, 'SHP': 1, 'zip': 1}


### Oceania: Australia, New Zealand

- Australia statistics from https://data.gov.au
- New Zealand statistics from https://catalogue.data.govt.nz

In [26]:
# analysis for australia
query = "https://data.gov.au/api/v0/search/datasets?start=0&limit=11&publishingState=published"
r  =requests.get(query)
assert r.status_code == 200
response = r.json()
num_datasets = response["hitCount"]
print("Total number of datasets", num_datasets)

formats = {"source": "https://data.gov.au", "country": "Australia", "num_datasets": num_datasets}
for x in response["facets"]:
    if x["id"] == "Format":
        for f in x["options"]:
            format_name = f["value"]
            format_count = f["hitCount"]
            formats[format_name] = int(format_count)

print(formats)
with open("./raw_data/australia_formats.json", "w") as f:
    json.dump(formats, f)

In [29]:
# analysis for new zealand
query = "https://catalogue.data.govt.nz/dataset/?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h2 in soup.find_all("h2"):
    try:
        if "datasets found" in h2.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h2.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)

Total number of datasets 31881


In [30]:
formats = {"source": "data.govt.nz", "country": "New Zealand", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Formats" not in x.text:
        continue
    nav = x.parent.find("nav")
    for li in nav.find_all("li"):
        try:
            format = li.find("span", {"class":"item-label"}).text
            if "Popular" in format_text:
                continue
            num_text = int(li.find("span", {"class":"item-count"}).text)
            formats[format] = formats.get(format, 0) + int(num_text)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

print(formats)
with open("./raw_data/new_zealand_formats.json", "w") as f:
    json.dump(formats, f)

{'source': 'data.govt.nz', 'country': 'New Zealand', 'num_datasets': 31881, 'KML': 18272, 'CSV': 16126, 'PDF': 14192, 'HTML': 13469, 'DWG': 12737, 'SHP': 10578, 'FileGDB': 10304, 'GPKG': 10304, 'MapInfo File': 10304, 'TIFF': 6975, 'GeoJSON': 5586, 'ZIP': 5553, 'GTiff': 4245, 'KEA': 4245, 'ArcGIS GeoServices ...': 4229, 'HFA': 3213, 'JPEG': 3211, 'JP2KAK': 3210, 'JP2KAK_LOSSLESS': 3210, 'MapInfo MIF': 2835, 'Esri REST': 2025, 'AAIGrid': 1032, 'XLSX': 564, 'XLS': 455, 'OGC WMS': 388, 'OGC WFS': 268, 'File Geodatabase Fe...': 184, '.xlsx': 118, 'API': 67, 'GIS': 59, 'Mixed': 54, 'Mesh Dataset': 52, 'RasterDataset': 49, 'Word': 39, 'File Geodatabase Ra...': 35, '.pdf': 31, 'excel workbook (*.x...': 30, 'GeoService API, Geo...': 29, 'XML': 27, 'xlsm': 23, 'Raster Dataset': 22, 'Zip/CSV': 16, 'geoservice api, geo...': 15, 'TXT': 13, 'XLSM': 13, '.csv': 12, 'SDE Feature Class': 12, '.xls': 10, 'Zip/Spreadsheet': 10, '.xlxs': 9}


### Africa: Morocco, Ghana, Somalia

- Morocco statistics from https://data.gov.ma
- Ghana statistics from https://data.gov.gh
- Somalia statistics from

In [None]:
# repeat for morocco
query = "https://data.gov.ma/data/fr/dataset?q=&_res_format_limit=0"
r  =requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h2 in soup.find_all("h2"):
    try:
        if "données trouvés" in h2.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h2.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)

In [34]:
formats = {"source": "data.gov.ma", "country": "Morocco", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Formats" not in x.text:
        continue
    nav = x.parent.find("nav")
    for li in nav.find_all("li"):
        try:
            format = li.find("span", {"class":"item-label"}).text
            num_text = int(li.find("span", {"class":"item-count"}).text)
            formats[format] = formats.get(format, 0) + int(num_text)
        except Exception as e:
            print(e)
            print("Error parsing", format_text)

print(formats)
with open("./raw_data/morocco_formats.json", "w") as f:
    json.dump(formats, f)

{'source': 'data.gov.ma', 'country': 'Morocco', 'num_datasets': 31881, 'XLSX': 212, 'XLS': 100, 'DOCX': 54, 'DOC': 16, 'CSV': 10, 'PDF': 6, 'PPTX': 6, '.docx': 1}


In [44]:
# analysis for ghana
query = "https://data.gov.gh/search?sort_by=changed"
r = requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for div in soup.find_all("div"):
    try:
        if "results" in div.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, div.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)
formats = {"source": "data.gov.gh", "country": "Ghana", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Format" not in x.text:
        continue
    div = x.find_next_sibling("div")
    for li in div.find_all("a"):
        format_text =li.text
        format = format_text.split(" (")[0]
        num = int(re.search(r'\((\d+)\)', format_text).group(1))
        formats[format] = formats.get(format, 0) + int(num)

print(formats)
with open("./raw_data/ghana_formats.json", "w") as f:
    json.dump(formats, f)

Total number of datasets 1308
{'source': 'data.gov.gh', 'country': 'Ghana', 'num_datasets': 1308, 'csv': 263, 'excel': 19, 'arcgis': 10, 'pdf': 7, 'xlsx': 4, 'argis': 1, 'xlb': 1, 'zip': 1}


In [9]:
# analysis for ghana
query = "https://www.data.gov.so/dataset"
r = requests.get(query)
assert r.status_code == 200

soup = BeautifulSoup(r.text, 'html.parser')
for h in soup.find_all("h2"):
    try:
        if "datasets found" in h.text.strip():
            num_datasets = int(''.join(filter(str.isdigit, h.text)))
    except AttributeError:
        pass

print("Total number of datasets", num_datasets)
formats = {"source": "www.data.gov.so", "country": "Somalia", "num_datasets": num_datasets}
# find the <h2> tag that contains "Format" and get the next "nav" element
for x in soup.find_all("h2"):
    if "Format" not in x.text:
        continue
    nav = x.find_next_sibling("nav")
    for li in nav.find_all("a"):
        format_text =li.text
        format = format_text.strip().split(" (")[0]
        num = int(re.search(r'\((\d+)\)', format_text).group(1))
        formats[format] = formats.get(format, 0) + int(num)

print(formats)
with open("./raw_data/somalia_formats.json", "w") as f:
    json.dump(formats, f)

Total number of datasets 8
{'source': 'www.data.gov.so', 'country': 'Somalia', 'num_datasets': 8, 'PDF': 8, 'CSV': 2, 'XLS': 1, 'XLSX': 1}
