# Dataset: _reports_

In [None]:
from datetime import datetime
import pandas as pd
import requests, zipfile, io, json, re
from bs4 import BeautifulSoup
import urllib.parse

import src.utils as ut

# Root path of the application; reused below to build the data folders
project_path = ut.project_path()

# Candidate metadata files, listed in the order they are handed to the loader
# NOTE(review): the integer passed to ut.project_path presumably selects a
# different base folder for each candidate -- confirm in src.utils
meta_filename = []
meta_filename.append(f"{ut.project_path(1)}/meta/mosquito_alert/reports.json")
meta_filename.append(f"{ut.project_path(2)}/meta_ipynb/reports.html")

# Load the metadata describing this dataset
metadata = ut.load_metadata(meta_filename)

# Report on the loaded metadata (presumably including the contentUrl of each
# distribution -- verify against ut.info_meta)
ut.info_meta(metadata)

## 1. Distribution from Zenodo cloud

This dataset is updated nightly, and the most recent version can be downloaded
from Zenodo at https://doi.org/10.5281/zenodo.597466. This URL always
resolves to the most recent version of the data.

In [None]:
# Read the download URL and the dataset/distribution names of the first
# distribution (index 0) from the metadata
contentUrl, dataset_name, distr_name = ut.get_meta(
    metadata,
    idx_distribution=0,
    idx_hasPart=None,
)

# Destination folder for this distribution; create it before downloading
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)

In [None]:
# Download and open the zip container

# Get the latest zenodo file version of the dataset: request the landing page
# and look for the anchor that carries the "filename" class.
r = requests.get(contentUrl)
r.raise_for_status()  # fail early on HTTP errors instead of parsing an error page
file_link = BeautifulSoup(r.content, "html.parser").find("a", {"class": "filename"})
if file_link is None:
    # Without this check the lookup below fails with an opaque
    # "'NoneType' object is not subscriptable" TypeError.
    raise RuntimeError(
        f"No download link (<a class='filename'>) found on the page {r.url}"
    )
file_url = file_link["href"]
# The href may be relative, so resolve it against the final (redirected) URL
file_contentUrl = urllib.parse.urljoin(r.url, file_url)

# Download the dataset and open it as an in-memory zip archive
r_file = requests.get(file_contentUrl)
r_file.raise_for_status()
z = zipfile.ZipFile(io.BytesIO(r_file.content))

We have the option to extract all the report files into a distribution folder.

In [None]:
# Unpack every member of the downloaded archive into the distribution folder
z.extractall(path=path)

Alternatively, we can first concatenate all reports into a single dataframe
and then save it as a file.

In [None]:
# Merge all reports into a dataframe

# Archive members whose name contains "all_reports" hold the report records
df_reports = []
reports = [s for s in z.namelist() if "all_reports" in s]
for name in reports:
    # Use a context manager so each archive member handle is closed after reading
    with z.open(name) as f:
        d = json.load(f)
    df_reports.append(pd.DataFrame.from_records(d, coerce_float=True))

# One dataframe with the rows of every report file (original row indexes kept)
df = pd.concat(df_reports)
df.info()

Some report attributes are key-value, JSON-like data that need additional
tables to be fully understood (for example, tiger_responses). Since
translations are available in several languages, we use the language as the index.

In [None]:
# Locate the translation dictionary inside the archive and read it as raw bytes
reports_translation = [s for s in z.namelist() if "translation_dict" in s]

with z.open(reports_translation[0]) as f:
    r = f.read()

try:
    d = json.loads(r)
except ValueError:
    print("Warning: not a valid Json format. Try to get rid of trailing comma.")
    # Only attempt the repair when the first parse failed: drop a comma that
    # sits between a closing quote and a closing brace, then parse again.
    r = re.sub(r"\"\s*,\s*\}", '" }', r.decode("utf-8"))
    try:
        d = json.loads(r)
    except ValueError:
        print("Json format is still not valid.")
        # Re-raise so the cell stops here instead of building the dataframe
        # from an undefined or stale `d`.
        raise

# Top-level keys become the dataframe index
df_reports_translation = pd.DataFrame.from_dict(d, orient="index")
df_reports_translation.info()

In [None]:
# Persist the merged reports next to the downloaded distribution, both as
# parquet and as CSV; all output files share the same base name
filename = f"{path}/all_reports"

# Parquet output: very low file-size (need to install pyArrow)
df.to_parquet(f"{filename}.parquet")
# CSV output: x10 size if compared with the dataframe
df.to_csv(path_or_buf=f"{filename}.csv")

# Save reports translation on CSV
df_reports_translation.to_csv(path_or_buf=f"{filename}_translation.csv")

## 2. Distribution from MosquitoAlert Github repository

This dataset is also updated daily on GitHub and can be accessed from there.

In [None]:
# Read the download URLs and the dataset/distribution names of the second
# distribution (index 1) from the metadata
contentUrl, dataset_name, distr_name = ut.get_meta(
    metadata,
    idx_distribution=1,
    idx_hasPart=None,
)

# Destination folder for this distribution; create it before downloading
path = f"{project_path}/data/{dataset_name}/{distr_name}"
ut.makedirs(path)

In [None]:
# Request reports in json format and concatenate all of them into a dataframe

current_year = datetime.today().year  # all the reports until the current year

df_reports = []
# The upper bound is current_year + 1 because range() excludes its end value;
# without it the reports of the current year would never be downloaded.
for year in range(2014, current_year + 1):
    # contentUrl[0] is a URL template with a {YEAR} placeholder
    url = contentUrl[0].format(YEAR=str(year))
    r = requests.get(url)
    if r.status_code == 404:
        # The file of a year may not be published (yet); skip it rather than
        # failing while trying to decode an error page as JSON.
        print(f"No reports file found for year {year}, skipping: {url}")
        continue
    r.raise_for_status()
    d = r.json()
    df_reports.append(pd.DataFrame.from_records(d, coerce_float=True))

# One dataframe with the rows of every yearly file (original row indexes kept)
df = pd.concat(df_reports)
df.info()

In [None]:
# Request other support material of the reports and put them into dataframes
# Since multilanguage translations are available, we make language as index

# The last entry of contentUrl is used as the URL of the translation dictionary
url = contentUrl[-1]
r = requests.get(url)

try:
    d = r.json()
except ValueError:
    print("Warning: not a valid Json format. Try to get rid of trailing comma.")
    # Only attempt the repair when the first parse failed: drop a comma that
    # sits between a closing quote and a closing brace, then parse again.
    r = re.sub(r"\"\s*,\s*\}", '" }', r.text)
    try:
        d = json.loads(r)
    except ValueError:
        print("Json format is still not valid.")
        # Re-raise so the cell stops here instead of building the dataframe
        # from an undefined or stale `d`.
        raise

# Top-level keys become the dataframe index
df_reports_translation = pd.DataFrame.from_dict(d, orient="index")
df_reports_translation.info()

In [None]:
# Persist the merged reports next to the downloaded distribution, both as
# parquet and as CSV; all output files share the same base name
filename = f"{path}/all_reports"

# Parquet output: very low file-size (need to install pyArrow)
df.to_parquet(f"{filename}.parquet")
# CSV output: x10 size if compared with the dataframe
df.to_csv(path_or_buf=f"{filename}.csv")

# Save reports translation on CSV
df_reports_translation.to_csv(path_or_buf=f"{filename}_translation.csv")