In [1]:
import pandas as pd
import os
import numpy as np
import epiweeks
import datetime
import requests
import zipfile
import csv
from contextlib import ExitStack
import progressbar as pbar
import json
import time

In [2]:
# Run date as DD-MM-YYYY; it names the per-run output directory further down.
today = f"{datetime.date.today():%d-%m-%Y}"

In [3]:
# Download and unzip data file
# OSF file ids of the zipped data sets, keyed by data-set name.
ids = {'inputDB': '9dsfk',
       'output_5': '7tnfh',
       'output_10': '43ucn',
       'quality_metrics': 'qpfw5'}
# Everything below writes into data/ or data/Data/, so make sure both exist
# (a fresh checkout would otherwise fail on the very first open()).
os.makedirs('data/Data', exist_ok=True)
for data_name, data_id in ids.items():
    link = f'https://osf.io/{data_id}/download'
    print(link)
    downloaded_data = requests.get(link)
    # Stop on an HTTP error instead of saving an error page as tmp.zip and
    # failing later with a confusing BadZipFile.
    downloaded_data.raise_for_status()
    with open('data/tmp.zip', 'wb') as file:
        file.write(downloaded_data.content)
    with zipfile.ZipFile('data/tmp.zip', 'r') as zip_ref:
        zip_ref.extractall('data/')
    os.remove('data/tmp.zip')
    # Kept for backward compatibility; not read by any cell visible here.
    data_file = f'data/Data/{data_name}.csv'
# The population table is not zipped: the response body is saved as the CSV itself.
link = 'https://osf.io/unf6v/download'
print(link)
downloaded_data = requests.get(link)
downloaded_data.raise_for_status()
with open('data/Data/population.csv', 'wb') as file:
    file.write(downloaded_data.content)

https://osf.io/9dsfk/download
https://osf.io/7tnfh/download
https://osf.io/43ucn/download
https://osf.io/qpfw5/download
https://osf.io/unf6v/download


In [4]:
# Paths of the extracted CSV files, keyed by data-set name.
data_files = dict(
    inputDB='data/Data/inputDB.csv',
    output_5='data/Data/output_5.csv',
    output_10='data/Data/output_10.csv',
    quality_metrics='data/Data/qualityMetrics.csv',
    population='data/Data/population.csv',
)

In [5]:
# Find country names and the corresponding region names in the file -> {'country name': {'region name': 'region code'}}
# Besides `countries`, this scan also records:
#   measure        - the distinct values found in column 8
#   number_of_rows - total number of CSV rows, including the two leading lines
#   columns        - the second line of the file, reused as the header of all_raw.csv
countries = {}
measure = set()
number_of_rows = 0
columns = None
# newline='' is what the csv module expects of the files it reads.
with open(data_files['inputDB'], 'r', newline='') as file:
    reader = csv.reader(file)
    for index, row in enumerate(reader):
        number_of_rows += 1
        if index == 1:
            columns = row
        if index in (0, 1):
            continue
        country = row[0]
        # Rows whose first field is '1' are not data rows and are ignored.
        if country == '1':
            continue
        # Column 10 is the field prepare_data_inputDB writes out as 'location'.
        # Previously the first region of a country stored row[-2] and later
        # regions stored row[-1], so the codes were inconsistent.
        # NOTE(review): confirm column 10 is the intended region code.
        countries.setdefault(country, {})[row[1]] = row[10]
        measure.add(row[8])


In [6]:
# Preparing dir
# One output directory per run date, with one sub-directory per country.
data_dir_path = f'data/World/{today}'
# makedirs also creates the missing parent 'data/World' (os.mkdir would raise
# FileNotFoundError there) and exist_ok makes the cell safe to re-run.
os.makedirs(data_dir_path, exist_ok=True)
for country in countries:
    os.makedirs(f'{data_dir_path}/{country}', exist_ok=True)
# Persist the country -> {region: code} mapping next to the per-country folders.
with open(f'{data_dir_path}/countries.json', 'w') as file:
    json.dump(countries, file, indent=4)
print(f'number of countries: {len(countries)}')
# NOTE(review): presumably gives the printed line time to flush before the
# progress bars of the following cells start drawing - confirm it is needed.
time.sleep(1)

number of countries: 128


In [7]:
def prepare_data_inputDB():
    """Split inputDB.csv into per-country CSV files.

    For every key of the module-level ``countries`` mapping (except the
    placeholder keys '' and '1') four files are created under
    ``{data_dir_path}/{country}/``:

    * ``all_raw.csv`` - header ``columns`` followed by every row of that country;
    * ``raw_cases.csv``, ``raw_deaths.csv``, ``raw_tests.csv`` - rows whose
      column 7 is 'Count' and whose column 8 is 'Cases', 'Deaths' or 'Tests',
      reduced to date, location, location_name, sex, age group and value.

    The first two lines of the input and rows whose first field is '1' are
    skipped; an empty country field is replaced by 'USA'.

    Reads the globals ``countries``, ``columns``, ``number_of_rows``,
    ``data_dir_path`` and ``data_files``. Returns None.

    Raises:
        KeyError: if a row names a country without an entry in ``countries``.
        ValueError: if the date of a 'Count' row is not in ``%d.%m.%Y`` format.
    """
    file_names = ['raw_cases.csv', 'raw_deaths.csv', 'raw_tests.csv', 'all_raw.csv']
    count_header = ['date', 'location', 'location_name', 'sex', 'Age group', 'value']
    # Measure name -> position of the matching writer in each country's writer list.
    measure_slot = {'Cases': 0, 'Deaths': 1, 'Tests': 2}
    target_countries = [key for key in countries if key not in ('', '1')]
    widgets = ['Prepare inputDB.csv: ', pbar.Percentage(), ' ', pbar.Bar(marker='-',left='[',right=']'), ' ', pbar.ETA(), ' ', pbar.Timer()]
    bar = pbar.ProgressBar(widgets=widgets, maxval=number_of_rows + 1)
    bar.start()
    # ExitStack closes every per-country file (and the input) on exit, even on error.
    with ExitStack() as stack:
        writer_dict = {}
        for country in target_countries:
            # newline='' lets the csv module control line endings itself
            # (avoids blank lines between rows on Windows).
            handles = [stack.enter_context(open(f'{data_dir_path}/{country}/{file_name}', 'w', newline=''))
                       for file_name in file_names]
            writers = [csv.writer(handle, delimiter=',') for handle in handles]
            for count_writer in writers[:3]:
                count_writer.writerow(count_header)
            writers[3].writerow(columns)
            writer_dict[country] = writers
        read = stack.enter_context(open(data_files['inputDB'], 'r', newline=''))
        reader = csv.reader(read)

        for index, row in enumerate(reader):
            if index in (0, 1) or row[0] == '1':
                continue
            if row[0] == '':
                row[0] = 'USA'
            writers = writer_dict[row[0]]
            writers[3].writerow(row)
            if row[7] == 'Count' and row[8] in measure_slot:
                date = datetime.datetime.strptime(row[3], '%d.%m.%Y').strftime("%Y-%m-%d")
                # Country-level rows carry the region 'All'; label them with the country name.
                location_name = row[1] if row[1] != 'All' else row[0]
                row_to_save = [date, row[10], location_name, row[4], row[5], row[9]]
                writers[measure_slot[row[8]]].writerow(row_to_save)
            bar.update(index)
        bar.finish()

In [8]:
def prepare_data_output5():
    """Split output_5.csv into per-country cases/deaths/tests files.

    For every key of the module-level ``countries`` mapping (except the
    placeholder keys '' and '1') three files are created under
    ``{data_dir_path}/{country}/``. Each keeps columns 0-6 of the input plus
    one more column: column 7 for ``cases_5.csv``, column 8 for
    ``deaths_5.csv`` and column 9 for ``tests_5.csv``.

    The first three lines of the input are skipped, the fourth is written to
    every file as its header, and rows whose first field is '1' are ignored.
    An empty country field is replaced by 'USA'.

    Reads the globals ``countries``, ``number_of_rows``, ``data_dir_path`` and
    ``data_files``. Returns None.

    Raises:
        KeyError: if a row names a country without an entry in ``countries``.
    """
    file_names = ['cases_5.csv', 'deaths_5.csv', 'tests_5.csv']
    target_countries = [key for key in countries if key not in ('', '1')]
    # number_of_rows was counted on inputDB.csv, not on this file, so the
    # progress value is clamped below to keep bar.update() from raising.
    max_progress = number_of_rows + 1
    widgets = ['Prepare Output_5.csv: ', pbar.Percentage(), ' ', pbar.Bar(marker='-',left='[',right=']'), ' ', pbar.ETA(), ' ', pbar.Timer()]
    bar = pbar.ProgressBar(widgets=widgets, maxval=max_progress)
    bar.start()

    def write_split(writers, row):
        # Shared leading columns, then exactly one value column per file.
        writers[0].writerow(row[:8])
        writers[1].writerow(row[:7] + [row[8]])
        writers[2].writerow(row[:7] + [row[9]])

    # ExitStack closes every per-country file (and the input) on exit, even on error.
    with ExitStack() as stack:
        writer_dict = {}
        for country in target_countries:
            # newline='' lets the csv module control line endings itself
            # (avoids blank lines between rows on Windows).
            handles = [stack.enter_context(open(f'{data_dir_path}/{country}/{file_name}', 'w', newline=''))
                       for file_name in file_names]
            writer_dict[country] = [csv.writer(handle, delimiter=',') for handle in handles]

        read = stack.enter_context(open(data_files['output_5'], 'r', newline=''))
        reader = csv.reader(read)

        for index, row in enumerate(reader):
            if index in (0, 1, 2) or row[0] == '1':
                continue
            if index == 3:
                # Header line: replicate it into every country's files.
                for writers in writer_dict.values():
                    write_split(writers, row)
                continue
            if row[0] == '':
                row[0] = 'USA'
            write_split(writer_dict[row[0]], row)
            bar.update(min(index, max_progress))
        bar.finish()

In [9]:
def prepare_data_output10():
    """Split output_10.csv into per-country cases/deaths/tests files.

    For every key of the module-level ``countries`` mapping (except the
    placeholder keys '' and '1') three files are created under
    ``{data_dir_path}/{country}/``. Each keeps columns 0-6 of the input plus
    one more column: column 7 for ``cases_10.csv``, column 8 for
    ``deaths_10.csv`` and column 9 for ``tests_10.csv``.

    The first three lines of the input are skipped, the fourth is written to
    every file as its header, and rows whose first field is '1' are ignored.
    An empty country field is replaced by 'USA'.

    Reads the globals ``countries``, ``number_of_rows``, ``data_dir_path`` and
    ``data_files``. Returns None.

    Raises:
        KeyError: if a row names a country without an entry in ``countries``.
    """
    file_names = ['cases_10.csv', 'deaths_10.csv', 'tests_10.csv']
    target_countries = [key for key in countries if key not in ('', '1')]
    # number_of_rows was counted on inputDB.csv, not on this file, so the
    # progress value is clamped below to keep bar.update() from raising.
    max_progress = number_of_rows + 1
    widgets = ['Prepare Output_10.csv: ', pbar.Percentage(), ' ', pbar.Bar(marker='-',left='[',right=']'), ' ', pbar.ETA(), ' ', pbar.Timer()]
    bar = pbar.ProgressBar(widgets=widgets, maxval=max_progress)
    bar.start()

    def write_split(writers, row):
        # Shared leading columns, then exactly one value column per file.
        writers[0].writerow(row[:8])
        writers[1].writerow(row[:7] + [row[8]])
        writers[2].writerow(row[:7] + [row[9]])

    # ExitStack closes every per-country file (and the input) on exit, even on error.
    with ExitStack() as stack:
        writer_dict = {}
        for country in target_countries:
            # newline='' lets the csv module control line endings itself
            # (avoids blank lines between rows on Windows).
            handles = [stack.enter_context(open(f'{data_dir_path}/{country}/{file_name}', 'w', newline=''))
                       for file_name in file_names]
            writer_dict[country] = [csv.writer(handle, delimiter=',') for handle in handles]

        read = stack.enter_context(open(data_files['output_10'], 'r', newline=''))
        reader = csv.reader(read)

        for index, row in enumerate(reader):
            if index in (0, 1, 2) or row[0] == '1':
                continue
            if index == 3:
                # Header line: replicate it into every country's files.
                for writers in writer_dict.values():
                    write_split(writers, row)
                continue
            if row[0] == '':
                row[0] = 'USA'
            write_split(writer_dict[row[0]], row)
            bar.update(min(index, max_progress))
        bar.finish()

In [10]:
def prepare_data_population():
    """Split population.csv into one population.csv per country.

    For every key of the module-level ``countries`` mapping (except the
    placeholder keys '' and '1') a file ``{data_dir_path}/{country}/population.csv``
    is created. The first line of the input is skipped and the second is
    written to every file as its header. Each later row goes to the file of
    the country named in column 2; rows of other countries are dropped. A
    region (column 3) of 'All' is replaced by the country name.

    Reads the globals ``countries``, ``number_of_rows``, ``data_dir_path`` and
    ``data_files``. Returns None.
    """
    target_countries = [key for key in countries if key not in ('', '1')]
    # number_of_rows was counted on inputDB.csv, not on this file, so the
    # progress value is clamped below to keep bar.update() from raising.
    max_progress = number_of_rows + 1
    widgets = ['Prepare population.csv: ', pbar.Percentage(), ' ', pbar.Bar(marker='-',left='[',right=']'), ' ', pbar.ETA(), ' ', pbar.Timer()]
    bar = pbar.ProgressBar(widgets=widgets, maxval=max_progress)
    bar.start()
    # ExitStack closes every per-country file (and the input) on exit, even on error.
    with ExitStack() as stack:
        writer_dict = {}
        for country in target_countries:
            # newline='' lets the csv module control line endings itself
            # (avoids blank lines between rows on Windows).
            handle = stack.enter_context(open(f'{data_dir_path}/{country}/population.csv', 'w', newline=''))
            writer_dict[country] = csv.writer(handle, delimiter=',')

        read = stack.enter_context(open(data_files['population'], 'r', newline=''))
        reader = csv.reader(read)

        for index, row in enumerate(reader):
            # NOTE(review): rows whose first field is '1' are skipped exactly as
            # in the other prepare_* functions - confirm this never drops a
            # real data row of population.csv.
            if index == 0 or row[0] == '1':
                continue
            if index == 1:
                # Header line: replicate it into every country's file.
                for writer in writer_dict.values():
                    writer.writerow(row)
                continue
            current_country = row[2]
            # Test against writer_dict rather than countries: the placeholder
            # keys have no writer and would otherwise raise KeyError here.
            if current_country in writer_dict:
                if row[3] == 'All':
                    row[3] = current_country
                writer_dict[current_country].writerow(row)
            bar.update(min(index, max_progress))
        bar.finish()

In [11]:
# Preparation steps to run, in order; the commented-out entries are disabled.
enabled_steps = (
    # prepare_data_inputDB,
    prepare_data_output5,
    # prepare_data_output10,
    prepare_data_population,
)
for step in enabled_steps:
    step()

Prepare Output_5.csv: 100% [-------------] Time:  0:02:15 Elapsed Time: 0:02:15
Prepare population.csv: 100% [-----------] Time:  0:00:00 Elapsed Time: 0:00:00


In [12]:
# Reshape every country's population.csv in place: one row per age value and
# one column per region, holding the 'Population' values of the rows with Sex == 'b'.
for country in countries:
    # '' and '1' are placeholder keys for which prepare_data_population writes
    # no file, so reading them would raise FileNotFoundError.
    if country in ('', '1'):
        continue
    population_path = f'{data_dir_path}/{country}/population.csv'
    df = pd.read_csv(population_path)
    # The result overwrites the input, so on a second run the file is already
    # reshaped and has no 'Region' column any more; leave such files untouched.
    if 'Region' not in df.columns:
        continue
    new_df = pd.DataFrame({'Age': df['Age'].unique()})
    # NOTE(review): 'b' presumably marks both sexes combined - confirm.
    sex_b_rows = df[df['Sex'] == 'b']
    for i, region in enumerate(df['Region'].unique()):
        # insert() raises ValueError if a region does not have exactly one
        # value per age, which surfaces incomplete data instead of misaligning it.
        region_population = list(sex_b_rows.loc[sex_b_rows['Region'] == region, 'Population'])
        new_df.insert(i + 1, region, region_population, True)
    new_df.to_csv(population_path, index=False)