# Data Preprocessing 

In this notebook we clean and prepare the data for analysis.

In [1]:
import os
from dateutil.parser import parse 
import numpy as np
import pandas as pd
from itertools import chain

# Covid Data -  'Active', 'Incident_Rate' and 'Case_Fatality_Ratio'

We sourced the data from the Johns Hopkins COVID-19 GitHub repository, which already aggregates and cleans data from multiple sources.

In [2]:
# Get files to process
covid_dir = os.path.join(os.pardir, 'data', 'raw', 'covid19', 'daily_reports')
# os.listdir returns entries in arbitrary order and lists every entry in the
# directory. Keep only the CSV reports (any other name would fail the date
# parsing done on the file name) and sort them so each run processes the files
# in the same order.
daily_reports = sorted(name for name in os.listdir(covid_dir) if name.endswith('.csv'))

# Per-country aggregation applied to each daily report: Confirmed, Deaths and
# Recovered are summed; Active, Incident_Rate and Case_Fatality_Ratio are averaged.
agg = {'Confirmed':'sum', 'Deaths':'sum', 'Recovered':'sum','Active':'mean','Incident_Rate':'mean','Case_Fatality_Ratio':'mean'}
# Columns a daily report must contain to be usable in this section.
columns = ['Country_Region', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Incident_Rate', 'Case_Fatality_Ratio']

def parse_daily_report(daily_report):
    """Load one daily report and aggregate it to one row per country.

    Parameters
    ----------
    daily_report : str
        File name inside ``covid_dir``. The part before the first ``.`` is
        parsed as the report date.

    Returns
    -------
    pandas.DataFrame or None
        The columns listed in ``columns``, grouped by ``Country_Region`` and
        aggregated with ``agg``, plus ``year`` and ``month`` columns taken
        from the parsed file name. ``None`` when the file lacks any of the
        required columns.
    """
    report_date = parse(daily_report.split('.')[0])
    print(f'parsing file {daily_report}')

    report = pd.read_csv(os.path.join(covid_dir, daily_report))

    # A report missing any required column cannot be aggregated here.
    if not set(columns).issubset(report.columns):
        return None

    per_country = (
        report[columns]
        .groupby(['Country_Region'])
        .aggregate(agg)
        .reset_index()
    )
    return per_country.assign(year=report_date.year, month=report_date.month)

# Parse every daily report. parse_daily_report returns None for files that lack
# the required columns; those are left out. The check must be `is not None`:
# comparing `type(...)` with None is always true and filters nothing.
parsed_reports = []
for daily_report in daily_reports:
    tmp_pdf = parse_daily_report(daily_report)

    if tmp_pdf is not None:
        parsed_reports.append(tmp_pdf)

if not parsed_reports:
    raise ValueError('no daily report contained the required columns')

# Concatenate once instead of growing the frame on every loop iteration.
covid_df = pd.concat(parsed_reports)

# Per period, keep the maximum of Confirmed/Deaths/Recovered and the mean of the
# other columns (presumably because the counts are cumulative -- TODO confirm).
period_agg = {'Confirmed':'max', 'Deaths':'max', 'Recovered':'max','Active':'mean','Incident_Rate':'mean','Case_Fatality_Ratio':'mean'}

covid_year_df = covid_df.groupby(['Country_Region', 'year']).aggregate(period_agg).reset_index()
covid_month_df = covid_df.groupby(['Country_Region', 'year', 'month']).aggregate(period_agg).reset_index()


parsing file 01-01-2021.csv
parsing file 01-02-2021.csv
parsing file 01-03-2021.csv
parsing file 01-04-2021.csv
parsing file 01-05-2021.csv
parsing file 01-06-2021.csv
parsing file 01-07-2021.csv
parsing file 01-08-2021.csv
parsing file 01-09-2021.csv
parsing file 01-10-2021.csv
parsing file 01-11-2021.csv
parsing file 01-12-2021.csv
parsing file 01-13-2021.csv
parsing file 01-14-2021.csv
parsing file 01-15-2021.csv
parsing file 01-16-2021.csv
parsing file 01-17-2021.csv
parsing file 01-18-2021.csv
parsing file 01-19-2021.csv
parsing file 01-20-2021.csv
parsing file 01-21-2021.csv
parsing file 01-22-2020.csv
parsing file 01-22-2021.csv
parsing file 01-23-2020.csv
parsing file 01-23-2021.csv
parsing file 01-24-2020.csv
parsing file 01-24-2021.csv
parsing file 01-25-2020.csv
parsing file 01-25-2021.csv
parsing file 01-26-2020.csv
parsing file 01-26-2021.csv
parsing file 01-27-2020.csv
parsing file 01-27-2021.csv
parsing file 01-28-2020.csv
parsing file 01-28-2021.csv
parsing file 01-29-2

In [3]:
# Preview the per-country, per-month aggregates (one row per country/year/month).
covid_month_df

Unnamed: 0,Country_Region,year,month,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
0,Afghanistan,2020,11,46215,1763,36831.0,6811.727273,113.240820,3.753157
1,Afghanistan,2020,12,52330,2189,41727.0,8700.000000,127.575268,4.026543
2,Afghanistan,2021,1,55023,2400,47679.0,6312.387097,138.489284,4.300818
3,Afghanistan,2021,2,55714,2443,49333.0,4533.964286,142.487265,4.369554
4,Afghanistan,2021,3,56454,2484,51550.0,3834.225806,143.942563,4.388250
...,...,...,...,...,...,...,...,...,...
2737,Zimbabwe,2021,8,124773,4419,82994.0,27777.000000,804.506147,3.415043
2738,Zimbabwe,2021,9,130820,4623,0.0,,857.510772,3.568200
2739,Zimbabwe,2021,10,132977,4678,0.0,,889.102647,3.520466
2740,Zimbabwe,2021,11,134625,4707,0.0,,898.262502,3.516952


In [4]:
# Preview the per-country, per-year aggregates (one row per country/year).
covid_year_df

Unnamed: 0,Country_Region,year,Confirmed,Deaths,Recovered,Active,Incident_Rate,Case_Fatality_Ratio
0,Afghanistan,2020,52330,2189,41727.0,7916.188679,121.625120,3.913062
1,Afghanistan,2021,157858,7331,82586.0,15623.259259,272.287093,4.434308
2,Albania,2020,58316,1181,33634.0,19419.207547,1459.439011,2.118022
3,Albania,2021,204627,3152,130314.0,17691.259259,4758.528632,1.740721
4,Algeria,2020,99610,2756,67127.0,26434.339623,192.710762,2.933269
...,...,...,...,...,...,...,...,...
386,Yemen,2021,10081,1973,4251.0,1258.916667,21.540478,21.349153
387,Zambia,2020,20725,388,18660.0,576.207547,98.580523,2.012397
388,Zambia,2021,212278,3671,189658.0,6144.569444,762.727480,1.550910
389,Zimbabwe,2020,13867,363,11250.0,1297.773585,72.077654,2.769321


In [5]:
# Make sure the output folder exists, then write both aggregates as CSV files.
_dir = os.path.join(os.pardir, 'data', 'covid')
os.makedirs(_dir, exist_ok=True)

for frame, file_name in (
    (covid_month_df, 'covid_month_agg.csv'),
    (covid_year_df, 'covid_year_agg.csv'),
):
    frame.to_csv(os.path.join(_dir, file_name), index=False)


# Covid Data -  Confirmed 

We sourced the data from the Johns Hopkins COVID-19 GitHub repository, which already aggregates and cleans data from multiple sources.

In [23]:
# Get files to process
covid_dir = os.path.join(os.pardir, 'data', 'raw', 'covid19', 'daily_reports')
# os.listdir returns entries in arbitrary order and lists every entry in the
# directory. Keep only the CSV reports (any other name would fail the date
# parsing done on the file name) and sort them so each run processes the files
# in the same order.
daily_reports = sorted(name for name in os.listdir(covid_dir) if name.endswith('.csv'))

def parse_daily_report(daily_report):
    """Load one daily report and return the confirmed count per country.

    Parameters
    ----------
    daily_report : str
        File name inside ``covid_dir``. The part before the first ``.`` is
        parsed as the report date.

    Returns
    -------
    pandas.DataFrame
        One row per ``Country_Region`` with the summed ``Confirmed`` value and
        ``year`` / ``month`` columns taken from the parsed file name.
    """
    report_date = parse(daily_report.split('.')[0])
    print(f'parsing file {daily_report}')

    report = pd.read_csv(os.path.join(covid_dir, daily_report))

    # Some reports name the country column 'Country/Region'. rename() leaves the
    # frame's columns unchanged when that label is absent.
    report = report.rename(columns={'Country/Region':'Country_Region'})

    confirmed = (
        report[['Country_Region', 'Confirmed']]
        .groupby(['Country_Region'])
        .aggregate({'Confirmed':'sum'})
        .reset_index()
    )
    confirmed['year'] = report_date.year
    confirmed['month'] = report_date.month
    return confirmed

# Parse every daily report. The None check must be `is not None`: comparing
# `type(...)` with None is always true and filters nothing.
# NOTE: this rebinds covid_df and covid_month_df from the earlier section.
parsed_reports = []
for daily_report in daily_reports:
    tmp_pdf = parse_daily_report(daily_report)

    if tmp_pdf is not None:
        parsed_reports.append(tmp_pdf)

if not parsed_reports:
    raise ValueError('no daily reports were parsed')

# Concatenate once instead of growing the frame on every loop iteration.
covid_df = pd.concat(parsed_reports)

# Keep the largest Confirmed value seen for each country within each month.
covid_month_df = covid_df.groupby(['Country_Region', 'year', 'month']).aggregate({'Confirmed':'max'}).reset_index()

# Create the output folder if needed so this cell does not depend on an
# earlier cell having created it.
confirmed_dir = os.path.join(os.pardir, 'data', 'covid')
os.makedirs(confirmed_dir, exist_ok=True)
covid_month_df.to_csv(os.path.join(confirmed_dir, 'covid_confirmed.csv'), index=False)


parsing file 01-01-2021.csv
parsing file 01-02-2021.csv
parsing file 01-03-2021.csv
parsing file 01-04-2021.csv
parsing file 01-05-2021.csv
parsing file 01-06-2021.csv
parsing file 01-07-2021.csv
parsing file 01-08-2021.csv
parsing file 01-09-2021.csv
parsing file 01-10-2021.csv
parsing file 01-11-2021.csv
parsing file 01-12-2021.csv
parsing file 01-13-2021.csv
parsing file 01-14-2021.csv
parsing file 01-15-2021.csv
parsing file 01-16-2021.csv
parsing file 01-17-2021.csv
parsing file 01-18-2021.csv
parsing file 01-19-2021.csv
parsing file 01-20-2021.csv
parsing file 01-21-2021.csv
parsing file 01-22-2020.csv
parsing file 01-22-2021.csv
parsing file 01-23-2020.csv
parsing file 01-23-2021.csv
parsing file 01-24-2020.csv
parsing file 01-24-2021.csv
parsing file 01-25-2020.csv
parsing file 01-25-2021.csv
parsing file 01-26-2020.csv
parsing file 01-26-2021.csv
parsing file 01-27-2020.csv
parsing file 01-27-2021.csv
parsing file 01-28-2020.csv
parsing file 01-28-2021.csv
parsing file 01-29-2