In [3]:
import pandas as pd
from pathlib import Path

load raw extracted data

In [4]:
# Notebooks runs off of machine root and scripts run off of repo root - function to help locate files for each context
def find_repo_root(start: Path) -> Path:
    for parent in [start] + list(start.parents):
        if (parent / "data").exists():
            return parent
    raise FileNotFoundError("Could not locate repo root containing data/")

REPO_ROOT = find_repo_root(Path.cwd().resolve())
RAW_PATH = REPO_ROOT / "data" / "raw" / "raw_c19.csv"
OUT_PATH = REPO_ROOT / "data" / "processed" / "processed_c19.csv"

note: src path will be: ../data/raw/raw_19.csv or similar. Full path only required in notebook due to location of the kernel

In [5]:
df = pd.read_csv(RAW_PATH)
df.head()

Unnamed: 0,Date,Confirmed,Recovered,Deaths,Increase rate
0,2020-01-22,557,30,17,0.0
1,2020-01-23,657,32,18,17.953321
2,2020-01-24,944,39,26,43.683409
3,2020-01-25,1437,42,42,52.224576
4,2020-01-26,2120,56,56,47.529576


basic time series featurees

In [8]:
# Convert cumulative totals into daily values (better for forecasting-style modelling)
df["NewConfirmed"] = df["Confirmed"].diff().fillna(0).clip(lower=0)
df["NewDeaths"] = df["Deaths"].diff().fillna(0).clip(lower=0)

# Simple weekly seasonality features
df["Date"] = pd.to_datetime(df["Date"])           # minimal so dayofweek works
df["DayOfWeek"] = df["Date"].dt.dayofweek
df["Lag7_NewConfirmed"] = df["NewConfirmed"].shift(7).fillna(0)

df[["Date", "NewConfirmed", "DayOfWeek", "Lag7_NewConfirmed"]].head(12)

df["Lag1_NewConfirmed"] = df["NewConfirmed"].shift(1).fillna(0)
df["TargetNext_NewConfirmed"] = df["NewConfirmed"].shift(-1)

df[["Date", "NewConfirmed", "Lag1_NewConfirmed", "Lag7_NewConfirmed", "TargetNext_NewConfirmed"]].head(12)


Unnamed: 0,Date,NewConfirmed,Lag1_NewConfirmed,Lag7_NewConfirmed,TargetNext_NewConfirmed
0,2020-01-22,0.0,0.0,0.0,100.0
1,2020-01-23,100.0,0.0,0.0,287.0
2,2020-01-24,287.0,100.0,0.0,493.0
3,2020-01-25,493.0,287.0,0.0,683.0
4,2020-01-26,683.0,493.0,0.0,809.0
5,2020-01-27,809.0,683.0,0.0,2651.0
6,2020-01-28,2651.0,809.0,0.0,589.0
7,2020-01-29,589.0,2651.0,0.0,2068.0
8,2020-01-30,2068.0,589.0,100.0,1690.0
9,2020-01-31,1690.0,2068.0,287.0,2111.0


# Actions:
- load raw data
- add daily + weekly seasonality features