In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder

In [6]:
# Notebooks runs off of machine root and scripts run off of repo root - function to help locate files for each context
def find_repo_root(start: Path) -> Path:
    for parent in [start] + list(start.parents):
        if (parent / "data").exists():
            return parent
    raise FileNotFoundError("Could not locate repo root containing data/")

REPO_ROOT = find_repo_root(Path.cwd().resolve())
PROCESSED_PATH = REPO_ROOT / "data" / "processed" / "processed_c19.csv"

note: src path will be: ../data/processed/processed_19.csv or similar. Full path only required in notebook due to location of the kernel

In [7]:
df = pd.read_csv(PROCESSED_PATH)
df.head()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     816 non-null    object 
 1   Confirmed                816 non-null    int64  
 2   Recovered                816 non-null    int64  
 3   Deaths                   816 non-null    int64  
 4   Increase rate            816 non-null    float64
 5   NewConfirmed             816 non-null    float64
 6   NewDeaths                816 non-null    float64
 7   DayOfWeek                816 non-null    int64  
 8   Lag7_NewConfirmed        816 non-null    float64
 9   Lag1_NewConfirmed        816 non-null    float64
 10  TargetNext_NewConfirmed  815 non-null    float64
dtypes: float64(6), int64(4), object(1)
memory usage: 70.3+ KB


select modelling cols

In [None]:
df["Date"] = pd.to_datetime(df["Date"])

model_df = df[[
    "Date",
    "TargetNext_NewConfirmed",
    "Lag1_NewConfirmed",
    "Lag7_NewConfirmed",
    "DayOfWeek"
]].dropna().copy()

model_df.head(10)

Unnamed: 0,Date,TargetNext_NewConfirmed,Lag1_NewConfirmed,Lag7_NewConfirmed,DayOfWeek
0,2020-01-22,100.0,0.0,0.0,2
1,2020-01-23,287.0,0.0,0.0,3
2,2020-01-24,493.0,100.0,0.0,4
3,2020-01-25,683.0,287.0,0.0,5
4,2020-01-26,809.0,493.0,0.0,6
5,2020-01-27,2651.0,683.0,0.0,0
6,2020-01-28,589.0,809.0,0.0,1
7,2020-01-29,2068.0,2651.0,0.0,2
8,2020-01-30,1690.0,589.0,100.0,3
9,2020-01-31,2111.0,2068.0,287.0,4


train test split - hold out last 30 days for test

In [9]:
cutoff = model_df["Date"].max() - pd.Timedelta(days=30)

train_df = model_df[model_df["Date"] <= cutoff].copy()
test_df  = model_df[model_df["Date"] >  cutoff].copy()

train_df.shape, test_df.shape, cutoff

((785, 5), (30, 5), Timestamp('2022-03-16 00:00:00'))

features and target

In [17]:
X_train = train_df[["Lag1_NewConfirmed", "Lag7_NewConfirmed", "DayOfWeek"]]
y_train = train_df["TargetNext_NewConfirmed"]

X_test = test_df[["Lag1_NewConfirmed", "Lag7_NewConfirmed", "DayOfWeek"]]
y_test = test_df["TargetNext_NewConfirmed"]

produce baseline metrics as benchmark

In [18]:
# Baseline: tomorrow's new cases â‰ˆ today's new cases
yhat_naive = test_df["Lag1_NewConfirmed"]

mae_naive = mean_absolute_error(y_test, yhat_naive)
rmse_naive = mean_squared_error(y_test, yhat_naive, squared=False)

mae_naive, rmse_naive



(387383.8333333333, 467317.7574383052)

encoding

In [23]:
# Numeric features only
Xnum_train = train_df[["Lag1_NewConfirmed", "Lag7_NewConfirmed"]].values
y_train = train_df["TargetNext_NewConfirmed"].values

Xnum_test = test_df[["Lag1_NewConfirmed", "Lag7_NewConfirmed"]].values
y_test = test_df["TargetNext_NewConfirmed"].values

ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

Xohe_train = ohe.fit_transform(train_df[["DayOfWeek"]])
Xohe_test = ohe.transform(test_df[["DayOfWeek"]])

# Combine numeric + one-hot features
X_train = np.hstack([Xnum_train, Xohe_train])
X_test = np.hstack([Xnum_test, Xohe_test])

fit model

In [24]:
lr = LinearRegression()
lr.fit(X_train, y_train)

yhat_model = lr.predict(X_test)

mae_model = mean_absolute_error(y_test, yhat_model)
rmse_model = mean_squared_error(y_test, yhat_model, squared=False)

mae_model, rmse_model



(294493.9996023419, 345986.21511504805)