In [1]:
import pandas as pd
import numpy as np
import requests 

from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold 
from sklearn.pipeline import make_pipeline 
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
## Import and clean data (copied from Daniela's script)

r = requests.get("https://api-pc6dbtrtla-uc.a.run.app/API/timeseries/usa")
response_dict = r.json()
df = pd.DataFrame.from_dict(response_dict)
df = df.rename(columns={'Total Results as of Date': 'Date'})
Days = (np.array([i for i in range (len(df.Date))]).reshape(-1,1))+1
df['Days'] = Days 

In [3]:
df

Unnamed: 0,ISO3,Country,Date,Cases,Deaths,Recovered,Days
0,USA,US,2020-01-22,1.0,0.0,0.0,1
1,USA,US,2020-01-23,1.0,0.0,0.0,2
2,USA,US,2020-01-24,2.0,0.0,0.0,3
3,USA,US,2020-01-25,2.0,0.0,0.0,4
4,USA,US,2020-01-26,5.0,0.0,0.0,5
...,...,...,...,...,...,...,...
99,USA,US,2020-04-30,1069424.0,62996.0,153947.0,100
100,USA,US,2020-05-01,1103461.0,64943.0,164015.0,101
101,USA,US,2020-05-02,1132539.0,66369.0,175382.0,102
102,USA,US,2020-05-03,1158040.0,67682.0,180152.0,103


In [4]:
## Models predicting confirmed cases 

X = df[["Days"]]
y_confirmed = df["Cases"]

tscv = TimeSeriesSplit(n_splits=5)

# To see scoring options: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [5]:
# Linear Regression

lr = LinearRegression()
cross_validate(lr, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-102250513255.19186

In [6]:
cross_validate(lr, X, y_confirmed, scoring="neg_mean_squared_error")["test_score"].mean()

-143805207205.22748

In [7]:
# Decision Tree

dt = DecisionTreeRegressor()
cross_validate(dt, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-37769339622.94118

In [8]:
cross_validate(dt, X, y_confirmed, scoring="neg_mean_squared_error")["test_score"].mean()

-29850290716.654285

In [9]:
# Random Forest 

rf = RandomForestRegressor()
cross_validate(rf, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-41110940978.045494

In [10]:
cross_validate(rf, X, y_confirmed, scoring="neg_mean_squared_error")["test_score"].mean()

-31296781901.567524

In [11]:
# Scaled Decision Tree

dt_scaled = make_pipeline(StandardScaler(), DecisionTreeRegressor())
cross_validate(dt_scaled, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-37769339622.94118

In [12]:
cross_validate(dt_scaled, X, y_confirmed, scoring="neg_mean_squared_error")["test_score"].mean()

-29888087113.987617

In [13]:
# Scaled Random Forest

rf_scaled = make_pipeline(StandardScaler(), RandomForestRegressor())
cross_validate(rf_scaled, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-41150741753.757614

In [14]:
cross_validate(rf_scaled, X, y_confirmed, scoring="neg_mean_squared_error")["test_score"].mean()

-31359309703.50506

In [15]:
## Models predicting deaths

X = df[["Days"]]
y_deaths = df["Deaths"]

tscv = TimeSeriesSplit(n_splits=5)

In [16]:
# Linear Regression

lr = LinearRegression()
cross_validate(lr, X, y_deaths, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-386966511.5848629

In [17]:
# Decision Tree

dt = DecisionTreeRegressor()
cross_validate(dt, X, y_deaths, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-143975976.67058823

In [18]:
# Random Forest 

rf = RandomForestRegressor()
cross_validate(rf, X, y_deaths, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-157065920.10374352

In [19]:
# Scaled Decision Tree

dt_scaled = make_pipeline(StandardScaler(), DecisionTreeRegressor())
cross_validate(dt_scaled, X, y_confirmed, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-37769339622.94118

In [20]:
# Scaled Random Forest

rf_scaled = make_pipeline(StandardScaler(), RandomForestRegressor())
cross_validate(rf_scaled, X, y_deaths, cv=tscv, scoring="neg_mean_squared_error")["test_score"].mean()

-159389629.53290707