In [1]:
import numpy as np
import pandas as pd

# Read the week-3 training CSV; the trailing expression displays (rows, columns).
df = pd.read_csv(
    "../input/covid19-global-forecasting-week-3/train.csv"
)
df.shape

(21726, 6)

In [2]:
# Columns used together as the group-by key for one location's time series.
loc_group = ["Province_State", "Country_Region"]


def preprocess(df):
    """Normalise the date and location columns of a train/test frame.

    The frame is modified in place and also returned, so callers may either
    rebind the result (``df = preprocess(df)``) or ignore it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "Date" column and every column named in ``loc_group``.

    Returns
    -------
    pandas.DataFrame
        The same object, with "Date" cast to ``datetime64[ms]`` and missing
        values in the ``loc_group`` columns replaced by the string "none".
    """
    df["Date"] = df["Date"].astype("datetime64[ms]")
    for col in loc_group:
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the in-place form acts on a
        # temporary selection, is deprecated in pandas 2.x and does nothing
        # under Copy-on-Write, which would leave NaN group keys behind.
        df[col] = df[col].fillna("none")
    return df

# Clean the training frame, then load the test file and clean it the same way.
df = preprocess(df)
sub_df = pd.read_csv("../input/covid19-global-forecasting-week-3/test.csv")
sub_df = preprocess(sub_df)
df.head()

Unnamed: 0,Id,Country_Region,Province_State,Date,ConfirmedCases,Fatalities
0,1,Afghanistan,none,2020-01-22,0.0,0.0
1,2,Afghanistan,none,2020-01-23,0.0,0.0
2,3,Afghanistan,none,2020-01-24,0.0,0.0
3,4,Afghanistan,none,2020-01-25,0.0,0.0
4,5,Afghanistan,none,2020-01-26,0.0,0.0


In [3]:
# (earliest, latest) date present in the training frame.
tuple(df["Date"].agg(["min", "max"]))

(Timestamp('2020-01-22 00:00:00'), Timestamp('2020-04-01 00:00:00'))

In [4]:
# The two quantities to forecast.
TARGETS = ["ConfirmedCases", "Fatalities"]

# Work on the log1p scale from here on; np.expm1 undoes this later.
# Note: re-running this cell applies the transform a second time.
df[TARGETS] = np.log1p(df[TARGETS])

In [5]:
# Lag feature: each target's value on the preceding row of the same location
# group (NaN on the first row of every group).
df = df.assign(**{
    "prev_{}".format(col): df.groupby(loc_group)[col].shift()
    for col in TARGETS
})

In [6]:
# Keep only rows strictly after the earliest date in the frame.
df = df.loc[df["Date"].gt(df["Date"].min())].copy()
df.head()

Unnamed: 0,Id,Country_Region,Province_State,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities
1,2,Afghanistan,none,2020-01-23,0.0,0.0,0.0,0.0
2,3,Afghanistan,none,2020-01-24,0.0,0.0,0.0,0.0
3,4,Afghanistan,none,2020-01-25,0.0,0.0,0.0,0.0
4,5,Afghanistan,none,2020-01-26,0.0,0.0,0.0,0.0
5,6,Afghanistan,none,2020-01-27,0.0,0.0,0.0,0.0


In [7]:
from datetime import timedelta

# First hold-out date: the earliest date in the test/submission file.
TEST_FIRST = sub_df["Date"].min()
# Number of labelled days from TEST_FIRST to the last training date, inclusive.
TEST_DAYS = (df["Date"].max() - TEST_FIRST).days + 1

# Last date of the development period. The original expression had lost its
# left operand and evaluated to a bare negative timedelta; the split below is
# `Date < TEST_FIRST`, so the last development date is the day before it.
TRAIN_LAST = TEST_FIRST - timedelta(days=1)

# dev_df: rows before TEST_FIRST (used for fitting);
# test_df: rows from TEST_FIRST onward (used for validation).
dev_df, test_df = df[df["Date"] < TEST_FIRST].copy(), df[df["Date"] >= TEST_FIRST].copy()
dev_df.shape, test_df.shape

((19278, 8), (2142, 8))

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

# Degree-2 polynomial expansion of the inputs followed by ordinary least
# squares; a single model is fitted for both targets at once.
model = Pipeline([('poly', PolynomialFeatures(degree=2, include_bias=False)),
                  ('linear', LinearRegression())])

# Inputs are the lagged targets, in the same order as TARGETS.
features = ["prev_{}".format(col) for col in TARGETS]

model.fit(dev_df[features], dev_df[TARGETS])

# Training-set MSE per target (log1p scale). Predict once and slice per
# column instead of re-running the pipeline for every target.
dev_pred = model.predict(dev_df[features])
[mean_squared_error(dev_df[col], dev_pred[:, i]) for i, col in enumerate(TARGETS)]

[0.04276376898471454, 0.01009907447273643]

In [9]:
def rmse(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def evaluate(df):
    """Mean over TARGETS of the RMSE between actual and predicted columns.

    For each target ``t`` the column ``t`` is compared with ``pred_t``; the
    per-target RMSE values are averaged and rounded to 5 decimals.
    """
    per_target = [
        rmse(df[col].values, df["pred_{}".format(col)].values)
        for col in TARGETS
    ]
    return np.round(sum(per_target) / len(TARGETS), 5)


def predict(test_df, first_day, num_days, val=False):
    """Roll the fitted model forward day by day and store the forecasts.

    The first day is predicted from the ``features`` columns already present
    in ``test_df``; each later day is predicted from the previous day's
    (clipped) predictions. Forecasts are written to ``pred_<target>`` columns
    of ``test_df``, which is modified in place and returned.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Needs a "Date" column and the ``features`` columns; only the
        ``first_day`` rows of the feature columns are read.
    first_day : timestamp-like
        First date to predict.
    num_days : int
        Number of consecutive days to predict, starting at ``first_day``.
        The first day is always predicted, even when this is below 1.
    val : bool, default False
        When True, print the date and ``evaluate`` score after each day;
        this requires the actual target columns to be present.

    Returns
    -------
    pandas.DataFrame
        ``test_df`` with the ``pred_<target>`` columns filled in.

    Notes
    -----
    Predictions are carried from one day to the next purely by position.
    NOTE(review): this assumes every date holds the same locations in the
    same row order - confirm against the caller's data.
    The recursion also relies on ``features`` being the lagged ``TARGETS`` in
    the same order, so a day's output can serve as the next day's input.
    """
    pred_cols = ["pred_{}".format(col) for col in TARGETS]
    for name in pred_cols:
        # Float placeholder: an integer 0 column would have to be silently
        # upcast when the float predictions are written into it, which
        # recent pandas versions deprecate.
        test_df[name] = 0.0

    y_pred = None
    for d in range(max(num_days, 1)):
        date = first_day if d == 0 else first_day + timedelta(days=d)
        day_mask = test_df["Date"] == date

        if d == 0:
            X = test_df.loc[day_mask, features]
        else:
            # Feed yesterday's predictions back in, keeping the column names
            # the model was fitted with.
            X = pd.DataFrame(y_pred, columns=features)

        # Upper clip at 16 on the model's output scale keeps the recursion
        # from running away.
        y_pred = np.clip(model.predict(X), None, 16)

        for i, name in enumerate(pred_cols):
            test_df.loc[day_mask, name] = y_pred[:, i]

        if val:
            print(date, evaluate(test_df[day_mask]))

    return test_df

# Validate on the held-out days: prints a per-day score, then the overall one.
test_df = predict(test_df, first_day=TEST_FIRST, num_days=TEST_DAYS, val=True)
evaluate(test_df)

2020-03-26 00:00:00 0.25798
2020-03-27 00:00:00 0.32953
2020-03-28 00:00:00 0.39395
2020-03-29 00:00:00 0.47952
2020-03-30 00:00:00 0.53787
2020-03-31 00:00:00 0.58438
2020-04-01 00:00:00 0.64138


0.47916

In [10]:
# Undo the log1p transform on both the actual and the predicted columns.
for col in TARGETS:
    for name in (col, "pred_{}".format(col)):
        test_df[name] = np.expm1(test_df[name])

In [11]:
# Date span covered by the submission file.
SUB_FIRST = sub_df["Date"].min()
SUB_DAYS = (sub_df["Date"].max() - sub_df["Date"].min()).days + 1

# Stack the development rows on top of the submission rows so the lag below
# can reach back to each location's last known values.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
sub_df = pd.concat([dev_df, sub_df], sort=False)

for col in TARGETS:
    sub_df["prev_{}".format(col)] = sub_df.groupby(loc_group)[col].shift()

# Drop the development rows again; only submission dates remain.
sub_df = sub_df[sub_df["Date"] >= SUB_FIRST].copy()
# ForecastId became float in the concat (development rows have none).
# Cast back to an integer type wide enough that ids above 32767 cannot
# overflow, as they would with int16.
sub_df["ForecastId"] = sub_df["ForecastId"].astype(np.int32)
sub_df = predict(sub_df, SUB_FIRST, SUB_DAYS)

# Convert the log1p-scale predictions back to counts in the target columns.
for col in TARGETS:
    sub_df[col] = np.expm1(sub_df["pred_{}".format(col)])

sub_df.head()

Unnamed: 0,Id,Country_Region,Province_State,Date,ConfirmedCases,Fatalities,prev_ConfirmedCases,prev_Fatalities,ForecastId,pred_ConfirmedCases,pred_Fatalities
0,,Afghanistan,none,2020-03-26,99.47768,2.215183,4.442651,1.098612,1,4.609936,1.167884
1,,Afghanistan,none,2020-03-27,117.370876,2.458014,,,2,4.773823,1.240694
2,,Afghanistan,none,2020-03-28,137.927815,2.732309,,,3,4.933954,1.317027
3,,Afghanistan,none,2020-03-29,161.398592,3.042438,,,4,5.090054,1.396848
4,,Afghanistan,none,2020-03-30,188.033244,3.393414,,,5,5.241923,1.480107


In [12]:
# Write only the id and the two forecast columns, without the index.
sub_df[["ForecastId"] + TARGETS].to_csv("submission.csv", index=False)