In [277]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import datetime
import math
import numpy as np

from interpret import show
from interpret.data import Marginal
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree
from interpret.perf import RegressionPerf

# Fixed random seed; passed as `random_state` to the ExplainableBoostingRegressor below.
seed = 1

In [374]:
df = pd.read_csv("train.csv")

# Build the per-row "Region" key as "<Country_Region>_<Province_State>".
# A missing province is rendered as "nan" (e.g. "Afghanistan_nan").
region_list = [
    "{}_{}".format(country, province)
    for country, province in zip(df["Country_Region"], df["Province_State"])
]
df["Region"] = region_list

# Get the reference date for each region: the day *before* its first confirmed
# case, so that the first day with cases ends up with Days == 1.
unique_region_list = sorted(set(region_list))
first_date_dict = {}
for region in unique_region_list:
    region_rows = df[df["Region"] == region]
    region_dates = pd.to_datetime(region_rows["Date"])
    has_cases = region_rows["ConfirmedCases"] > 0
    if has_cases.any():
        # Subtract one calendar day instead of stepping back one row: when the
        # very first row of a region already has cases, "row index - 1" is -1
        # and .iloc[-1] would silently select the region's LAST date.
        reference_date = region_dates[has_cases].min() - pd.Timedelta(days=1)
    else:
        # No confirmed case in this region: anchor on its last date so every
        # row gets Days <= 0 (and is removed by the later `Days > 0` filter)
        # instead of raising IndexError.
        reference_date = region_dates.max()
    # Stored as an ISO date string, matching the "Date" column format.
    first_date_dict[region] = reference_date.strftime("%Y-%m-%d")

# Add column "Days": number of days elapsed since each region's reference date (see first_date_dict)
def get_days(dt):
    """Return the whole-day count stored in ``dt.days``.

    ``dt`` is any object exposing a ``days`` attribute; in this notebook it is
    applied to the per-row timedeltas when building the "Days" column.
    """
    whole_days = dt.days
    return whole_days
# Look up every row's regional reference date (in row order), then count the
# whole days between the row's own date and that reference date.
dummy = [first_date_dict[key] for key in df["Region"]]
elapsed = pd.to_datetime(df['Date']) - pd.to_datetime(dummy)
df["Days"] = elapsed.apply(get_days)

In [376]:
# Keep only the rows dated after their region's reference date (Days > 0),
# as an independent copy of the filtered frame.
has_positive_days = df["Days"] > 0
df = df.loc[has_positive_days].copy()

In [390]:
# Model inputs: days elapsed since the regional reference date, plus the region key.
train_cols = ['Days', 'Region']
# Target column (candidates in the data: ConfirmedCases, Fatalities).
output_col = ['ConfirmedCases']
X = df.loc[:, train_cols]
# The model is fit on log1p(target), i.e. log(1 + ConfirmedCases).
Y = df[output_col].pipe(np.log1p)

In [391]:
def split_train_val(X, Y, unique_region_list, num_of_val_days):
    """Split X and Y per region, holding out each region's trailing rows.

    For every region in ``unique_region_list`` the rows of ``X`` whose
    "Region" equals that region are taken in their existing row order; the
    last ``num_of_val_days`` of them go to validation and the rest to
    training. ``Y`` is split with the same row positions, so it must be
    row-aligned with ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a "Region" column.
    Y : pandas.DataFrame or pandas.Series
        Targets, positionally aligned with ``X``.
    unique_region_list : iterable
        Region keys to process; regions with no rows in ``X`` contribute nothing.
    num_of_val_days : int
        Number of trailing rows per region to hold out. ``0`` keeps everything
        in training; a value larger than a region's row count puts all of that
        region's rows in validation.

    Returns
    -------
    tuple
        ``(X_train, X_val, Y_train, Y_val)``.

    Raises
    ------
    ValueError
        If ``num_of_val_days`` is negative.
    """
    if num_of_val_days < 0:
        raise ValueError("num_of_val_days must be non-negative")

    train_ix = []
    val_ix = []
    for region in unique_region_list:
        mask = X["Region"] == region
        ix = np.flatnonzero(mask.to_numpy())

        # Use an explicit cut position rather than negative slicing:
        # ix[:-0] is empty and ix[-0:] is everything, which would send every
        # row to validation when num_of_val_days == 0.
        cut = max(len(ix) - num_of_val_days, 0)
        train_ix.extend(ix[:cut].tolist())
        val_ix.extend(ix[cut:].tolist())

    return X.iloc[train_ix], X.iloc[val_ix], Y.iloc[train_ix], Y.iloc[val_ix]
    

In [392]:
# Hold out the last 3 rows of every region for validation.
X_train, X_val, Y_train, Y_val = split_train_val(
    X, Y, unique_region_list, num_of_val_days=3
)

In [393]:
# Inspect the frame after the Days > 0 filter, including the derived Region and Days columns.
df

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,Region,Days
33,34,,Afghanistan,2020-02-24,1.0,0.0,Afghanistan_nan,1
34,35,,Afghanistan,2020-02-25,1.0,0.0,Afghanistan_nan,2
35,36,,Afghanistan,2020-02-26,1.0,0.0,Afghanistan_nan,3
36,37,,Afghanistan,2020-02-27,1.0,0.0,Afghanistan_nan,4
37,38,,Afghanistan,2020-02-28,1.0,0.0,Afghanistan_nan,5
38,39,,Afghanistan,2020-02-29,1.0,0.0,Afghanistan_nan,6
39,40,,Afghanistan,2020-03-01,1.0,0.0,Afghanistan_nan,7
40,41,,Afghanistan,2020-03-02,1.0,0.0,Afghanistan_nan,8
41,42,,Afghanistan,2020-03-03,1.0,0.0,Afghanistan_nan,9
42,43,,Afghanistan,2020-03-04,1.0,0.0,Afghanistan_nan,10


In [394]:
# Build the marginal data explanation for the training split and render it.
# Y_train holds the log1p-transformed target.
marginal_explainer = Marginal()
marginal = marginal_explainer.explain_data(X_train, Y_train, name='Train Data')
show(marginal)

## Train the Explainable Boosting Machine (EBM)

In [395]:
# Fit an Explainable Boosting Regressor on the training split, seeded for reproducibility.
# The target Y_train is log1p(ConfirmedCases), so predictions are in log1p space.
ebm = ExplainableBoostingRegressor(random_state=seed)
ebm.fit(X_train, Y_train)   # Works on dataframes and numpy arrays

ExplainableBoostingRegressor(binning_strategy='uniform', data_n_episodes=2000,
                             early_stopping_run_length=50,
                             early_stopping_tolerance=1e-05,
                             feature_names=['Days', 'Region'],
                             feature_step_n_inner_bags=0,
                             feature_types=['continuous', 'categorical'],
                             holdout_size=0.15, holdout_split=0.15,
                             interactions=0, learning_rate=0.01,
                             main_attr='all', max_tree_splits=2,
                             min_cases_for_splits=2, n_estimators=16, n_jobs=-2,
                             random_state=1, schema=None, scoring=None,
                             training_step_episodes=1)

## Global Explanations: What the model learned overall

In [396]:
# Request the fitted model's global explanation (labelled 'EBM') and render it.
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

## Local Explanations: How an individual prediction was made

In [397]:
# Request per-row (local) explanations for the validation split and render them.
# Y_val is the log1p-transformed target, matching what the model was fit on.
ebm_local = ebm.explain_local(X_val, Y_val, name='EBM')
show(ebm_local)

## Evaluate EBM performance

In [398]:
# Score the model's predictions on the validation split and render the report.
# Y_val is log1p-transformed, so the reported errors are in log1p space.
perf_explainer = RegressionPerf(ebm.predict)
ebm_perf = perf_explainer.explain_perf(X_val, Y_val, name='EBM')
show(ebm_perf)

In [None]:
# TODO
# Days since first case in the country

# Ideas of features:
# latest case

# When predicting, drop the rows with Days < 0 from the features