In [None]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import datetime
import math
import numpy as np
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

from interpret import show
from interpret.data import Marginal
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree
from interpret.perf import RegressionPerf

# Random seed passed as random_state to every ExplainableBoostingRegressor fitted in this notebook.
seed = 1

## Preprocess data

In [None]:
def preprocess(filename):
    """Load the case-count CSV and build the feature and target frames.

    The CSV must contain the columns ``Country_Region``, ``Province_State``,
    ``Date`` and ``ConfirmedCases``. Rows are expected to be in chronological
    order within each region.

    Derived columns:
      * ``Region``: ``"<Country_Region>_<Province_State>"`` (a missing
        province is rendered as ``nan``).
      * ``Days``: whole days elapsed since the region's reference date, so
        that the first row with a confirmed case has ``Days > 0``:
          - normally the reference is the date of the row just before the
            first row with ``ConfirmedCases > 0``;
          - if the very first row already has cases, the reference is one
            day before that row (indexing position -1 would otherwise
            silently select the region's *last* date);
          - if the region never reports a case, the reference is its last
            date, so every row has ``Days <= 0``.
      * ``prev_ConfirmedCases``: the previous row's ``ConfirmedCases`` within
        the same region (0 for the region's first row).

    ``ConfirmedCases`` and ``prev_ConfirmedCases`` are returned on a
    ``log1p`` scale.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    X : pandas.DataFrame
        Columns ``Days``, ``Region`` and ``prev_ConfirmedCases``.
    Y : pandas.DataFrame
        Single column ``ConfirmedCases`` (log1p-transformed).
    """
    df = pd.read_csv(filename)

    # Create category called Region: country_province.
    # Pair the two columns positionally so the result does not depend on the
    # index labels of the frame.
    df["Region"] = [
        "{}_{}".format(country, province)
        for country, province in zip(df["Country_Region"], df["Province_State"])
    ]

    # Reference date per region (see the docstring for the three cases).
    dates = pd.to_datetime(df["Date"])
    one_day = pd.Timedelta(days=1)
    reference_date = {}
    for region, rows in df.groupby("Region"):
        region_dates = dates.loc[rows.index]
        case_positions = np.flatnonzero(rows["ConfirmedCases"].to_numpy() > 0)
        if case_positions.size == 0:
            reference_date[region] = region_dates.iloc[-1]
        elif case_positions[0] == 0:
            reference_date[region] = region_dates.iloc[0] - one_day
        else:
            reference_date[region] = region_dates.iloc[case_positions[0] - 1]

    # Add column "Days": number of days since the region's reference date.
    reference_per_row = pd.to_datetime(df["Region"].map(reference_date))
    df["Days"] = (dates - reference_per_row).dt.days

    # Add previous confirmed cases as a feature. Assign the filled result
    # back instead of using fillna(inplace=True) on a column selection, which
    # is not guaranteed to modify the frame.
    df["prev_ConfirmedCases"] = (
        df.groupby("Region")["ConfirmedCases"].shift().fillna(0)
    )

    # TODO: decide whether to keep only rows with Days > 0.

    # Work on a log scale for both the target and the lagged feature.
    df["ConfirmedCases"] = np.log1p(df["ConfirmedCases"])
    df["prev_ConfirmedCases"] = np.log1p(df["prev_ConfirmedCases"])

    features = ["Days", "Region", "prev_ConfirmedCases"]
    output_col = ["ConfirmedCases"]
    X = df[features]
    Y = df[output_col]

    return X, Y

In [None]:
# Build the feature frame X and the log1p-transformed target frame Y from the training CSV.
X,Y=preprocess("train.csv")

In [None]:
def split_train_val(X,Y, unique_region_list,num_of_val_days):
    """Hold out the last rows of every region as a validation set.

    For each region in ``unique_region_list`` the rows of ``X`` whose
    ``Region`` equals it are taken in their existing order; the final
    ``num_of_val_days`` of them go to validation and the rest to training.
    ``Y`` is split with the same row positions, so it must be row-aligned
    with ``X``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing a ``Region`` column.
    Y : pandas.DataFrame
        Target frame with one row per row of ``X``.
    unique_region_list : iterable
        Regions to include, in the order their rows should appear in the output.
    num_of_val_days : int
        Number of trailing rows per region to hold out. ``0`` keeps every row
        for training; a value larger than a region's row count sends all of
        that region's rows to validation.

    Returns
    -------
    tuple
        ``(X_train, X_val, Y_train, Y_val)``.

    Raises
    ------
    ValueError
        If ``num_of_val_days`` is negative.
    """
    if num_of_val_days < 0:
        raise ValueError("num_of_val_days must be non-negative")

    region_values = X["Region"].to_numpy()

    train_ix = []
    val_ix = []
    for region in unique_region_list:
        ix = np.flatnonzero(region_values == region)
        # Use an explicit split position: slicing with ix[:-0] / ix[-0:]
        # would put every row into validation when num_of_val_days == 0.
        split_at = max(len(ix) - num_of_val_days, 0)
        train_ix.extend(ix[:split_at])
        val_ix.extend(ix[split_at:])

    return X.iloc[train_ix], X.iloc[val_ix], Y.iloc[train_ix], Y.iloc[val_ix]

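# IMPORTANT NOTE: the true prev_ConfirmedCases is only available for the first day being predicted; for every later day it has to be replaced by the model's own prediction for the previous day.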

In [None]:
# Build interpret's Marginal data explanation for the full training set and display it.
marginal = Marginal().explain_data(X, Y, name = 'Full train data')
show(marginal)

## Train and Predict with Explainable Boosting Machine (EBM)

In [None]:
# IMPORTANT NOTE: predict() assumes that, within each region, the rows of X_features (and Y) are sorted by "Days" in ascending order

def evaluate_rmse(Y_predicted,Y_true):
    """Return the root-mean-squared error between two sets of values.

    Both arguments are forwarded, in this order, to
    ``sklearn.metrics.mean_squared_error``; the square root of that result
    is returned.
    """
    mse = mean_squared_error(Y_predicted, Y_true)
    return np.sqrt(mse)

def predict(X_features,Y,num_validation_days,num_days_to_predict):
    """Validate an EBM on held-out days, then forecast future days per region.

    Two models are fitted:

    1. A validation model trained on everything except the last
       ``num_validation_days`` rows of each region. The held-out rows are
       predicted recursively: the first held-out row of a region uses its
       recorded ``prev_ConfirmedCases``; every following row uses the
       model's previous prediction instead.
    2. A final model trained on all rows, used to forecast
       ``num_days_to_predict`` further days for each region, again feeding
       each prediction back in as ``prev_ConfirmedCases``.

    Rows are assumed to be sorted by ``Days`` within each region.

    Parameters
    ----------
    X_features : pandas.DataFrame
        Feature frame with columns ``Days``, ``Region`` and
        ``prev_ConfirmedCases``.
    Y : pandas.DataFrame
        Target frame with a ``ConfirmedCases`` column, row-aligned with
        ``X_features``.
    num_validation_days : int
        Number of trailing rows per region held out for validation.
    num_days_to_predict : int
        Number of days to forecast for every region.

    Returns
    -------
    tuple
        ``(unique_region_list, Y_val, Y_val_predicted, val_rmse,
        Y_test_predicted)``. ``Y_test_predicted`` has shape
        ``(n_regions * num_days_to_predict, 1)``, ordered by region (sorted)
        and then by day. Predictions are on the same scale as ``Y``.
        ``val_rmse`` is ``nan`` when the validation set is empty.
    """
    unique_region_list = sorted(set(X_features["Region"]))
    print("No of unique region list: {}".format(len(unique_region_list)))

    # Split to train and validation. Use the function's own arguments here,
    # not the notebook-level X, so the function works on whatever is passed in.
    X_train, X_val, Y_train, Y_val = split_train_val(
        X_features, Y, unique_region_list, num_validation_days
    )

    # Train
    model = ExplainableBoostingRegressor(random_state=seed)
    model.fit(X_train, Y_train)

    # Predict for val
    Y_val_predicted = np.zeros((X_val.shape[0], 1))
    val_regions = X_val["Region"].to_numpy()
    pred = None
    for i in range(X_val.shape[0]):
        row = X_val.iloc[[i]].copy(deep=True)
        # Within a region, replace the recorded lag by the previous prediction;
        # the first held-out row of each region keeps its recorded value.
        if i > 0 and val_regions[i - 1] == val_regions[i]:
            row["prev_ConfirmedCases"] = pred
        pred = model.predict(row)[0]
        Y_val_predicted[i] = pred

    # Report validation accuracy
    if X_val.shape[0] > 0:
        val_rmse = evaluate_rmse(Y_val, Y_val_predicted)
    else:
        val_rmse = float("nan")

    ###############################################################################
    # Train with full data
    model_full = ExplainableBoostingRegressor(random_state=seed)
    model_full.fit(X_features, Y)

    # Predict for test
    Y_test_predicted = np.zeros((len(unique_region_list) * num_days_to_predict, 1))
    count = 0
    for region in unique_region_list:
        mask = X_features["Region"] == region

        # Start from the region's last known row: its target becomes the lag
        # feature for the first forecast day.
        pred = Y.loc[mask, "ConfirmedCases"].iloc[-1]
        X_dummy = X_features[mask].iloc[[-1]].copy(deep=True)

        for _ in range(num_days_to_predict):
            X_dummy["prev_ConfirmedCases"] = pred
            X_dummy["Days"] = X_dummy["Days"] + 1
            pred = model_full.predict(X_dummy)[0]
            Y_test_predicted[count] = pred
            count = count + 1

    assert count==len(Y_test_predicted), "Something wrong"

    return unique_region_list,Y_val,Y_val_predicted,val_rmse,Y_test_predicted



In [None]:
# Hold out the last 10 rows of each region for validation and forecast 43 days ahead per region.
unique_region_list,Y_val,Y_val_predicted,val_rmse,Y_test_predicted=predict(X,Y,10,43)

In [None]:
# Validation error: RMSE between held-out and predicted targets, measured on the log1p scale produced by preprocess()
print(val_rmse)

In [None]:
# This is the final value: undo the log1p transform applied to the target.
# expm1 is the exact inverse of log1p and does not lose precision near zero
# the way exp(x) - 1 does.
Y_test_predicted_final = np.expm1(Y_test_predicted)

In [None]:
# Show the forecasts, a separator line, then the number of forecast values.
print(Y_test_predicted_final, "*****", len(Y_test_predicted_final), sep="\n")

## DUMP PLAYGROUND

In [None]:
def get_unique_region_list(filename):
    """Return the sorted unique regions in a CSV and the row count of each.

    A region is the string ``"<Country_Region>_<Province_State>"`` built from
    the two columns of the same name (a missing province is rendered as
    ``nan``).

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, forwarded to ``pandas.read_csv``.

    Returns
    -------
    regions : numpy.ndarray
        Unique region names in ascending order.
    counts : numpy.ndarray
        Number of rows for each entry of ``regions``, in the same order.
    """
    df = pd.read_csv(filename)

    # Create category called Region: country_province (paired positionally,
    # so the index labels of the frame do not matter).
    region_list = [
        "{}_{}".format(country, province)
        for country, province in zip(df["Country_Region"], df["Province_State"])
    ]

    # One sort-based pass yields both the sorted unique names and their
    # counts, instead of rescanning the whole list once per region.
    unique_regions, num_list = np.unique(region_list, return_counts=True)
    return unique_regions, num_list