In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [3]:
# Feature frame: everything in the training table except the two target columns.
X_train = train.drop(columns=["Fatalities", "ConfirmedCases"])

In [4]:
# Keep the raw country labels around before the column is one-hot encoded.
countries = X_train.loc[:, "Country/Region"]

In [5]:
# Row identifiers carry no predictive signal; remove them from both feature frames.
X_train = X_train.drop(columns="Id")
X_test = test.drop(columns="ForecastId")

In [6]:
# Convert the 'Date' column of both frames to pandas datetimes.
X_train = X_train.assign(Date=pd.to_datetime(X_train['Date']))
X_test = X_test.assign(Date=pd.to_datetime(X_test['Date']))

In [7]:
# Index both frames by date so create_time_features can read the calendar
# fields from the index.
X_train = X_train.set_index('Date')
X_test = X_test.set_index('Date')

In [8]:
def create_time_features(df):
    """
    Creates time series features from datetime index

    The feature columns, plus a helper 'date' column holding a copy of the
    index, are written onto ``df`` itself, so the caller's frame is modified
    in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the eight derived columns: hour,
        dayofweek, quarter, month, year, dayofyear, dayofmonth, weekofyear.
    """
    df['date'] = df.index
    stamps = df['date'].dt
    df['hour'] = stamps.hour
    df['dayofweek'] = stamps.dayofweek
    df['quarter'] = stamps.quarter
    df['month'] = stamps.month
    df['year'] = stamps.year
    df['dayofyear'] = stamps.dayofyear
    df['dayofmonth'] = stamps.day
    if hasattr(stamps, 'isocalendar'):
        # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
        # isocalendar().week is its replacement. It yields UInt32, so cast
        # back to a plain integer dtype like the legacy attribute produced.
        df['weekofyear'] = stamps.isocalendar().week.astype('int64')
    else:
        # pandas < 1.1 has no isocalendar(); fall back to the old attribute.
        df['weekofyear'] = stamps.weekofyear

    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [9]:
# Called for its side effect: create_time_features writes the calendar feature
# columns (and a helper 'date' column) directly onto the frame it is given.
# The return value of this first call is discarded.
create_time_features(X_train)
# Same in-place enrichment for the test frame. As the cell's last expression,
# the returned feature-only frame is what the notebook displays below.
create_time_features(X_test)

Unnamed: 0_level_0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-12,0,3,1,3,2020,72,12,11
2020-03-13,0,4,1,3,2020,73,13,11
2020-03-14,0,5,1,3,2020,74,14,11
2020-03-15,0,6,1,3,2020,75,15,11
2020-03-16,0,0,1,3,2020,76,16,12
...,...,...,...,...,...,...,...,...
2020-04-19,0,6,2,4,2020,110,19,16
2020-04-20,0,0,2,4,2020,111,20,17
2020-04-21,0,1,2,4,2020,112,21,17
2020-04-22,0,2,2,4,2020,113,22,17


In [10]:
# The helper 'date' column added by create_time_features is no longer needed
# once the calendar features exist; remove it from both frames.
X_train = X_train.drop(columns="date")
X_test = X_test.drop(columns="date")

In [11]:
# One-hot encode 'Province/State' (columns prefixed 'ps') and replace the
# original text column with the indicator columns, for train and test alike.
ps_train = pd.get_dummies(X_train['Province/State'], prefix='ps')
X_train = pd.concat([X_train.drop(columns=['Province/State']), ps_train], axis=1)

ps_test = pd.get_dummies(X_test['Province/State'], prefix='ps')
X_test = pd.concat([X_test.drop(columns=['Province/State']), ps_test], axis=1)

In [12]:
# One-hot encode 'Country/Region' (columns prefixed 'cr') and replace the
# original text column with the indicator columns, for train and test alike.
X_train = pd.concat([X_train,pd.get_dummies(X_train['Country/Region'], prefix='cr')],axis=1)
X_train = X_train.drop(columns=['Country/Region'])
X_test = pd.concat([X_test,pd.get_dummies(X_test['Country/Region'], prefix='cr')],axis=1)
X_test = X_test.drop(columns=['Country/Region'])

# Train and test are encoded independently, so a category present in only one
# of them yields a different set (or order) of indicator columns. Align the
# test frame to the training layout: indicators missing from test become
# all-zero columns, and columns unknown to the training frame are discarded,
# so a model fitted on X_train sees the same feature order at predict time.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [13]:
# Target vector for the fatalities model, taken from the raw training table.
y_train = train.loc[:, "Fatalities"]

In [14]:
# reg = xgb.XGBRegressor(n_estimators=1000,max_depth=10,silent=0,nthread=6,verbosity=2,num_parallel_tree=10,n_jobs=1000)

In [15]:
# reg.fit(X_train, y_train, verbose=True)

In [16]:
# plot = plot_importance(reg, height=0.9)

In [17]:
# y_train = train["ConfirmedCases"]
# confirmed_reg = xgb.XGBRegressor(n_estimators=1000,max_depth=10,silent=0,nthread=6,verbosity=2,num_parallel_tree=5,n_jobs=100)
# confirmed_reg.fit(X_train, y_train, verbose=True)
# preds = confirmed_reg.predict(X_test)
# preds = np.array(preds)
# preds[preds < 0] = 0
# preds = np.round(preds, 0)

In [18]:
# preds = np.array(preds)

In [19]:
# Read the existing submission file; its prediction columns are filled in below.
submissionOrig = pd.read_csv(filepath_or_buffer="submission.csv")

In [20]:
# Fit a random forest on the confirmed-case counts and store its test-set
# predictions in the submission frame.
y_train = train["ConfirmedCases"]
# A fixed random_state makes the fitted forest, and therefore the submission,
# reproducible across kernel restarts.
confirmed_reg = RandomForestRegressor(max_depth=100,n_jobs=-1,n_estimators=100,random_state=42)
confirmed_reg.fit(X_train, y_train)
# Clamp negative predictions to zero, then round to whole numbers.
preds = np.round(np.clip(confirmed_reg.predict(X_test), 0, None), 0)
# Assign the array positionally. Wrapping it in pd.Series would align on index
# labels and silently produce NaN or dropped rows on a length or index
# mismatch, whereas a plain array raises if the lengths differ.
submissionOrig["ConfirmedCases"] = preds

In [21]:
# Fit a second random forest, this time on the fatality counts, and store its
# test-set predictions in the submission frame.
y_train = train["Fatalities"]
# NOTE: the name `confirmed_reg` is reused here for the fatalities model; it is
# kept unchanged so that any code referring to it keeps working.
# A fixed random_state makes the fitted forest, and therefore the submission,
# reproducible across kernel restarts.
confirmed_reg = RandomForestRegressor(max_depth=100,n_jobs=-1,n_estimators=100,random_state=42)
confirmed_reg.fit(X_train, y_train)
# Clamp negative predictions to zero, then round to whole numbers.
preds = np.round(np.clip(confirmed_reg.predict(X_test), 0, None), 0)
# Assign the array positionally. Wrapping it in pd.Series would align on index
# labels and silently produce NaN or dropped rows on a length or index
# mismatch, whereas a plain array raises if the lengths differ.
submissionOrig["Fatalities"] = preds



In [22]:
# Write the completed submission back to disk without the row index.
submissionOrig.to_csv(path_or_buf='submission.csv', index=False)