In [11]:
import pandas as pd
import numpy as np
# Load the train and test CSV files, using the first column of each file as the DataFrame index.
df_train = pd.read_csv('train.csv',index_col=0)
df_test = pd.read_csv('test.csv',index_col=0)

class etl:
    """Prepare the weekly per-city frame for modelling.

    Use cases: a random train/test split for training, the full transformed
    frame for making predictions, and either of those separately per city.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'city', 'week_start_date' and 'year' columns. The frame
        passed in is never modified.
    training : bool, default True
        When True, ``load`` returns random train/test splits; otherwise it
        returns the full transformed data.
    split_cities : bool, default True
        When True, ``load`` returns separate frames for 'sj' and 'iq' with the
        'city' column removed.
    split : float, default 0.8
        Probability that a row is assigned to the training part.
    seed : int or None, default None
        Seed for a private random generator used by ``split_data``. With None
        the global ``np.random`` state is used, so splits differ between runs
        unless the caller seeds numpy.

    Raises
    ------
    ValueError
        If ``split`` is outside [0, 1].
    """

    def __init__(self,data,training=True,split_cities=True,split=0.8,seed=None):
        if not 0.0 <= split <= 1.0:
            raise ValueError("split must be between 0 and 1, got %r" % (split,))
        # Keep the untouched input so transform() can be re-run safely.
        self._source = data
        self.data = data
        self.training = training
        self.split = split
        self.split_cities = split_cities
        self.seed = seed
        self._rng = np.random.RandomState(seed) if seed is not None else None

    def transform(self):
        """Forward-fill nulls per city, drop date/year columns and encode the city.

        Rows are ordered by 'week_start_date' within each city before filling,
        and the result is 'sj' rows followed by 'iq' rows. Rows whose city is
        neither 'sj' nor 'iq' are dropped. The result is stored on ``self.data``
        and returned; it is always derived from the constructor's frame, so
        calling this method (or ``load``) more than once is safe.
        """
        per_city = []
        for code in ('sj', 'iq'):
            rows = self._source[self._source['city'] == code]
            rows = rows.sort_values(by='week_start_date', ascending=True)
            # .ffill() replaces the deprecated fillna(method='ffill').
            per_city.append(rows.ffill())
        combined = pd.concat(per_city, axis=0)
        self.data = combined.drop(['week_start_date', 'year'], axis=1)
        self.data = self.format_city()
        return self.data

    def format_city(self):
        """Convert the city into a machine-readable variable: 1 for 'sj', 0 otherwise."""
        encoded = (self.data['city'] == 'sj').astype('int64')
        # assign() builds a new frame instead of writing into the existing one.
        self.data = self.data.assign(city=encoded)
        return self.data

    def split_data(self,df):
        """Randomly split ``df`` into (train, test); each row goes to train with probability ``self.split``."""
        if self._rng is not None:
            draws = self._rng.rand(len(df))
        else:
            draws = np.random.rand(len(df))
        mask = draws < self.split
        return df[mask], df[~mask]

    def _by_city(self, data):
        """Return (sj, iq) frames from transformed data, without the 'city' column."""
        sj = data[data['city'] == 1].drop('city', axis=1)
        iq = data[data['city'] == 0].drop('city', axis=1)
        return sj, iq

    def load(self):
        """Transform the data and return it in the shape selected at construction.

        Returns
        -------
        training and split_cities : (sj_train, sj_test, iq_train, iq_test)
        training only             : (train, test)
        split_cities only         : (sj, iq)
        neither                   : the full transformed frame
        """
        data = self.transform()
        if self.split_cities:
            sj, iq = self._by_city(data)
            if not self.training:
                return sj, iq
            sj_train, sj_test = self.split_data(sj)
            iq_train, iq_test = self.split_data(iq)
            return sj_train, sj_test, iq_train, iq_test
        if self.training:
            return self.split_data(data)
        return data
    
# Random train/test split of the transformed training data (both cities together).
train, test = etl(df_train,split_cities=False).load()
# Every original column except the two that the ETL drops; the target is removed below.
features = [j for j in df_train.columns if j not in ['week_start_date','year']]
target = 'total_cases'
X_train = train[features].drop(target,axis=1)
X_test = test[features].drop(target,axis=1)
y_train = train[target]
y_test = test[target]

# NOTE(review): later cells call fit(X, y) and train_test_split(X, y), but no
# visible cell defines X or y. They are assumed here to be the full (unsplit)
# transformed training set - confirm this matches the original intent.
data_full = etl(df_train,training=False,split_cities=False).load()
X = data_full[features].drop(target,axis=1)
y = data_full[target]

Unnamed: 0,city,weekofyear,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,...,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,1,18,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,...,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,1,19,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,...,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,1,20,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,...,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
5,1,23,0.1962,0.17485,0.254314,0.181743,9.58,299.63,299.764286,295.851429,...,79.891429,9.58,17.212857,2.1,28.114286,6.942857,34.4,23.9,39.1,2
6,1,24,0.1129,0.0928,0.205071,0.210271,3.48,299.207143,299.221429,295.865714,...,82.0,3.48,17.234286,2.042857,27.414286,6.771429,32.2,23.3,29.7,4


In [16]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error,make_scorer
import warnings
# NOTE: this silences every warning raised in the kernel from here on.
warnings.filterwarnings('ignore')

# Mean absolute error is a loss, so greater_is_better=False makes sklearn negate
# it; model selection then maximises the (negative) score. The grid-search
# cells below reference this name.
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Random-forest pipeline: standardise the features, then fit a default forest.
pipe_rf = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestRegressor()),
    ]
)

In [59]:
# Exhaustive 4-fold grid search over random-forest hyper-parameters.
criterion = ["squared_error", "absolute_error", "poisson"]
max_depth = [None,5,10,20]
max_features = ["sqrt", "log2", None]
# Each hyper-parameter appears exactly once (the original literal repeated 'max_depth').
rf_parameters = {'criterion':criterion,'max_depth':max_depth,'max_features':max_features}
rf = RandomForestRegressor()
# 'neg_mean_absolute_error' is sklearn's built-in negated-MAE scorer, so this
# cell does not depend on a separately defined `scorer` object.
tree = GridSearchCV(rf, rf_parameters,scoring='neg_mean_absolute_error',cv=4)
tree.fit(X,y)
print(tree.best_estimator_)
print(tree.best_score_)

RandomForestRegressor(criterion='poisson', max_features='log2')
-21.816684240993045


In [70]:
# Exhaustive 4-fold grid search over HistGradientBoostingRegressor loss and learning rate.
# "quantile" is left out: it requires the estimator's `quantile` parameter, which
# this grid never sets, so those candidates could not be fitted.
loss = ["squared_error", "absolute_error", "poisson"]
learning_rate = np.arange(0.001,2,0.01)
hgbr_params = {'loss':loss,'learning_rate':learning_rate}
hgbr = HistGradientBoostingRegressor()
# 'neg_mean_absolute_error' is sklearn's built-in negated-MAE scorer, so this
# cell does not depend on a separately defined `scorer` object.
hgbr_gs = GridSearchCV(hgbr, hgbr_params,scoring='neg_mean_absolute_error',cv=4)
hgbr_gs.fit(X,y)
print(hgbr_gs.best_estimator_)
print(hgbr_gs.best_score_)

HistGradientBoostingRegressor(learning_rate=0.08099999999999999, loss='poisson')
-16.973879517402874


In [None]:
# Unfinished scratch loop: the body only evaluates `hgbr` and does no work yet.
# The iterable was misspelled `learing_rate`, which raised NameError.
for lr in learning_rate:
    hgbr

In [93]:
import numpy as np
class count_model:
    """Wrap a regressor so that its predictions are usable as counts.

    Parameters
    ----------
    model : callable
        Estimator class (or factory) exposing ``fit(X, y)`` and ``predict(X)``.
    params : dict or None, default None
        Keyword arguments passed to ``model``; when None the estimator is
        built with its defaults.
    """
    def __init__(self,model,params=None):
        # Build the estimator exactly once, so estimators with required
        # arguments work when `params` supplies them.
        self.model = model(**params) if params is not None else model()

    def fit(self,X,y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.model.fit(X,y)

    def predict(self,X):
        """Predict with the wrapped estimator and sanitise the result.

        NaN, +/-infinity and negative predictions are replaced by 0.

        Returns
        -------
        list
            One non-negative finite value per prediction.
        """
        raw = np.asarray(self.model.predict(X), dtype=float)
        # numpy arrays have no fillna(); mask the non-finite entries explicitly.
        finite = np.where(np.isfinite(raw), raw, 0.0)
        return list(np.maximum(finite, 0.0))

In [119]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Manual search over loss and learning rate, scored by MAE on the held-out split.
# "absolute_error" is the current name of the loss formerly spelled
# "least_absolute_deviation"; it matches the spelling used by the other cells.
loss = ["poisson", "absolute_error"]
learning_rate = np.arange(0.001,2,0.01)
# (An incomplete, unused `max_leaf_nodes = ` assignment was removed: it was a syntax error.)

best_err = np.inf  # any real error beats the sentinel
best_params = []
for l in loss:
    for rate in learning_rate:
        hgbr = HistGradientBoostingRegressor(loss=l,learning_rate=rate)
        pipe_hgbr = Pipeline([('scaler', StandardScaler()), ('hgbr',hgbr)])
        pipe_hgbr.fit(X_train,y_train)
        y_pred = pipe_hgbr.predict(X_test)
        # Infinite predictions are scored as 0 (the bare name `inf` was undefined).
        y_pred[y_pred == np.inf] = 0
        err = mean_absolute_error(y_test,y_pred)
        if err<best_err:
            best_err = err
            # Single-entry list, same shape as before: [[loss, learning_rate]].
            best_params = [[l,rate]]
print(best_err)
print(best_params)

13.376952060247417
[['poisson', 0.10099999999999998]]


In [121]:
# Training-set MAE of `pipe_hgbr`, with infinite predictions scored as 0.
# NOTE(review): the search loops rebind `pipe_hgbr` on every iteration, so this
# looks like the last candidate fitted rather than the best one - confirm.
y_pred_train = pipe_hgbr.predict(X_train)
y_pred_train[np.isposinf(y_pred_train)] = 0
err_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train)
print(err_train)

10.02345161934709


In [123]:
import random
random.randint(0, 9)

AttributeError: module 'numpy' has no attribute 'randint'

In [129]:
from sklearn.model_selection import train_test_split
import random
# Repeat the learning-rate search on 10 different random 67/33 splits and
# report the mean of the best held-out MAE found on each split.
best_hgbr_params = []
best_hgbr_scores = []
for i in range(10):
    best_err = np.inf  # any real error beats the sentinel, so it never enters the mean
    best_params = []
    # The repetition index is the split seed, so reruns reproduce the same splits.
    # Loop-local names keep the notebook-level X_train/X_test/y_train/y_test intact.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=i)
    for rate in learning_rate:
        hgbr = HistGradientBoostingRegressor(loss='poisson',learning_rate=rate)
        pipe_hgbr = Pipeline([('scaler', StandardScaler()), ('hgbr',hgbr)])
        pipe_hgbr.fit(X_tr,y_tr)
        y_pred = pipe_hgbr.predict(X_te)
        # Infinite predictions are scored as 0.
        y_pred[y_pred==np.inf]=0
        err = mean_absolute_error(y_te,y_pred)
        if err<best_err:
            best_err = err
            # Single-entry list, same shape as before: [['poisson', learning_rate]].
            best_params = [['poisson',rate]]
    best_hgbr_params.append(best_params)
    best_hgbr_scores.append(best_err)
print(np.mean(best_hgbr_scores))

15.158274092520951


In [136]:
best_hgbr_params

[[['least_absolute_deviation', 0.11099999999999999]],
 [['least_absolute_deviation', 0.17099999999999999]],
 [['least_absolute_deviation', 0.20099999999999996]],
 [['least_absolute_deviation', 0.20099999999999996]],
 [['least_absolute_deviation', 0.20099999999999996]],
 [['least_absolute_deviation', 0.23099999999999996]],
 [['least_absolute_deviation', 0.141]],
 [['least_absolute_deviation', 0.08099999999999999]],
 [['least_absolute_deviation', 0.31099999999999994]],
 [['least_absolute_deviation', 0.18099999999999997]]]

In [115]:
# Manual random-forest search: every (criterion, max_depth, max_features)
# combination is fitted on the training split and scored by held-out MAE.
criterion = ["poisson"]
max_depth = [None,20,25,30,35]
max_features = ["sqrt", "log2", None]

best_err = 1000
best_params = []

# Flattened grid, in the same order the nested loops would visit it.
rf_grid = [(c, d, f) for c in criterion for d in max_depth for f in max_features]

for c, d, f in rf_grid:
    rf = RandomForestRegressor(criterion=c,max_depth=d,max_features=f)
    pipe_rf = Pipeline([('scaler', StandardScaler()), ('rf',rf)])
    pipe_rf.fit(X_train,y_train)
    y_pred = pipe_rf.predict(X_test)
    # Infinite and negative predictions are scored as 0.
    y_pred = np.where((y_pred==np.inf) | (y_pred<0), 0, y_pred)
    err = mean_absolute_error(y_test,y_pred)
    if err<best_err:
        best_err = err
        # Keep only the current best combination, wrapped in a one-element list.
        best_params = [[c,d,f]]

print(best_err)
print(best_params)

19.8796857679169
[['poisson', 25, 'log2']]


In [108]:
best_err

13.376952060247417