In [17]:
import pandas as pd
import sklearn as sfs
import matplotlib.pyplot as plt
import numpy as np
import sys
sys.path.append('..')
from model_handler import ModelHandler
from feature_selection import FeatureSelectionAndGeneration
handler = ModelHandler()
dataset = handler.dataset
train_set = dataset[handler.train_mask]
test_set = dataset[~handler.train_mask]

In [24]:
import random
# y_test = [random.randrange(4) for x in range(len(test_set))] -> I don't understand if you retrieve it from somewhere else

The dataset includes different risks that need a prediction. Every risk is considered as a different target of labels, namely a response variable.

The aim is to build a model able to predict each risk in the most accurate way possible. However, the learning process is different for each of them, meaning that the minimum set of variables that best explain the largest amount of variance in the dataset is unique for every risk. As a consequence, the following pipeline will be executed as much time as the number of risks in order to return as more precise predictions as possible. 

# Dataset splitting

The first step consists in splitting the dataset into training and test sets. The first will be used during the feature selection part, which is implemented using a boosted logistic regression model. This is a supervised learning approach, thus labels are needed for the regression to be carried out. In this dataset risks are assigned to only some of the cities, therefore it's wise to select as training set all the entries containing values for the given risk. All the rest will be referred to as test set, used for the classification task, since those cities will be the ones needing a prediction.

# Feature selection

When there is a highly non-linear and complex relationship between the predictors and the labels decision trees are preferable. The dataset has many different predictors and we don't know whether this relationship is linear or not.

The most robust approach among the ensemble method is `Boosting`. It allows to aggregate many decision trees, differently from `Random Forest`, and grow them sequentially, instead of using boostrap sampling like in `Bagging`. 

The procedure consists in fitting small trees to the residuals in order to slowly improve the prediction error. Generally, model that learn slowly tend to perform better. A pitfall of Boosting, however, is that it relies very much on its tuning parameters. Hence, it's important to undergo `Cross Validation` in order to select the combination returning the highest accuracy, for every target. 
For this purpose we decided to use 10-fold cross validation in such a way to speed up the tuning process, which is already slow given the amount of parameters that need to be optimized.

In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
import shutil
import os
memory_dir = '.pipeline_cache.tmp'

XgBoost has as default objective function `reg:squarederror`, which corresponds to a linear regression with mean-squared error as loss function.

In [8]:
!pip install bayesian-optimization



In [25]:
from bayes_opt import BayesianOptimization
if os.path.isdir(memory_dir):
    shutil.rmtree(memory_dir)

def init_model(**model_params):
    return Pipeline([('generation_and_selection', FeatureSelectionAndGeneration(feats_num=200)), ('classifier', xgb.XGBClassifier(**model_params,use_label_encoder=False))],memory=memory_dir)
    



# def boosting_reg(train, y_train, risk, best_parameters):
    
#     '''Cross Validation'''

In [26]:
from sklearn.model_selection import cross_val_score
from data.labeled.preprocessed import RISKS_MAPPING
optimal_params = {}
CONSTANTS = {'subsample': 0.8}
for (risk, total_set, [train_set, valid_set]) in handler.get_total_train_val_set_per_risk():
    print(f"\n\n**Risk: {RISKS_MAPPING[risk]}**")
    print(f"Annotated Samples Size: {total_set.shape[0]}\n")
    def xgb_evaluate(max_depth, 
                     gamma, 
                     alpha,
                     colsample_bytree, n_estimators):
        params = {'max_depth': int(max_depth),
                  'subsample': 0.8,
                  'alpha': alpha,
                  'gamma': gamma,
                  'colsample_bytree': colsample_bytree,
                   'n_estimators': int(n_estimators),
                  "eval_metric": 'merror'}
        params.update(CONSTANTS)

        model = init_model(**params)
        train_tuple = (total_set[handler.feat_names], total_set[risk])
        reg_cv = model.fit(*train_tuple)
        cv_result = np.mean(cross_val_score(model, *train_tuple, cv=3))
        return cv_result
    xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (1, 7), 
                                                 'alpha': (0,20),
                                                 'gamma': (0, 1),
                                                 'colsample_bytree': (0.3, 0.9),
                                                 "n_estimators":[200,1000],
                                                }
                                  
                                 )
    
    # Use the expected improvement acquisition function to handle negative numbers
    # Optimally needs quite a few more initiation points and number of iterations
    xgb_bo.maximize(init_points=10, n_iter=10)
    params = xgb_bo.max['params']
    params['max_depth'] = int(params['max_depth'])
    params['n_estimators'] = int(params['n_estimators'])
    params.update(CONSTANTS)
    optimal_params[risk] = params



**Risk: Higher water prices**
Annotated Samples Size: 87

|   iter    |  target   |   alpha   | colsam... |   gamma   | max_depth | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5287  [0m | [0m 8.261   [0m | [0m 0.5943  [0m | [0m 0.9781  [0m | [0m 1.554   [0m | [0m 320.8   [0m |
| [0m 2       [0m | [0m 0.4943  [0m | [0m 6.947   [0m | [0m 0.8533  [0m | [0m 0.2975  [0m | [0m 1.408   [0m | [0m 758.0   [0m |
| [0m 3       [0m | [0m 0.4713  [0m | [0m 3.671   [0m | [0m 0.6832  [0m | [0m 0.05666 [0m | [0m 5.44    [0m | [0m 684.4   [0m |
| [0m 4       [0m | [0m 0.5287  [0m | [0m 9.094   [0m | [0m 0.7876  [0m | [0m 0.7497  [0m | [0m 3.975   [0m | [0m 790.8   [0m |
| [0m 5       [0m | [0m 0.5287  [0m | [0m 17.54   [0m | [0m 0.4289  [0m | [0m 0.6056  [0m | [0m 4.397   [0m | [0m 445.3   [0m |
| [0m 6       [0m | [0m 0.4598  [0m | [0m 2.145   [0

| [0m 13      [0m | [0m 0.7318  [0m | [0m 9.79    [0m | [0m 0.7216  [0m | [0m 0.3374  [0m | [0m 2.821   [0m | [0m 393.8   [0m |
| [0m 14      [0m | [0m 0.7318  [0m | [0m 12.23   [0m | [0m 0.8831  [0m | [0m 0.7808  [0m | [0m 3.342   [0m | [0m 641.8   [0m |
| [0m 15      [0m | [0m 0.7318  [0m | [0m 20.0    [0m | [0m 0.9     [0m | [0m 1.0     [0m | [0m 1.0     [0m | [0m 718.8   [0m |
| [0m 16      [0m | [0m 0.7318  [0m | [0m 13.44   [0m | [0m 0.835   [0m | [0m 0.5752  [0m | [0m 5.261   [0m | [0m 567.0   [0m |
| [0m 17      [0m | [0m 0.7318  [0m | [0m 11.65   [0m | [0m 0.4412  [0m | [0m 0.3482  [0m | [0m 4.168   [0m | [0m 900.4   [0m |
| [0m 18      [0m | [0m 0.4981  [0m | [0m 0.2669  [0m | [0m 0.843   [0m | [0m 0.04698 [0m | [0m 5.645   [0m | [0m 882.2   [0m |
| [0m 19      [0m | [0m 0.5019  [0m | [0m 0.0     [0m | [0m 0.5942  [0m | [0m 0.7009  [0m | [0m 1.0     [0m | [0m 562.1   [0m |
| [0m

| [0m 4       [0m | [0m 0.6926  [0m | [0m 2.66    [0m | [0m 0.877   [0m | [0m 0.9934  [0m | [0m 5.448   [0m | [0m 847.5   [0m |
| [0m 5       [0m | [0m 0.6926  [0m | [0m 4.474   [0m | [0m 0.303   [0m | [0m 0.3728  [0m | [0m 6.371   [0m | [0m 436.0   [0m |
| [0m 6       [0m | [0m 0.7078  [0m | [0m 8.586   [0m | [0m 0.4623  [0m | [0m 0.7262  [0m | [0m 1.18    [0m | [0m 916.3   [0m |
| [0m 7       [0m | [0m 0.5253  [0m | [0m 2.098   [0m | [0m 0.8703  [0m | [0m 0.03036 [0m | [0m 5.388   [0m | [0m 639.8   [0m |
| [0m 8       [0m | [0m 0.7078  [0m | [0m 4.846   [0m | [0m 0.3938  [0m | [0m 0.7002  [0m | [0m 4.126   [0m | [0m 897.2   [0m |
| [0m 9       [0m | [0m 0.6169  [0m | [0m 1.823   [0m | [0m 0.6433  [0m | [0m 0.5901  [0m | [0m 1.939   [0m | [0m 951.9   [0m |
| [0m 10      [0m | [0m 0.6926  [0m | [0m 3.542   [0m | [0m 0.7987  [0m | [0m 0.7036  [0m | [0m 2.044   [0m | [0m 656.8   [0m |
| [0m

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (9.224291665943925, 0.405069903776063, 0.38451774224936985, 5.632823250027073, 927.7087637209715)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise V

| [0m 1       [0m | [0m nan     [0m | [0m 9.224   [0m | [0m 0.4051  [0m | [0m 0.3845  [0m | [0m 5.633   [0m | [0m 927.7   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (17.885140702387325, 0.6299477709799476, 0.46234766111048287, 2.2983450693476817, 977.4586737343027)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    rais

| [0m 2       [0m | [0m nan     [0m | [0m 17.89   [0m | [0m 0.6299  [0m | [0m 0.4623  [0m | [0m 2.298   [0m | [0m 977.5   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (16.42403197508265, 0.8312044559606662, 0.6892187336569529, 1.6240213815400224, 589.5250062434823)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise 

| [0m 3       [0m | [0m nan     [0m | [0m 16.42   [0m | [0m 0.8312  [0m | [0m 0.6892  [0m | [0m 1.624   [0m | [0m 589.5   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (9.845707460002098, 0.3572957464812805, 0.7237174858251034, 1.764959627381097, 444.24890791386395)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise 

| [0m 4       [0m | [0m nan     [0m | [0m 9.846   [0m | [0m 0.3573  [0m | [0m 0.7237  [0m | [0m 1.765   [0m | [0m 444.2   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (11.19192634866512, 0.7806634823754345, 0.1774749509121949, 1.3259293360069417, 921.4651034409363)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise 

| [0m 5       [0m | [0m nan     [0m | [0m 11.19   [0m | [0m 0.7807  [0m | [0m 0.1775  [0m | [0m 1.326   [0m | [0m 921.5   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (5.43293847686259, 0.802894410527998, 0.8776710165710168, 5.432261527868649, 347.731657479545)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise Valu

| [0m 6       [0m | [0m nan     [0m | [0m 5.433   [0m | [0m 0.8029  [0m | [0m 0.8777  [0m | [0m 5.432   [0m | [0m 347.7   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (14.53743273094572, 0.701686896216219, 0.10935991946107115, 4.052015990438923, 618.0029600001818)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise V

| [0m 7       [0m | [0m nan     [0m | [0m 14.54   [0m | [0m 0.7017  [0m | [0m 0.1094  [0m | [0m 4.052   [0m | [0m 618.0   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (9.065329476921924, 0.8544750746233969, 0.36315833836611344, 4.392416366632438, 883.3501831683908)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise 

| [0m 8       [0m | [0m nan     [0m | [0m 9.065   [0m | [0m 0.8545  [0m | [0m 0.3632  [0m | [0m 4.392   [0m | [0m 883.4   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (12.152941308638823, 0.6331169762754207, 0.8796874153693808, 3.6370908607322554, 939.587050160504)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise 

| [0m 9       [0m | [0m nan     [0m | [0m 12.15   [0m | [0m 0.6331  [0m | [0m 0.8797  [0m | [0m 3.637   [0m | [0m 939.6   [0m |


Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/bayes_opt/target_space.py", line 191, in probe
    target = self._cache[_hashable(x)]
KeyError: (11.31601818770167, 0.8752055834147827, 0.47340905064803973, 3.9628918660171752, 579.3553674878542)

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/sklearn/pipeline.py", line 346, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/core.py", line 436, in inner_f
    return f(**kwargs)
  File "/Users/antonella/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py", line 1123, in fit
    raise

| [0m 10      [0m | [0m nan     [0m | [0m 11.32   [0m | [0m 0.8752  [0m | [0m 0.4734  [0m | [0m 3.963   [0m | [0m 579.4   [0m |


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
#from data.model import MODEL_BEST_PARAMS_PATH
#pd.DataFrame(optimal_params).to_csv(MODEL_BEST_PARAMS_PATH)

In [30]:
optimal_params = {'risk0': {'alpha': 8.261465084688773,
  'colsample_bytree': 0.5942798565322505,
  'gamma': 0.9780710626692427,
  'max_depth': 1,
  'n_estimators': 320,
  'subsample': 0.8},
 'risk1': {'alpha': 13.90401059454134,
  'colsample_bytree': 0.8285387334623371,
  'gamma': 0.4349182425943219,
  'max_depth': 5,
  'n_estimators': 968,
  'subsample': 0.8},
 'risk2': {'alpha': 17.27853343713799,
  'colsample_bytree': 0.588032054558853,
  'gamma': 0.0981480232590467,
  'max_depth': 6,
  'n_estimators': 666,
  'subsample': 0.8},
 'risk3': {'alpha': 8.192315865044893,
  'colsample_bytree': 0.7562427208831419,
  'gamma': 0.9169370907046851,
  'max_depth': 2,
  'n_estimators': 485,
  'subsample': 0.8},
 'risk4': {'alpha': 7.922430928147,
  'colsample_bytree': 0.807955446502681,
  'gamma': 0.7970662889259271,
  'max_depth': 4,
  'n_estimators': 950,
  'subsample': 0.8},
 'risk5': {'alpha': 13.362754510732369,
  'colsample_bytree': 0.6246517434414811,
  'gamma': 0.6516365370180226,
  'max_depth': 1,
  'n_estimators': 415,
  'subsample': 0.8}}

def classification(train_tuple, valid_set, y_test, risk, best_parameters):
    
    model = xgb.XGBClassifier(use_label_encoder=False,**best_parameters[risk])
    model.fit(*train_tuple)
    
    y_pred = model.predict(valid_set)
    predictions = [value for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print(risk, accuracy*100.0)
    return predictions

predictions = dict()
#risks = optimal_params.keys()

'Iterate on the risks and get the partitioned data before running the classification() function'
for (risk, total_set, [train_set, valid_set]) in handler.get_total_train_val_set_per_risk():
    train_tuple = (total_set[handler.feat_names], total_set[risk])
    #train, test, y_train, y_test = data_splitting(dt,risk)
    predictions[risk] = classification(train_tuple, valid_set, y_test, risk, optimal_params)



ValueError: Feature shape mismatch, expected: 148, got 27