# The following gave worse results than xgboost, so it was dropped

In [1]:
import pandas as pd
import sklearn as sfs
import matplotlib.pyplot as plt
import numpy as np
import sys
sys.path.append('..')
from model_handler import ModelHandler
from feature_selection import FeatureSelectionAndGeneration
# Project helper object; this notebook reads its dataset, training mask and feature names.
handler = ModelHandler()
dataset = handler.dataset
# Rows selected by the handler's training mask.
# NOTE: the per-risk tuning loop further down rebinds `train_set` on every iteration.
train_set = dataset[handler.train_mask]

The dataset includes several different risks that each need a prediction. Each risk is treated as a separate target, i.e. a response variable.

The aim is to build a model able to predict each risk as accurately as possible. However, the learning process is different for each of them, meaning that the minimum set of variables that best explains the largest amount of variance in the dataset is unique for every risk. As a consequence, the following pipeline will be executed once per risk in order to return predictions that are as precise as possible.

# Dataset splitting

The first step consists of splitting the dataset into training and test sets. The former is used during the feature selection part, which is implemented using a boosted logistic regression model. This is a supervised learning approach, so labels are needed for the regression to be carried out. In this dataset risks are assigned to only some of the cities; therefore, it is sensible to use as the training set all the entries that contain a value for the given risk. All the remaining entries form the test set, used for the classification task, since those are the cities that need a prediction.

# Feature selection

When there is a highly non-linear and complex relationship between the predictors and the labels, decision trees are preferable. The dataset has many different predictors, and we don't know whether this relationship is linear or not.

The most robust approach among the ensemble methods is `Boosting`. It aggregates many decision trees and, unlike `Random Forest`, grows them sequentially instead of using bootstrap sampling as in `Bagging`.

The procedure consists of fitting small trees to the residuals in order to slowly reduce the prediction error. Generally, models that learn slowly tend to perform better. A pitfall of Boosting, however, is that it relies heavily on its tuning parameters. Hence, it's important to use `Cross Validation` in order to select the combination returning the best score for every target.
For this purpose we decided to use 3-fold cross validation (`cv=3` in the code below) in order to speed up the tuning process, which is already slow given the number of parameters that need to be optimized.

In [3]:
from sklearn.ensemble import AdaBoostRegressor

In [4]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
import shutil
import os
# Directory handed to sklearn's Pipeline as its `memory` cache location (see init_model).
memory_dir = '.pipeline_cache.tmp'

XGBoost's default objective function is `reg:squarederror`, which corresponds to a regression with mean squared error as the loss function.

In [5]:
from bayes_opt import BayesianOptimization
# Start from an empty pipeline cache: remove the cache directory left by a previous run, if any.
if os.path.isdir(memory_dir):
    shutil.rmtree(memory_dir)

def init_model(**model_params):
    """Build the two-step regression pipeline used for hyper-parameter tuning.

    The pipeline first applies ``FeatureSelectionAndGeneration`` (with
    ``feats_num=200``) and then an ``AdaBoostRegressor``.

    Args:
        **model_params: keyword arguments forwarded unchanged to
            ``AdaBoostRegressor``.

    Returns:
        An unfitted ``Pipeline`` whose ``memory`` is set to ``memory_dir``.
    """
    steps = [
        ('generation_and_selection', FeatureSelectionAndGeneration(feats_num=200)),
        ('regressor', AdaBoostRegressor(**model_params)),
    ]
    return Pipeline(steps, memory=memory_dir)
    

In [7]:
from sklearn.model_selection import cross_val_score
from data.labeled.preprocessed import RISKS_MAPPING
from classification import RANDOM_SEED
# Best hyper-parameters found for each risk, keyed by the risk identifier
# yielded by the handler.
optimal_params = {}
# Keyword arguments shared by every AdaBoostRegressor built below.
CONSTANTS = {"random_state": RANDOM_SEED}
for (risk, total_set, [train_set, valid_set]) in handler.get_total_train_val_set_per_risk():
    print(f"\n\n**Risk: {RISKS_MAPPING[risk]}**\n")
    print(f"Annotated Samples Size: {total_set.shape[0]}")
    print(f"To be used for parameters estimation: {train_set.shape[0]}\n")

    def evaluate(n_estimators, learning_rate):
        """Objective maximized by the Bayesian optimizer for the current risk.

        Args:
            n_estimators: candidate number of boosting stages; the optimizer
                proposes floats, so the value is truncated to int.
            learning_rate: candidate AdaBoost learning rate.

        Returns:
            Mean 3-fold cross-validated negative MSE on the current
            ``train_set`` (higher is better).
        """
        params = {'n_estimators': int(n_estimators),
                  'learning_rate': learning_rate}
        params.update(CONSTANTS)

        model = init_model(**params)
        train_tuple = (train_set[handler.feat_names], train_set[risk])
        # cross_val_score clones the pipeline and fits it on each fold itself,
        # so fitting on the full training set beforehand would be wasted work.
        return np.mean(cross_val_score(model, *train_tuple, cv=3,
                                       scoring='neg_mean_squared_error'))

    # The name `xgb_bo` is kept for compatibility with other cells; the
    # optimizer tunes the AdaBoost pipeline built by init_model.
    # Seeding the optimizer makes the explored points reproducible across runs.
    xgb_bo = BayesianOptimization(evaluate,
                                  {
                                      "n_estimators": [200, 500],
                                      "learning_rate": [0.1, 10]
                                  },
                                  random_state=RANDOM_SEED)

    # No acquisition function is specified here, so the library default is used.
    # 10 random initial points followed by 10 guided iterations; ideally both
    # numbers would be considerably larger.
    xgb_bo.maximize(init_points=10, n_iter=10)
    params = xgb_bo.max['params']
    # Same truncation as in evaluate(), so the stored value is the one that was scored.
    params['n_estimators'] = int(params['n_estimators'])
    params.update(CONSTANTS)
    optimal_params[risk] = params



**Risk: Higher water prices**

Annotated Samples Size: 87
To be used for parameters estimation: 60

|   iter    |  target   | learni... | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m-1.965   [0m | [0m 3.01    [0m | [0m 425.5   [0m |
| [0m 2       [0m | [0m-2.01    [0m | [0m 2.664   [0m | [0m 399.6   [0m |
| [95m 3       [0m | [95m-1.717   [0m | [95m 6.241   [0m | [95m 380.4   [0m |
| [0m 4       [0m | [0m-5.05    [0m | [0m 9.546   [0m | [0m 286.7   [0m |
| [0m 5       [0m | [0m-1.883   [0m | [0m 6.565   [0m | [0m 487.1   [0m |
| [0m 6       [0m | [0m-5.05    [0m | [0m 9.719   [0m | [0m 240.8   [0m |
| [95m 7       [0m | [95m-1.49    [0m | [95m 0.8848  [0m | [95m 440.6   [0m |
| [0m 8       [0m | [0m-2.481   [0m | [0m 3.183   [0m | [0m 495.0   [0m |
| [0m 9       [0m | [0m-4.05    [0m | [0m 6.968   [0m | [0m 375.0   [0m |
| [0m 10      [0m | [0m-1.53    [0m | [0m 0.274

| [0m 5       [0m | [0m-2.772   [0m | [0m 9.375   [0m | [0m 428.7   [0m |
| [0m 6       [0m | [0m-3.337   [0m | [0m 7.097   [0m | [0m 211.7   [0m |
| [0m 7       [0m | [0m-4.883   [0m | [0m 7.126   [0m | [0m 497.2   [0m |
| [0m 8       [0m | [0m-1.544   [0m | [0m 2.891   [0m | [0m 253.6   [0m |
| [0m 9       [0m | [0m-2.004   [0m | [0m 3.842   [0m | [0m 414.7   [0m |
| [0m 10      [0m | [0m-2.212   [0m | [0m 2.601   [0m | [0m 451.4   [0m |
| [0m 11      [0m | [0m-2.294   [0m | [0m 10.0    [0m | [0m 280.5   [0m |
| [95m 12      [0m | [95m-1.488   [0m | [95m 0.1     [0m | [95m 327.5   [0m |
| [0m 13      [0m | [0m-2.294   [0m | [0m 9.727   [0m | [0m 345.2   [0m |
| [0m 14      [0m | [0m-2.294   [0m | [0m 10.0    [0m | [0m 311.2   [0m |
| [0m 15      [0m | [0m-1.489   [0m | [0m 0.1     [0m | [0m 394.4   [0m |
| [0m 16      [0m | [0m-2.294   [0m | [0m 9.736   [0m | [0m 399.0   [0m |
| [0m 17   

In [6]:
# from data.model import MODEL_BEST_PARAMS_PATH
# pd.DataFrame(optimal_params).to_csv(MODEL_BEST_PARAMS_PATH)