In [15]:
import pandas as pd 
import catboost as cb 
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator

In [2]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings raised by the
# libraries used in later cells — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Raw feature table, indexed by building_id.
train_values = pd.read_csv('train_values.csv', index_col='building_id')
# A second feature table read from its own CSV, with the same index column.
train_values1 = pd.read_csv('train_values_short1.csv', index_col='building_id')
# "Baseline" features: the raw table passed through pd.get_dummies.
train_values_baseline = pd.get_dummies(train_values)
# "short1" features: train_values1 plus the three geo_level_*_id columns of the raw
# table, matched on the building_id index of both frames.
train_values_short1 = train_values1.merge(train_values[["geo_level_1_id", "geo_level_2_id", "geo_level_3_id"]],
                                            left_index=True, right_index=True)
# Feature tables read from separate CSVs, one for CatBoost and one for LightGBM.
# NOTE(review): presumably produced by other notebooks — provenance is not visible here.
train_values_catboost = pd.read_csv("train_values_catboost.csv", index_col='building_id')
train_values_lgbm = pd.read_csv("train_values_lightGBM.csv", index_col='building_id')

# Labels, flattened to 1-D and encoded by LabelEncoder into integer codes; `le` is kept
# so the codes can be mapped back with le.inverse_transform.
train_labels = pd.read_csv('train_labels.csv', index_col='building_id')
le = LabelEncoder()
train_labels_encoded = le.fit_transform(y=train_labels.values.ravel())

In [4]:
# Every CatBoost column except the two geo-id summary columns is cast to the pandas
# "category" dtype and renamed with a '_cat' suffix. The rename keeps these columns
# apart from same-named (but different) columns in the other feature tables.
_renombres = {
    nombre: nombre + '_cat'
    for nombre in train_values_catboost.columns
    if nombre not in ('min_geo_id', 'max_geo_id')
}
# New (suffixed) names, in column order; later handed to the ensemble as the
# CatBoost categorical feature list.
categoricas = list(_renombres.values())
for _original in _renombres:
    train_values_catboost[_original] = train_values_catboost[_original].astype("category")
# One rename for all columns, still in place on the same DataFrame object.
train_values_catboost.rename(columns=_renombres, inplace=True)

In [5]:
# Column index of each feature table; the ensemble uses these to pick the columns
# each sub-model is fitted and scored on. They are captured here, at this point in
# the notebook, so each holds only its own table's columns.
cols_baseline, cols_short1, cols_catboost, cols_lgbm = (
    tabla.columns
    for tabla in (
        train_values_baseline,
        train_values_short1,
        train_values_catboost,
        train_values_lgbm,
    )
)

In [6]:
# Guardo todas las columnas en el mismo DF
train_values_baseline.loc[:,cols_short1] = train_values_short1
train_values_baseline.loc[:,cols_catboost] = train_values_catboost
train_values_baseline.loc[:, cols_lgbm] = train_values_lgbm
train_values = train_values_baseline

In [14]:
# Stand-alone XGBoost classifier built with the same arguments EncontradorPesos uses
# for its baseline model; in the visible cells it is only inspected, never fitted.
xgb_baseline = XGBClassifier(random_state=2021, max_depth= 10, n_estimators= 273,
                                                         verbosity=0, use_label_encoder=False)
# NOTE(review): `get_params` is referenced but not called, so the cell shows the bound
# method's repr instead of the parameter dict; `get_params()` would return the dict.
xgb_baseline.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=10,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=273, n_jobs=None, num_parallel_tree=None,
              random_state=2021, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None, verbosity=0)>

In [16]:
class EncontradorPesos(ClassifierMixin, BaseEstimator):
    """Weighted soft-voting ensemble of two XGBoost models, a CatBoost model and a LightGBM model.

    Each sub-model is fitted and queried on its own subset of the columns of ``X``.
    The subsets come from the notebook-level variables ``cols_short1``,
    ``cols_baseline``, ``cols_catboost`` and ``cols_lgbm``, which must be defined
    before ``fit`` or ``predict`` is called.

    Parameters
    ----------
    peso_xgb_baseline : float
        Weight applied to the class probabilities of the baseline XGBoost model.
    peso_xgb_short1 : float
        Weight applied to the class probabilities of the "short1" XGBoost model.
    peso_catboost : float
        Weight applied to the class probabilities of the CatBoost model.
    peso_lgbm : float
        Weight applied to the class probabilities of the LightGBM model.
    categoricas_catboost : list
        Passed as the third positional argument of ``CatBoostClassifier.fit``
        (``cat_features`` in the CatBoost API).

    Notes
    -----
    The sub-estimators are created in ``__init__`` with fixed hyper-parameters;
    only the five constructor arguments above are stored as estimator parameters.
    """

    def __init__(self, peso_xgb_baseline, peso_xgb_short1, peso_catboost, peso_lgbm, categoricas_catboost):
        self.peso_xgb_baseline = peso_xgb_baseline
        self.xgb_baseline = XGBClassifier(random_state=2021, max_depth=10, n_estimators=273,
                                          verbosity=0, use_label_encoder=False)

        self.peso_xgb_short1 = peso_xgb_short1
        self.xgb_short1 = XGBClassifier(objective="multi:softmax", use_label_encoder=False, seed=30,
                                        max_depth=9,
                                        min_child_weight=6,
                                        gamma=0.2554138025988315,
                                        colsample_bytree=0.7773754946860542,
                                        subsample=0.9030471581301206,
                                        learning_rate=0.1,
                                        n_estimators=273,
                                        verbosity=0)

        self.peso_catboost = peso_catboost
        self.catboost = cb.CatBoostClassifier(loss_function='MultiClass', eval_metric='TotalF1:average=Micro',
                                              iterations=100, verbose=False)
        self.categoricas_catboost = categoricas_catboost

        # NOTE(review): objective='regression' on a classifier looks unintended —
        # confirm what LightGBM actually trains with here before changing it.
        self.lgbm = LGBMClassifier(random_state=2021, num_iterations=273,
                                   num_leaves=70, objective='regression', colsample_bytree=0.77)
        self.peso_lgbm = peso_lgbm

    def fit(self, X, y):
        """Fit the four sub-models, each on its own column subset of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in the four ``cols_*`` variables.
        y : array-like
            Class labels, handed unchanged to every sub-model.

        Returns
        -------
        self : EncontradorPesos
            The fitted estimator, as the scikit-learn estimator API expects.
        """
        self.xgb_short1.fit(X[cols_short1], y)
        self.xgb_baseline.fit(X[cols_baseline], y)
        self.catboost.fit(X[cols_catboost], y, self.categoricas_catboost)
        self.lgbm.fit(X[cols_lgbm], y)
        return self

    def predict(self, X):
        """Predict by weighted soft voting over the four sub-models.

        Each model's ``predict_proba`` output is multiplied by its weight, the four
        weighted matrices are added, and the column with the largest total is chosen
        for every row. Ties go to the lowest column position.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column named in the four ``cols_*`` variables.

        Returns
        -------
        numpy.ndarray of shape (n_rows,)
            Column position of the winning class for each row. The positions equal
            the labels only when ``y`` was encoded as consecutive integers starting
            at 0.
        """
        # Same model order and same left-to-right summation as a row-by-row loop,
        # done on whole arrays and not tied to a fixed number of classes.
        probas_ponderadas = (
            np.asarray(self.xgb_short1.predict_proba(X[cols_short1])) * self.peso_xgb_short1
            + np.asarray(self.xgb_baseline.predict_proba(X[cols_baseline])) * self.peso_xgb_baseline
            + np.asarray(self.catboost.predict_proba(X[cols_catboost])) * self.peso_catboost
            + np.asarray(self.lgbm.predict_proba(X[cols_lgbm])) * self.peso_lgbm
        )
        return np.argmax(probas_ponderadas, axis=1)

In [17]:
def funcion_a_optimizar(peso_baseline, peso_short1, peso_lgbm):
    """Objective for the weight search: mean cross-validated micro-F1 of the ensemble.

    Builds an ``EncontradorPesos`` with the given weights (the CatBoost weight is
    held at 1), scores it with 8-fold ``cross_val_score`` on the notebook-level
    ``train_values`` / ``train_labels_encoded``, prints the score and the weights,
    and returns the mean score.

    Parameters
    ----------
    peso_baseline : float
        Weight of the baseline XGBoost model.
    peso_short1 : float
        Weight of the "short1" XGBoost model.
    peso_lgbm : float
        Weight of the LightGBM model.

    Returns
    -------
    float
        Mean of the per-fold ``f1_micro`` scores.
    """
    ensamble = EncontradorPesos(
        peso_xgb_baseline=peso_baseline,
        peso_xgb_short1=peso_short1,
        peso_catboost=1,
        peso_lgbm=peso_lgbm,
        categoricas_catboost=categoricas,
    )

    puntajes_por_fold = cross_val_score(
        ensamble, train_values, train_labels_encoded, scoring='f1_micro', cv=8
    )
    score = puntajes_por_fold.mean()

    # Progress report, one line per value, followed by a blank line.
    reporte = (
        f"Score: {score!s}",
        f"\t Peso baseline: {peso_baseline!s}",
        f"\t Peso short1: {peso_short1!s}",
        f"\t Peso lgbm: {peso_lgbm!s}",
        "",
    )
    for linea in reporte:
        print(linea)

    return score

In [18]:
# (lower, upper) search interval for each weight explored by the optimizer.
limites = dict(
    peso_baseline=(4, 5),
    peso_short1=(3, 4),
    peso_lgbm=(1.5, 2.5),
)

In [19]:
# Logger that records optimisation steps in a JSON file next to the notebook.
logger = JSONLogger(path="./logs_ultima_carta1.json")

# Bayesian optimiser over the weight intervals in `limites`, seeded for repeatability.
optimizer = BayesianOptimization(f=funcion_a_optimizar, pbounds=limites, random_state=35320)

# Send every optimisation step to the logger.
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [20]:
# Register one specific weight combination with the optimizer. These are the same
# values shown in the recorded `optimizer.max` output further down.
# NOTE(review): with lazy=True the point is presumably queued and only evaluated by the
# next maximize() call — confirm against the bayes_opt documentation.
optimizer.probe({'peso_baseline': 4.418722982360061,
  'peso_lgbm': 1.847955192063759,
  'peso_short1': 3.6931656858666138}, lazy=True)

In [None]:
# Run the search: 10 initial points followed by 3000 optimisation iterations.
# NOTE(review): every evaluation of funcion_a_optimizar runs an 8-fold cross-validation
# that fits four models per fold, so this cell is very expensive; it has no recorded
# execution count in this notebook.
optimizer.maximize(
    init_points=10,
    n_iter=3000,
)

In [17]:
# Best result registered by the optimizer (target score and the weights that produced it).
optimizer.max

{'target': 0.7604600000000001,
 'params': {'peso_baseline': 4.418722982360061,
  'peso_lgbm': 1.847955192063759,
  'peso_short1': 3.6931656858666138}}

In [18]:
def target(resultado):
    """Return the value stored under the ``'target'`` key of one result dict."""
    return resultado['target']

def mostrar(resultados, min=0):
    """Print the results whose target is not below ``min``, highest target first.

    For every kept result the target is printed, followed by one indented line per
    entry of its ``'params'`` dict and a blank line.

    Parameters
    ----------
    resultados : iterable of dict
        Each item must have a ``'target'`` value and a ``'params'`` dict. The input
        is read once and is not modified.
    min : number, default 0
        Results with ``target < min`` are left out. The name shadows the builtin
        ``min``; it is kept so existing keyword callers keep working.
    """
    # Single pass over the input, so iterators/generators work as well.
    # `not (t < min)` rather than `t >= min`: a target that does not compare as
    # smaller (e.g. NaN) is kept.
    resultados_importantes = sorted(
        (resultado for resultado in resultados if not (resultado['target'] < min)),
        key=target,
        reverse=True,
    )

    for resultado in resultados_importantes:
        print("target: " , resultado['target'])
        for key, value in resultado['params'].items():
            print("\t\t", key, ':', value)
        print()

In [None]:
# Print every result registered by the optimizer, best target first (default threshold 0).
mostrar(optimizer.res)