In [1]:
import pandas as pd 
import catboost as cb 
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from bayes_opt import BayesianOptimization
from bayes_opt import SequentialDomainReductionTransformer
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
from sklearn.metrics import f1_score
from sklearn.base import BaseEstimator

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the feature tables; every CSV is read with 'building_id' as its index.
train_values = pd.read_csv('train_values.csv', index_col='building_id')
train_values1 = pd.read_csv('train_values_short1.csv', index_col='building_id')
# Baseline feature set: the raw features passed through pd.get_dummies.
train_values_baseline = pd.get_dummies(train_values)
# "short1" feature set: the short1 table plus the three geo-level id columns
# taken from the raw features, matched on the shared index.
train_values_short1 = train_values1.merge(train_values[["geo_level_1_id", "geo_level_2_id", "geo_level_3_id"]],
                                            left_index=True, right_index=True)
# Feature tables consumed later by the CatBoost and LightGBM sub-models.
train_values_catboost = pd.read_csv("train_values_catboost.csv", index_col='building_id')
train_values_lgbm = pd.read_csv("train_values_lightGBM.csv", index_col='building_id')

# Labels: flattened to 1-D and encoded with LabelEncoder; `le` holds the fitted encoder.
train_labels = pd.read_csv('train_labels.csv', index_col='building_id')
le = LabelEncoder()
train_labels_encoded = le.fit_transform(y=train_labels.values.ravel())

In [4]:
# Every CatBoost column except 'min_geo_id' and 'max_geo_id' is treated as
# categorical. Those columns are cast to the pandas "category" dtype and renamed
# with a '_cat' suffix, because the other feature tables use the same column
# names for different data.
renombres_cat = {
    nombre: nombre + '_cat'
    for nombre in train_values_catboost.columns
    if nombre not in ('min_geo_id', 'max_geo_id')
}
# Names (after renaming) of the categorical columns, in column order.
categoricas = list(renombres_cat.values())
train_values_catboost = (
    train_values_catboost
    .astype({nombre: "category" for nombre in renombres_cat})
    .rename(columns=renombres_cat)
)

In [5]:
# Snapshot the column index of each feature table; the ensemble classes use
# these to select the right columns for each sub-model.
cols_baseline, cols_short1, cols_catboost, cols_lgbm = (
    tabla.columns
    for tabla in (
        train_values_baseline,
        train_values_short1,
        train_values_catboost,
        train_values_lgbm,
    )
)

In [6]:
# Store all the columns in the same DataFrame
# NOTE(review): a column name shared between tables is overwritten by the later
# assignment (short1, then catboost, then lgbm) — confirm that overlapping
# columns hold identical values in every table.
train_values_baseline.loc[:,cols_short1] = train_values_short1
train_values_baseline.loc[:,cols_catboost] = train_values_catboost
train_values_baseline.loc[:, cols_lgbm] = train_values_lgbm
# From here on `train_values` is the combined frame (same object as train_values_baseline).
train_values = train_values_baseline

In [16]:
class EncontradorPesos_Probas(ClassifierMixin, BaseEstimator):
    """Weighted soft-voting ensemble of two XGBoost models, CatBoost and LightGBM.

    Each sub-model is trained on its own column subset of ``X``, selected with
    the module-level ``cols_short1``, ``cols_baseline``, ``cols_catboost`` and
    ``cols_lgbm`` indexes. At prediction time the class-probability matrices of
    the four models are multiplied by their weights and summed; the class with
    the highest weighted sum wins.

    Parameters
    ----------
    peso_xgb_baseline : float
        Weight applied to the probabilities of the baseline XGBoost model.
    peso_xgb_short1 : float
        Weight applied to the probabilities of the "short1" XGBoost model.
    peso_catboost : float
        Weight applied to the CatBoost probabilities.
    peso_lgbm : float
        Weight applied to the LightGBM probabilities.
    categoricas_catboost : list
        Passed to ``CatBoostClassifier.fit`` as its third positional argument
        (the categorical feature names).
    """

    def __init__(self, peso_xgb_baseline, peso_xgb_short1, peso_catboost, peso_lgbm, categoricas_catboost):
        self.peso_xgb_baseline = peso_xgb_baseline
        self.xgb_baseline = XGBClassifier(random_state=2021, max_depth=10, n_estimators=273,
                                          verbosity=0, use_label_encoder=False)

        self.peso_xgb_short1 = peso_xgb_short1
        self.xgb_short1 = XGBClassifier(objective="multi:softmax", use_label_encoder=False, seed=30,
                                        max_depth=9,
                                        min_child_weight=6,
                                        gamma=0.2554138025988315,
                                        colsample_bytree=0.7773754946860542,
                                        subsample=0.9030471581301206,
                                        learning_rate=0.1,
                                        n_estimators=273,
                                        verbosity=0)

        self.peso_catboost = peso_catboost
        self.catboost = cb.CatBoostClassifier(loss_function='MultiClass', eval_metric='TotalF1:average=Micro',
                                              iterations=100, verbose=False)
        self.categoricas_catboost = categoricas_catboost

        # NOTE(review): objective='regression' on a classifier looks unintended — confirm.
        self.lgbm = LGBMClassifier(random_state=2021, num_iterations=273,
                                   num_leaves=70, objective='regression', colsample_bytree=0.77)
        self.peso_lgbm = peso_lgbm

    def fit(self, X, y):
        """Fit the four sub-models, each on its own column subset of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column in ``cols_short1``, ``cols_baseline``,
            ``cols_catboost`` and ``cols_lgbm``.
        y : array-like
            Target labels.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator API expects.
        """
        self.xgb_short1.fit(X[cols_short1], y)
        self.xgb_baseline.fit(X[cols_baseline], y)
        self.catboost.fit(X[cols_catboost], y, self.categoricas_catboost)
        self.lgbm.fit(X[cols_lgbm], y)
        # scikit-learn classifiers expose the labels seen during fit as `classes_`.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Predict by weighted soft voting.

        Returns
        -------
        numpy.ndarray
            For each row, the column index of the largest weighted probability
            sum. Ties resolve to the lowest index.
        """
        # Same evaluation and summation order as the sub-models were combined in
        # originally: short1, baseline, catboost, lgbm.
        probas_ponderadas = (
            self.xgb_short1.predict_proba(X[cols_short1]) * self.peso_xgb_short1
            + self.xgb_baseline.predict_proba(X[cols_baseline]) * self.peso_xgb_baseline
            + self.catboost.predict_proba(X[cols_catboost]) * self.peso_catboost
            + self.lgbm.predict_proba(X[cols_lgbm]) * self.peso_lgbm
        )
        # One vectorised argmax over the class axis replaces the per-row loop and
        # works for any number of classes.
        return np.argmax(probas_ponderadas, axis=1)

In [17]:
def funcion_a_optimizar(peso_baseline, peso_short1, peso_lgbm):
    """Objective for the Bayesian search over the soft-voting weights.

    Builds an ``EncontradorPesos_Probas`` ensemble with the given weights (the
    CatBoost weight is fixed at 1), scores it with 8-fold cross-validation on
    ``train_values`` / ``train_labels_encoded`` using micro-averaged F1, prints
    the score and weights, and returns the mean score.

    Parameters
    ----------
    peso_baseline : float
        Weight of the baseline XGBoost model.
    peso_short1 : float
        Weight of the "short1" XGBoost model.
    peso_lgbm : float
        Weight of the LightGBM model.

    Returns
    -------
    float
        Mean cross-validated micro-F1.
    """
    # The class is named EncontradorPesos_Probas (with underscore); the previous
    # spelling without it is not defined anywhere and raised NameError.
    model = EncontradorPesos_Probas(peso_xgb_baseline=peso_baseline,
                                    peso_xgb_short1=peso_short1, peso_catboost=1, peso_lgbm=peso_lgbm,
                                    categoricas_catboost=categoricas)

    score = cross_val_score(model, train_values, train_labels_encoded, scoring='f1_micro', cv=8).mean()
    print("Score: " + str(score))
    print("\t Peso baseline: " + str(peso_baseline))
    print("\t Peso short1: " + str(peso_short1))
    print("\t Peso lgbm: " + str(peso_lgbm))
    print()

    return score

In [18]:
# Search interval for each weight that funcion_a_optimizar takes.
limites = dict(peso_baseline=(4, 5), peso_short1=(3, 4), peso_lgbm=(1.5, 2.5))

In [19]:
# Bayesian optimiser over the weight bounds, seeded so the search is repeatable.
optimizer = BayesianOptimization(f=funcion_a_optimizar, pbounds=limites, random_state=35320)

# Record every optimisation step in a JSON log file.
logger = JSONLogger(path="./logs_ultima_carta1.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [20]:
# Queue a specific weight combination; with lazy=True it is evaluated when
# maximize() runs (it is the first point in the recorded output).
optimizer.probe({'peso_baseline': 4.418722982360061,
  'peso_lgbm': 1.847955192063759,
  'peso_short1': 3.6931656858666138}, lazy=True)

In [21]:
# Run the search. Every evaluation is a full 8-fold cross-validation of the
# four-model ensemble; the recorded run was stopped by hand (KeyboardInterrupt)
# after a few evaluations.
optimizer.maximize(
    init_points=10,
    n_iter=3000,
)

Score: 0.7611252446843237
	 Peso baseline: 4.418722982360061
	 Peso short1: 3.6931656858666138
	 Peso lgbm: 1.847955192063759

Score: 0.7612288520983388
	 Peso baseline: 4.219716801624352
	 Peso short1: 3.411887326897567
	 Peso lgbm: 2.39216802199134

Score: 0.7603501136771422
	 Peso baseline: 4.790657193713024
	 Peso short1: 3.0707311039383702
	 Peso lgbm: 1.7114425278466747

Score: 0.7611789681596572
	 Peso baseline: 4.345625576132316
	 Peso short1: 3.4096593183764643
	 Peso lgbm: 2.2495859005575944

Score: 0.760833612704436
	 Peso baseline: 4.639437892618754
	 Peso short1: 3.2094891833959354
	 Peso lgbm: 2.1159947105940367

Score: 0.760749190840768
	 Peso baseline: 4.668134148436347
	 Peso short1: 3.2409416665138666
	 Peso lgbm: 1.9119364092721693

Score: 0.7612518747116372
	 Peso baseline: 4.353867943211865
	 Peso short1: 3.923264751056289
	 Peso lgbm: 1.978399139968495

Score: 0.7612979219407534
	 Peso baseline: 4.073835832891402
	 Peso short1: 3.9708730769907343
	 Peso lgbm: 1.59

KeyboardInterrupt: 

In [22]:
# Best score registered so far and the weights that produced it.
optimizer.max

{'target': 0.7612979219407534,
 'params': {'peso_baseline': 4.073835832891402,
  'peso_lgbm': 1.5910896220676003,
  'peso_short1': 3.9708730769907343}}

In [24]:
# Same experiment, but without the probabilities (using the damage grade each model predicts)
class EncontradorPesos(ClassifierMixin, BaseEstimator):
    """Weighted label-averaging ensemble of two XGBoost models, CatBoost and LightGBM.

    Each sub-model is trained on its own column subset of ``X``, selected with
    the module-level ``cols_short1``, ``cols_baseline``, ``cols_catboost`` and
    ``cols_lgbm`` indexes. At prediction time the label predicted by each model
    is multiplied by that model's weight, the weighted labels are averaged, and
    the average is rounded to the nearest integer. This treats the labels as
    ordered numeric values.

    Parameters
    ----------
    peso_xgb_baseline : float
        Weight of the baseline XGBoost prediction.
    peso_xgb_short1 : float
        Weight of the "short1" XGBoost prediction.
    peso_catboost : float
        Weight of the CatBoost prediction.
    peso_lgbm : float
        Weight of the LightGBM prediction.
    categoricas_catboost : list
        Passed to ``CatBoostClassifier.fit`` as its third positional argument
        (the categorical feature names).
    """

    def __init__(self, peso_xgb_baseline, peso_xgb_short1, peso_catboost, peso_lgbm, categoricas_catboost):
        self.peso_xgb_baseline = peso_xgb_baseline
        self.xgb_baseline = XGBClassifier(random_state=2021, max_depth=10, n_estimators=273,
                                          verbosity=0, use_label_encoder=False)

        self.peso_xgb_short1 = peso_xgb_short1
        self.xgb_short1 = XGBClassifier(objective="multi:softmax", use_label_encoder=False, seed=30,
                                        max_depth=9,
                                        min_child_weight=6,
                                        gamma=0.2554138025988315,
                                        colsample_bytree=0.7773754946860542,
                                        subsample=0.9030471581301206,
                                        learning_rate=0.1,
                                        n_estimators=273,
                                        verbosity=0)

        self.peso_catboost = peso_catboost
        self.catboost = cb.CatBoostClassifier(loss_function='MultiClass', eval_metric='TotalF1:average=Micro',
                                              iterations=100, verbose=False, random_seed=2021)
        self.categoricas_catboost = categoricas_catboost

        # NOTE(review): objective='regression' on a classifier looks unintended — confirm.
        self.lgbm = LGBMClassifier(random_state=2021, num_iterations=273,
                                   num_leaves=70, objective='regression', colsample_bytree=0.77)
        self.peso_lgbm = peso_lgbm

    def fit(self, X, y):
        """Fit the four sub-models, each on its own column subset of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column in ``cols_short1``, ``cols_baseline``,
            ``cols_catboost`` and ``cols_lgbm``.
        y : array-like
            Target labels.

        Returns
        -------
        self
            The fitted estimator, as the scikit-learn estimator API expects.
        """
        self.xgb_short1.fit(X[cols_short1], y)
        self.xgb_baseline.fit(X[cols_baseline], y)
        self.catboost.fit(X[cols_catboost], y, self.categoricas_catboost)
        self.lgbm.fit(X[cols_lgbm], y)
        # scikit-learn classifiers expose the labels seen during fit as `classes_`.
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        """Predict the rounded weighted average of the sub-models' labels.

        Returns
        -------
        numpy.ndarray
            Integer labels, one per row of ``X``.
        """
        preds_xgb_short1 = self.xgb_short1.predict(X[cols_short1]) * self.peso_xgb_short1
        preds_xgb_baseline = self.xgb_baseline.predict(X[cols_baseline]) * self.peso_xgb_baseline
        # CatBoost's prediction is indexed as one single-element row per sample;
        # ravel() flattens it to 1-D so it lines up with the other models' output.
        preds_catboost = np.asarray(self.catboost.predict(X[cols_catboost])).ravel() * self.peso_catboost
        preds_lgbm = self.lgbm.predict(X[cols_lgbm]) * self.peso_lgbm

        peso_total = (self.peso_xgb_baseline + self.peso_xgb_short1
                      + self.peso_catboost + self.peso_lgbm)
        promedio = (preds_xgb_short1 + preds_xgb_baseline + preds_catboost + preds_lgbm) / peso_total
        # Round to the nearest label and return integers rather than floats.
        return np.round(promedio).astype(int)

In [25]:
def funcion_a_optimizar1(peso_baseline, peso_short1, peso_lgbm, peso_catboost):
    """Objective for the Bayesian search over the label-averaging weights.

    Builds an ``EncontradorPesos`` ensemble with the four given weights, scores
    it with 8-fold cross-validation (micro-averaged F1) on ``train_values`` /
    ``train_labels_encoded``, prints the score and the weights, and returns the
    mean score.
    """
    ensamble = EncontradorPesos(
        peso_xgb_baseline=peso_baseline,
        peso_xgb_short1=peso_short1,
        peso_catboost=peso_catboost,
        peso_lgbm=peso_lgbm,
        categoricas_catboost=categoricas,
    )

    puntajes = cross_val_score(ensamble, train_values, train_labels_encoded, scoring='f1_micro', cv=8)
    score = puntajes.mean()

    # Report the score followed by each weight, one per line.
    for rotulo, valor in (
        ("Score: ", score),
        ("\t Peso baseline: ", peso_baseline),
        ("\t Peso short1: ", peso_short1),
        ("\t Peso lgbm: ", peso_lgbm),
        ("\t Peso catboost: ", peso_catboost),
    ):
        print(rotulo + str(valor))
    print()

    return score

In [26]:
# Search interval for each of the four weights that funcion_a_optimizar1 takes.
limites = dict(
    peso_baseline=(3, 5),
    peso_short1=(2.5, 4.5),
    peso_lgbm=(1, 3),
    peso_catboost=(0.5, 2),
)

In [40]:
# New optimiser for the label-averaging ensemble, seeded so the search is repeatable.
optimizer = BayesianOptimization(f=funcion_a_optimizar1, pbounds=limites, random_state=54300)

# Load the points logged by an earlier run, then record new steps to a second file.
load_logs(optimizer, "./logs/Encontrar pesos/Sin Probas/logs_ultima_carta_sin_probas1.json")
logger = JSONLogger(path="./logs/Encontrar pesos/Sin Probas/logs_ultima_carta_sin_probas2.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [28]:
# Queue a specific weight combination for evaluation (lazy=True defers it to maximize()).
# NOTE(review): this cell's execution count (28) is lower than that of the cell
# that creates the optimizer (40) — confirm it was meant for the current optimizer.
optimizer.probe({"peso_baseline": 3.353088965342895, "peso_catboost": 1.0052309329869793,
                "peso_lgbm": 1.3465747679887474, "peso_short1": 4.237636212714062}, lazy=True)

In [42]:
# Queue a point whose peso_baseline is 4.2376... and peso_short1 is 3.3530...,
# keeping peso_catboost and peso_lgbm at the values shown.
optimizer.probe({"peso_baseline":4.237636212714062, "peso_catboost": 1.0052309329869793,
                "peso_lgbm": 1.3465747679887474, "peso_short1": 3.353088965342895}, lazy=True)

In [43]:
# Run the search. Every evaluation is a full 8-fold cross-validation of the
# four-model ensemble; the recorded run was stopped by hand (KeyboardInterrupt)
# after one evaluation.
optimizer.maximize(
    init_points=10,
    n_iter=3000,
)

Score: 0.7607952364207503
	 Peso baseline: 4.237636212714062
	 Peso short1: 3.353088965342895
	 Peso lgbm: 1.3465747679887474
	 Peso catboost: 1.0052309329869793



KeyboardInterrupt: 

In [45]:
# Best score registered so far and the weights that produced it.
optimizer.max

{'target': 0.7616471152834927,
 'params': {'peso_baseline': 3.353088965342895,
  'peso_catboost': 1.0052309329869793,
  'peso_lgbm': 1.3465747679887474,
  'peso_short1': 4.237636212714062}}

In [None]:
# Final ensemble built with the best weights reported by `optimizer.max`
# (target 0.7616471152834927). The weights are written out literally so this
# cell does not depend on names that only exist as function parameters.
model = EncontradorPesos(peso_xgb_baseline=3.353088965342895,
                         peso_xgb_short1=4.237636212714062,
                         peso_catboost=1.0052309329869793,
                         peso_lgbm=1.3465747679887474,
                         categoricas_catboost=categoricas)