In [1]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, precision_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import randint, uniform
import pandas as pd

class RandomForest:
    def __init__(self, n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, 
                 max_features, bootstrap, learning_rate, subsample, colsample_bytree, num_leaves, gamma):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.random_state = 42
        self.stacking_model = None  
        self.predictions = []
        self.learning_rate = learning_rate  
        self.subsample = subsample  
        self.colsample_bytree = colsample_bytree  
        self.num_leaves = num_leaves
        self.gamma = gamma

    def fit(self, X, y):        
        n_estimators_per_model = self.n_estimators // 3
        # Definir el clasificador de nivel 1
        estimators = [
            ('xgb', XGBClassifier(n_estimators=n_estimators_per_model, max_depth=self.max_depth, random_state=self.random_state,
                                  learning_rate=self.learning_rate, subsample=self.subsample, colsample_bytree=self.colsample_bytree,
                                  gamma=self.gamma)),
            ('lgbm', LGBMClassifier(n_estimators=n_estimators_per_model, max_depth=self.max_depth, random_state=self.random_state,
                                    learning_rate=self.learning_rate, subsample=self.subsample, colsample_bytree=self.colsample_bytree,
                                    num_leaves=self.num_leaves)),
            ('extra_trees', ExtraTreesClassifier(n_estimators=n_estimators_per_model, criterion=self.criterion,
                                                 max_depth=self.max_depth, min_samples_split=self.min_samples_split,
                                                 min_samples_leaf=self.min_samples_leaf, max_features=self.max_features,
                                                 bootstrap=self.bootstrap, random_state=self.random_state))]
            
        # Definir el clasificador de nivel 2 (modelo base)
        rf_base = RandomForestClassifier(n_estimators=self.n_estimators, random_state=self.random_state)
        
        # Crear el modelo de stacking
        self.stacking_model = StackingClassifier(estimators=estimators, final_estimator=rf_base)
        self.stacking_model.fit(X, y)

    def predict(self, X):
        return self.stacking_model.predict(X)
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'num_leaves': self.num_leaves,
            'gamma': self.gamma
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

class RandomForestClassifierWrapper:
    def __init__(self, X, y, n_estimators, criterion, max_depth, min_samples_split,
                 min_samples_leaf, max_features, bootstrap, learning_rate, subsample, colsample_bytree, num_leaves, gamma):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.X_train = self.X_train.values
        self.y_train = self.y_train.values
        
        self.rf = RandomForest(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                max_features=max_features, bootstrap=bootstrap,
                                learning_rate=learning_rate, subsample=subsample,
                                colsample_bytree=colsample_bytree, num_leaves=num_leaves, gamma=gamma)
        self.estimators = []

    def train(self):
        self.rf.fit(self.X_train, self.y_train)
        
    def predict(self):
        return self.rf.predict(self.X_test)

    def evaluate(self):
        y_pred = self.predict()
        
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred, average='macro')
        recall = recall_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1

def custom_scoring(y_true, y_pred):
    return precision_score(y_true, y_pred, average='macro')

# Leer datos
df = pd.read_excel("C:\\Users\\klgt1\\Downloads\\dataset_BALANCEADO.xlsx")
X = df.drop('CONDUCTA', axis=1)
y = df['CONDUCTA']

# Definir el espacio de búsqueda de hiperparámetros
param_dist = {
    'n_estimators': randint(20, 100),
    'criterion': ['gini', 'entropy'],
    'max_depth': randint(3, 20),
    'min_samples_split': randint(1, 6),
    'min_samples_leaf': randint(1, 6),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'learning_rate': uniform(0.001, 0.5),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'num_leaves': randint(2, 20),
    'gamma': uniform(0, 5)
}

# Crear instancia del clasificador
rf_wrapper = RandomForestClassifierWrapper(X, y, n_estimators=100, criterion='gini', max_depth=10, min_samples_split=2,
                                           min_samples_leaf=1, max_features=None, bootstrap=True, learning_rate=0.1,
                                           subsample=0.8, colsample_bytree=0.8, num_leaves=31, gamma=0)
# Entrenar el modelo
rf_wrapper.train()

# Crear instancia de RandomizedSearchCV
random_search = RandomizedSearchCV(rf_wrapper.rf, param_distributions=param_dist, n_iter=100, cv=5, scoring='precision_macro', verbose=2, n_jobs=-1)

# Entrenar el modelo
random_search.fit(X, y)

# Imprimir resultados
print("Mejores hiperparámetros:")
print(random_search.best_params_)
print("Mejor score: ", random_search.best_score_)


found 0 physical cores < 1
  File "C:\Users\klgt1\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Number of positive: 906, number of negative: 1115
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000215 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 559
[LightGBM] [Info] Number of data points in the train set: 2021, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448293 -> initscore=-0.207570
[LightGBM] [Info] Start training from score -0.207570
[LightGBM] [Info] Number of positive: 724, number of negative: 892
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 554
[LightGBM] [Info] Number of data points in the train set: 1616, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448020 -> initscore=-0.208675
[LightGBM] [I

90 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\klgt1\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\klgt1\AppData\Local\Temp\ipykernel_17204\1834542126.py", line 49, in fit
  File "C:\Users\klgt1\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\ensemble\_stacking.py", line 669, in fit
    return super().fit(X, y_encoded, sample_weight)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\klgt1\AppData\Local\Programs\Python\Python31

[LightGBM] [Info] Number of positive: 1123, number of negative: 1404
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 556
[LightGBM] [Info] Number of data points in the train set: 2527, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.444400 -> initscore=-0.223322
[LightGBM] [Info] Start training from score -0.223322
[LightGBM] [Info] Number of positive: 898, number of negative: 1123
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002565 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 562
[LightGBM] [Info] Number of data points in the train set: 2021, number of used features: 32
[LightGBM] [Info] [binary:B

# Nuestro modelo con búsqueda exhaustiva para selección de parámetros

In [None]:
!pip uninstall xgboost
!pip install -U xgboost
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

class RandomForest:
    def __init__(self, n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, 
                 max_features, bootstrap, learning_rate, subsample, colsample_bytree, num_leaves, gamma,
                 objective='binary', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_weight_fraction_leaf=0.0, min_child_samples=20, feature_fraction=0.6):
        self.n_estimators = n_estimators
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.random_state = 42
        self.stacking_model = None  
        self.predictions = []
        self.learning_rate = learning_rate  
        self.subsample = subsample  
        self.colsample_bytree = colsample_bytree  
        self.num_leaves = num_leaves
        self.gamma = gamma
        self.objective = objective
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.min_child_samples = min_child_samples
        self.feature_fraction = feature_fraction

    def fit(self, X, y):        
        n_estimators_per_model = self.n_estimators // 3
        # Definir el clasificador de nivel 1
        estimators = [
            ('xgb', XGBClassifier(n_estimators=n_estimators_per_model, max_depth=self.max_depth, random_state=self.random_state,
                                  learning_rate=self.learning_rate, subsample=self.subsample, colsample_bytree=self.colsample_bytree,
                                  gamma=self.gamma, objective=self.objective, max_leaf_nodes=self.max_leaf_nodes,
                                  min_impurity_decrease=self.min_impurity_decrease, min_weight_fraction_leaf=self.min_weight_fraction_leaf,
                                  min_child_samples=self.min_child_samples, feature_fraction=self.feature_fraction)),
            ('lgbm', LGBMClassifier(n_estimators=n_estimators_per_model, max_depth=self.max_depth, random_state=self.random_state,
                                    learning_rate=self.learning_rate, subsample=self.subsample, colsample_bytree=self.colsample_bytree,
                                    num_leaves=self.num_leaves, max_leaf_nodes=self.max_leaf_nodes,
                                    min_child_samples=self.min_child_samples, feature_fraction=self.feature_fraction)),
            ('extra_trees', ExtraTreesClassifier(n_estimators=n_estimators_per_model, criterion=self.criterion,
                                                 max_depth=self.max_depth, min_samples_split=self.min_samples_split,
                                                 min_samples_leaf=self.min_samples_leaf, max_features=self.max_features,
                                                 bootstrap=self.bootstrap, random_state=self.random_state))]
            
        # Definir el clasificador de nivel 2 (modelo base)
        rf_base = RandomForestClassifier(n_estimators=self.n_estimators, random_state=self.random_state)
        
        # Crear el modelo de stacking
        self.stacking_model = StackingClassifier(estimators=estimators, final_estimator=rf_base)
        self.stacking_model.fit(X, y)

    def predict(self, X):
        return self.stacking_model.predict(X)
    def get_params(self, deep=True):
        return {
            'n_estimators': self.n_estimators,
            'criterion': self.criterion,
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
            'min_samples_leaf': self.min_samples_leaf,
            'max_features': self.max_features,
            'bootstrap': self.bootstrap,
            'learning_rate': self.learning_rate,
            'subsample': self.subsample,
            'colsample_bytree': self.colsample_bytree,
            'num_leaves': self.num_leaves,
            'gamma': self.gamma,
            'objective': self.objective,
            'max_leaf_nodes': self.max_leaf_nodes,
            'min_impurity_decrease': self.min_impurity_decrease,
            'min_weight_fraction_leaf': self.min_weight_fraction_leaf,
            'min_child_samples': self.min_child_samples,
            'feature_fraction': self.feature_fraction
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        return self

class RandomForestClassifierWrapper:
    def __init__(self, X, y, n_estimators, criterion, max_depth, min_samples_split,
                 min_samples_leaf, max_features, bootstrap, learning_rate, subsample, colsample_bytree, num_leaves, gamma,
                 objective='binary', max_leaf_nodes=None, min_impurity_decrease=0.0,
                 min_weight_fraction_leaf=0.0, min_child_samples=20, feature_fraction=0.6):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.X_train = self.X_train.values
        self.y_train = self.y_train.values
        
        self.rf = RandomForest(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth,
                                min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
                                max_features=max_features, bootstrap=bootstrap,
                                learning_rate=learning_rate, subsample=subsample,
                                colsample_bytree=colsample_bytree, num_leaves=num_leaves, gamma=gamma,
                                objective=objective, max_leaf_nodes=max_leaf_nodes,
                                min_impurity_decrease=min_impurity_decrease, min_weight_fraction_leaf=min_weight_fraction_leaf,
                                min_child_samples=min_child_samples, feature_fraction=feature_fraction)
        self.estimators = []

    def train(self):
        self.rf.fit(self.X_train, self.y_train)
        
    def predict(self):
        return self.rf.predict(self.X_test)

    def evaluate(self):
        y_pred = self.predict()
        
        accuracy = accuracy_score(self.y_test, y_pred)
        precision = precision_score(self.y_test, y_pred, average='macro')
        recall = recall_score(self.y_test, y_pred, average='macro')
        f1 = f1_score(self.y_test, y_pred, average='macro')
        return accuracy, precision, recall, f1

# Leer datos
df = pd.read_excel("C:\\Users\\klgt1\\Downloads\\dataset_BALANCEADO.xlsx")
X = df.drop('CONDUCTA', axis=1)
y = df['CONDUCTA']

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'n_estimators': [50],
    'criterion': ['entropy'],
    'max_depth': [20],
    'min_samples_split': [4,],
    'min_samples_leaf': [1],
    'max_features': [0.25],
    'bootstrap': [True],
    'learning_rate': [0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [1.0],
    'num_leaves': [31],
    'gamma': [1],
    'metric':['logloss'], #FALTA
    'objective':['binary:logistic'],
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_weight_fraction_leaf': 0.0,
    'min_child_samples': 20,
    'feature_fraction': 0.6,
}
# Crear instancia del clasificador
rf_wrapper = RandomForestClassifierWrapper(X, y, n_estimators=100, criterion='gini', max_depth=10, min_samples_split=2,
                                           min_samples_leaf=1, max_features=None, bootstrap=True, learning_rate=0.1,
                                           subsample=0.8, colsample_bytree=0.8, num_leaves=31, gamma=0)

# Entrenar el modelo
rf_wrapper.train()

# Crear instancia de GridSearchCV
grid_search = GridSearchCV(rf_wrapper.rf, param_grid=param_grid, cv=5, scoring='precision_macro', verbose=2, n_jobs=-1)

# Entrenar el modelo
grid_search.fit(X, y)

# Imprimir resultados
print("Mejores hiperparámetros:")
print(grid_search.best_params_)
print("Mejor score: ", grid_search.best_score_)
