In [None]:
!pip install numpy==1.24.4 catboost==1.2
!pip install kneed

In [None]:
!pip install xgboost

In [None]:
!pip install lightgbm

In [None]:
!pip install ngboost

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import copy

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import warnings



from catboost import CatBoostRegressor
from kneed import KneeLocator

# Inicializando os dados

In [None]:
# Load the RefractiveIndex_clean.csv dataset into `df`.
# NOTE(review): absolute, machine-specific path — this only runs on the original
# author's machine; consider a configurable data directory.
df = pd.read_csv('/Users/joao altarugio/Desktop/Projeto LaMav/data/Really important data/RefractiveIndex_clean.csv')
# Bare expression: the loaded frame is shown as the cell output.
df

In [None]:
# Keep only the columns that contain at least one value different from 0
# (columns made up entirely of zeros are dropped).
df = df.loc[:, (df != 0).any(axis=0)]

In [None]:
# Show pandas' summary statistics for the filtered frame.
df.describe()

# Modelo 3 boostings

# CV - Todos 
- Use esta seção se quiser rodar todos os tunes juntos. O código abaixo não está 100% otimizado em termos de paralelização, ao contrário dos tunes individuais.

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from multiprocessing import Pool
import warnings
from joblib import Parallel, delayed

def parallel_evaluate_objectives(param_jobs, n_jobs):
    """Evaluate every job tuple with ``objective_global`` through joblib.

    Args:
        param_jobs: Iterable of job tuples; each one is passed unchanged to
            ``objective_global``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.

    Returns:
        The list returned by ``joblib.Parallel``: one ``objective_global``
        result per job.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(objective_global)(job) for job in param_jobs)
    return runner(tasks)


warnings.filterwarnings("ignore")  # silence every warning from here on
def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is a dict that is
            expanded into keyword arguments.

    Returns:
        ``(score, param)`` where ``score`` is what the objective returned.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs


#--- Métrica RRMSE ---
def rrmse(y_true, y_pred):
    """Relative RMSE: RMSE of the residuals divided by ``np.std(y_true)``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, subtracted element-wise from ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2)) / np.std(y_true)``.
    """
    residuals = y_true - y_pred
    root_mse = np.sqrt(np.mean(residuals ** 2))
    return root_mse / np.std(y_true)

#--- RandomSearch Customizado ---
class RandomSearch:
    """Random search over a discrete hyperparameter grid.

    Each hyperparameter is drawn independently from its list of candidates
    with ``numpy.random.Generator.choice``.
    """

    def __init__(self, param_space, max_iter=2, n_jobs=2, random_state=None):
        """Store the search configuration.

        Args:
            param_space: Dict mapping a hyperparameter name to its candidates.
            max_iter: Number of random configurations drawn by ``fmin``.
            n_jobs: Process count for ``fmin``; values above 1 use a
                ``multiprocessing.Pool``.
            random_state: Seed forwarded to ``numpy.random.default_rng``.
        """
        self.param_space = param_space
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.rng = np.random.default_rng(random_state)

    def sample_params(self):
        """Return a dict with one randomly chosen value per hyperparameter."""
        drawn = {}
        for name, candidates in self.param_space.items():
            drawn[name] = self.rng.choice(candidates)
        return drawn

    def fmin(self, objective, **kwargs):
        """Evaluate ``max_iter`` random configurations and return the best.

        Args:
            objective: Callable invoked as ``objective(**params)``; lower
                return values are better.
            **kwargs: Accepted but not used.

        Returns:
            ``(score, params)`` of the configuration with the lowest score.
        """
        samples = [self.sample_params() for _ in range(self.max_iter)]
        jobs = [(objective, sample) for sample in samples]

        if self.n_jobs > 1:
            with Pool(self.n_jobs) as pool:
                outcomes = pool.map(evaluate_param_objective, jobs)
        else:
            outcomes = list(map(evaluate_param_objective, jobs))

        # Ascending by score: the first entry has the lowest error.
        ranked = sorted(outcomes, key=lambda pair: pair[0])
        return ranked[0]


#--- Parâmetros de busca para cada modelo ---

num = 10  # Number of points generated for the linspace-based grids

# Candidate values per hyperparameter, keyed by model name. Each inner dict is a
# discrete grid from which RandomSearch.sample_params draws one value per key.
param_spaces = {
    "CatBoost": {
        "iterations": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "depth": list(range(1, 17)),  # [1, ..., 16]
        "l2_leaf_reg": np.round(np.linspace(0.0, 7.0, num=num), 4).tolist(),
        "random_strength": np.round(np.linspace(0.0, 1.0, num=num), 4).tolist(),
        "bagging_temperature": np.round(np.linspace(0.0, 1.5, num=num), 4).tolist(),
        "border_count": list(range(128, 255, 14)),
    },
    "XGBoost": {
        "n_estimators": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": list(range(3, 17, 2)),  # depths 1-2 are generally not used with XGBoost
        "subsample": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "colsample_bytree": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "reg_lambda": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
        "reg_alpha": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
    },
    "LightGBM": {
        "n_estimators": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": list(range(3, 17, 2)),
        "num_leaves": list(range(20, 130, 11)),  # [20, 31, 42, ..., 119]
        # NOTE(review): LightGBM only applies subsample when subsample_freq > 0,
        # which is not set here — confirm whether row bagging is intended.
        "subsample": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "colsample_bytree": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "reg_lambda": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
        "reg_alpha": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
    },
    "HistGradientBoosting": {
        "max_iter": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": [None] + list(range(3, 17, 2)),  # None is included to allow unlimited depth
        "l2_regularization": np.round(np.linspace(0.0, 5.0, num=num), 4).tolist(),
        "max_bins": list(range(128, 255, 14)),  # same grid as the CatBoost border_count
        "min_samples_leaf": list(range(10, 51, 5)),  # overfitting control
    },
}


#--- Modelos ---
def get_model(name, params):
    """Build an unfitted regressor for ``name`` configured with ``params``.

    Args:
        name: One of "XGBoost", "HistGradientBoosting", "LightGBM", "CatBoost".
        params: Dict of keyword arguments forwarded to the estimator.

    Returns:
        A new estimator instance.

    Raises:
        ValueError: If ``name`` is not one of the supported models.
    """
    if name == "XGBoost":
        return XGBRegressor(objective="reg:squarederror", n_jobs=-1, **params)
    if name == "HistGradientBoosting":
        return HistGradientBoostingRegressor(**params)
    if name == "LightGBM":
        return LGBMRegressor(n_jobs=-1, **params)
    if name == "CatBoost":
        return CatBoostRegressor(verbose=0, thread_count=-1, **params)
    raise ValueError("Modelo desconhecido")

#--- Tuning com validação cruzada ---
def objective(params, model_name, X, y, kf):
    """Mean cross-validated RRMSE of one hyperparameter configuration.

    Args:
        params: Hyperparameter dict passed to ``get_model``.
        model_name: Model identifier passed to ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        kf: Splitter whose ``split(X)`` yields (train, validation) indices.

    Returns:
        The mean of the per-fold ``rrmse`` values.
    """
    fold_scores = []
    for fit_rows, eval_rows in kf.split(X):
        # A fresh estimator per fold, trained only on that fold's training rows.
        estimator = get_model(model_name, params)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = estimator.predict(X.iloc[eval_rows])
        fold_scores.append(rrmse(y.iloc[eval_rows], predicted))
    return np.mean(fold_scores)

def objective_global(params_model_data):
    """Single-argument adapter around ``objective`` for parallel dispatch.

    Args:
        params_model_data: Tuple ``(params, model_name, X, y, kf)``.

    Returns:
        ``(score, params)`` where ``score`` is the value of ``objective``.
    """
    hyperparams, name, features, target, folds = params_model_data
    score = objective(hyperparams, name, features, target, folds)
    return score, hyperparams

def tune_model(model_name, X, y):
    """Random-search one model and return its out-of-fold predictions.

    Args:
        model_name: Key into ``param_spaces`` and identifier for ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.

    Returns:
        ``(resultados, best_params, best_score)``: a Series indexed like ``X``
        holding the out-of-fold predictions made with the best configuration,
        that configuration, and its score from ``objective_global``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    resultados = pd.Series(index=X.index, dtype=float)

    search = RandomSearch(param_spaces[model_name], max_iter=2, n_jobs=2)

    # Each job bundles one sampled configuration with the fixed data and splitter.
    param_jobs = [
        (search.sample_params(), model_name, X, y, kf)
        for _ in range(search.max_iter)
    ]

    if search.n_jobs > 1:
        evaluated = parallel_evaluate_objectives(param_jobs, search.n_jobs)
    else:
        evaluated = list(map(objective_global, param_jobs))

    # Ascending by score: the first entry is the lowest error.
    best_score, best_params = sorted(evaluated, key=lambda item: item[0])[0]

    # Refit the best configuration on each fold to fill the out-of-fold predictions.
    for train_idx, val_idx in kf.split(X):
        model = get_model(model_name, best_params)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        resultados.iloc[val_idx] = model.predict(X.iloc[val_idx])

    return resultados, best_params, best_score



#--- Executar tuning para todos os modelos ---
X = df.drop(columns="RefractiveIndex")  # features: every column except the target
y = df["RefractiveIndex"]  # target column
resultados = pd.DataFrame(index=df.index)  # one out-of-fold prediction column per model

modelos = ["XGBoost", "HistGradientBoosting", "LightGBM", "CatBoost"]

# Tune each model in turn, store its predictions and report the best configuration.
for nome in modelos:
    print(f"Tunando {nome}...")
    preds, best_params, score = tune_model(nome, X, y)
    resultados[nome + "_pred"] = preds
    print(f"Melhores parâmetros de {nome}: {best_params}")
    print(f"RRMSE: {score:.5f}\n")


# XGBoost

In [None]:
def parallel_evaluate_objectives(param_jobs, n_jobs):
    """Evaluate every job tuple with ``objective_global`` through joblib.

    Args:
        param_jobs: Iterable of job tuples; each one is passed unchanged to
            ``objective_global``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.

    Returns:
        The list returned by ``joblib.Parallel``: one ``objective_global``
        result per job.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(objective_global)(job) for job in param_jobs)
    return runner(tasks)


warnings.filterwarnings("ignore")  # silence every warning from here on
def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is a dict that is
            expanded into keyword arguments.

    Returns:
        ``(score, param)`` where ``score`` is what the objective returned.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs


# --- Métrica RRMSE ---
def rrmse(y_true, y_pred):
    """Relative RMSE: RMSE of the residuals divided by ``np.std(y_true)``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, subtracted element-wise from ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2)) / np.std(y_true)``.
    """
    residuals = y_true - y_pred
    root_mse = np.sqrt(np.mean(residuals ** 2))
    return root_mse / np.std(y_true)

# --- RandomSearch Customizado ---
from joblib import Parallel, delayed

def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    This redefines the identically named helper that appears earlier in this
    cell; this later definition is the one in effect afterwards.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is expanded into
            keyword arguments.

    Returns:
        ``(score, param)``.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs

class RandomSearch:
    """Random search over a discrete hyperparameter grid, evaluated with joblib.

    Each hyperparameter is drawn independently from its list of candidates
    with ``numpy.random.Generator.choice``.
    """

    def __init__(self, param_space, max_iter=200, n_jobs=-1, random_state=None):
        """Store the search configuration.

        Args:
            param_space: Dict mapping a hyperparameter name to its candidates.
            max_iter: Number of random configurations drawn by ``fmin``.
            n_jobs: Worker count forwarded to ``joblib.Parallel``. ``1`` or
                ``None`` evaluates serially in the current process.
            random_state: Seed forwarded to ``numpy.random.default_rng``.
        """
        self.param_space = param_space
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.rng = np.random.default_rng(random_state)

    def _parallel_enabled(self):
        """Return True when evaluations should be dispatched through joblib.

        joblib reads negative ``n_jobs`` values (such as the default -1) as a
        request for parallel workers, so only ``1`` and ``None`` mean serial.
        """
        return self.n_jobs is not None and self.n_jobs != 1

    def sample_params(self):
        """Return a dict with one randomly chosen value per hyperparameter."""
        return {
            key: self.rng.choice(values) for key, values in self.param_space.items()
        }

    def fmin(self, objective, **kwargs):
        """Evaluate ``max_iter`` random configurations and return the best.

        Args:
            objective: Callable invoked as ``objective(**params)``; lower
                return values are better.
            **kwargs: Accepted but not used.

        Returns:
            ``(score, params)`` of the configuration with the lowest score.
        """
        param_list = [self.sample_params() for _ in range(self.max_iter)]
        param_jobs = [(objective, p) for p in param_list]

        if self._parallel_enabled():
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(evaluate_param_objective)(job) for job in param_jobs
            )
        else:
            results = [evaluate_param_objective(job) for job in param_jobs]

        # Ascending by score: the first entry has the lowest error.
        results = sorted(results, key=lambda x: x[0])
        return results[0]



# --- Parâmetros de busca para cada modelo ---

num = 100  # Number of points generated for the linspace-based grids

# Candidate values per hyperparameter, keyed by model name. Each inner dict is a
# discrete grid from which RandomSearch.sample_params draws one value per key.
param_spaces = {
    "XGBoost": {
        "n_estimators": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": list(range(3, 17, 2)),  # depths 1-2 are generally not used with XGBoost
        "subsample": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "colsample_bytree": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "reg_lambda": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
        "reg_alpha": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
    }
}


# --- Modelos ---
def get_model(name, params):
    """Build an unfitted XGBoost regressor configured with ``params``.

    Args:
        name: Must be "XGBoost".
        params: Dict of keyword arguments forwarded to ``XGBRegressor``.

    Returns:
        A new ``XGBRegressor`` instance.

    Raises:
        ValueError: If ``name`` is anything other than "XGBoost".
    """
    if name != "XGBoost":
        raise ValueError("Modelo desconhecido")
    return XGBRegressor(objective="reg:squarederror", tree_method="hist", n_jobs=-1, **params)

# --- Tuning com validação cruzada ---
def objective(params, model_name, X, y, kf):
    """Mean cross-validated RRMSE of one hyperparameter configuration.

    Args:
        params: Hyperparameter dict passed to ``get_model``.
        model_name: Model identifier passed to ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        kf: Splitter whose ``split(X)`` yields (train, validation) indices.

    Returns:
        The mean of the per-fold ``rrmse`` values.
    """
    fold_scores = []
    for fit_rows, eval_rows in kf.split(X):
        # A fresh estimator per fold, trained only on that fold's training rows.
        estimator = get_model(model_name, params)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = estimator.predict(X.iloc[eval_rows])
        fold_scores.append(rrmse(y.iloc[eval_rows], predicted))
    return np.mean(fold_scores)

def objective_global(params_model_data):
    """Single-argument adapter around ``objective`` for parallel dispatch.

    Args:
        params_model_data: Tuple ``(params, model_name, X, y, kf)``.

    Returns:
        ``(score, params)`` where ``score`` is the value of ``objective``.
    """
    hyperparams, name, features, target, folds = params_model_data
    score = objective(hyperparams, name, features, target, folds)
    return score, hyperparams

def tune_model(model_name, X, y, random_state=None):
    """Random-search one model and return its out-of-fold predictions.

    Args:
        model_name: Key into ``param_spaces`` and identifier for ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        random_state: Optional seed for the hyperparameter sampling. The
            default ``None`` keeps the sampling unseeded.

    Returns:
        ``(resultados, best_params, best_score)``: a Series indexed like ``X``
        holding the out-of-fold predictions made with the best configuration,
        that configuration, and its score from ``objective_global``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    resultados = pd.Series(index=X.index, dtype=float)

    search = RandomSearch(
        param_spaces[model_name], max_iter=200, n_jobs=-1, random_state=random_state
    )

    # Each job bundles one sampled configuration with the fixed data and splitter.
    param_list = [search.sample_params() for _ in range(search.max_iter)]
    param_jobs = [(p, model_name, X, y, kf) for p in param_list]

    # joblib reads negative n_jobs (here -1) as a request for parallel workers,
    # so only 1 and None mean serial. A "> 1" test would send -1 down the
    # serial branch and never parallelise.
    # NOTE(review): estimators that also set n_jobs=-1 run inside each worker,
    # which may oversubscribe CPUs — consider limiting inner threads.
    run_parallel = search.n_jobs is not None and search.n_jobs != 1
    if run_parallel:
        results = parallel_evaluate_objectives(param_jobs, search.n_jobs)
    else:
        results = [objective_global(job) for job in param_jobs]

    # Ascending by score: the first entry is the lowest error.
    results = sorted(results, key=lambda x: x[0])
    best_score, best_params = results[0]

    # Refit the best configuration on each fold to fill the out-of-fold predictions.
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]
        model = get_model(model_name, best_params)
        model.fit(X_train, y_train)
        resultados.iloc[val_idx] = model.predict(X_val)

    return resultados, best_params, best_score



# --- Executar tuning para todos os modelos ---
X = df.drop(columns="RefractiveIndex")  # features: every column except the target
y = df["RefractiveIndex"]  # target column
resultados = pd.DataFrame(index=df.index)  # one out-of-fold prediction column per model

modelos = ["XGBoost"]  # only XGBoost is tuned in this cell

# Tune each listed model, store its predictions and report the best configuration.
for nome in modelos:
    print(f"Tunando {nome}...")
    preds, best_params, score = tune_model(nome, X, y)
    resultados[nome + "_pred"] = preds
    print(f"Melhores parâmetros de {nome}: {best_params}")
    print(f"RRMSE: {score:.5f}\n")


# HistGradientBoosting

In [None]:
def parallel_evaluate_objectives(param_jobs, n_jobs):
    """Evaluate every job tuple with ``objective_global`` through joblib.

    Args:
        param_jobs: Iterable of job tuples; each one is passed unchanged to
            ``objective_global``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.

    Returns:
        The list returned by ``joblib.Parallel``: one ``objective_global``
        result per job.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(objective_global)(job) for job in param_jobs)
    return runner(tasks)

warnings.filterwarnings("ignore")  # silence every warning from here on
def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is a dict that is
            expanded into keyword arguments.

    Returns:
        ``(score, param)`` where ``score`` is what the objective returned.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs


# --- Métrica RRMSE ---
def rrmse(y_true, y_pred):
    """Relative RMSE: RMSE of the residuals divided by ``np.std(y_true)``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, subtracted element-wise from ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2)) / np.std(y_true)``.
    """
    residuals = y_true - y_pred
    root_mse = np.sqrt(np.mean(residuals ** 2))
    return root_mse / np.std(y_true)

# --- RandomSearch Customizado ---
from joblib import Parallel, delayed

def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    This redefines the identically named helper that appears earlier in this
    cell; this later definition is the one in effect afterwards.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is expanded into
            keyword arguments.

    Returns:
        ``(score, param)``.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs

class RandomSearch:
    """Random search over a discrete hyperparameter grid, evaluated with joblib.

    Each hyperparameter is drawn independently from its list of candidates
    with ``numpy.random.Generator.choice``.
    """

    def __init__(self, param_space, max_iter=200, n_jobs=-1, random_state=None):
        """Store the search configuration.

        Args:
            param_space: Dict mapping a hyperparameter name to its candidates.
            max_iter: Number of random configurations drawn by ``fmin``.
            n_jobs: Worker count forwarded to ``joblib.Parallel``. ``1`` or
                ``None`` evaluates serially in the current process.
            random_state: Seed forwarded to ``numpy.random.default_rng``.
        """
        self.param_space = param_space
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.rng = np.random.default_rng(random_state)

    def _parallel_enabled(self):
        """Return True when evaluations should be dispatched through joblib.

        joblib reads negative ``n_jobs`` values (such as the default -1) as a
        request for parallel workers, so only ``1`` and ``None`` mean serial.
        """
        return self.n_jobs is not None and self.n_jobs != 1

    def sample_params(self):
        """Return a dict with one randomly chosen value per hyperparameter."""
        return {
            key: self.rng.choice(values) for key, values in self.param_space.items()
        }

    def fmin(self, objective, **kwargs):
        """Evaluate ``max_iter`` random configurations and return the best.

        Args:
            objective: Callable invoked as ``objective(**params)``; lower
                return values are better.
            **kwargs: Accepted but not used.

        Returns:
            ``(score, params)`` of the configuration with the lowest score.
        """
        param_list = [self.sample_params() for _ in range(self.max_iter)]
        param_jobs = [(objective, p) for p in param_list]

        if self._parallel_enabled():
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(evaluate_param_objective)(job) for job in param_jobs
            )
        else:
            results = [evaluate_param_objective(job) for job in param_jobs]

        # Ascending by score: the first entry has the lowest error.
        results = sorted(results, key=lambda x: x[0])
        return results[0]



# --- Parâmetros de busca para cada modelo ---

num = 100  # Number of points generated for the linspace-based grids

# Candidate values per hyperparameter, keyed by model name. Each inner dict is a
# discrete grid from which RandomSearch.sample_params draws one value per key.
param_spaces = {
    "HistGradientBoosting": {
        "max_iter": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": [None] + list(range(3, 17, 2)),  # None is included to allow unlimited depth
        "l2_regularization": np.round(np.linspace(0.0, 5.0, num=num), 4).tolist(),
        "max_bins": list(range(128, 255, 14)),  # same grid as the CatBoost border_count
        "min_samples_leaf": list(range(10, 51, 5)),  # overfitting control
    }
}


# --- Modelos ---
def get_model(name, params):
    """Build an unfitted HistGradientBoosting regressor configured with ``params``.

    Args:
        name: Must be "HistGradientBoosting".
        params: Dict of keyword arguments forwarded to
            ``HistGradientBoostingRegressor``.

    Returns:
        A new ``HistGradientBoostingRegressor`` instance.

    Raises:
        ValueError: If ``name`` is anything other than "HistGradientBoosting".
    """
    if name != "HistGradientBoosting":
        raise ValueError("Modelo desconhecido")
    return HistGradientBoostingRegressor(**params)

# --- Tuning com validação cruzada ---
def objective(params, model_name, X, y, kf):
    """Mean cross-validated RRMSE of one hyperparameter configuration.

    Args:
        params: Hyperparameter dict passed to ``get_model``.
        model_name: Model identifier passed to ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        kf: Splitter whose ``split(X)`` yields (train, validation) indices.

    Returns:
        The mean of the per-fold ``rrmse`` values.
    """
    fold_scores = []
    for fit_rows, eval_rows in kf.split(X):
        # A fresh estimator per fold, trained only on that fold's training rows.
        estimator = get_model(model_name, params)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = estimator.predict(X.iloc[eval_rows])
        fold_scores.append(rrmse(y.iloc[eval_rows], predicted))
    return np.mean(fold_scores)

def objective_global(params_model_data):
    """Single-argument adapter around ``objective`` for parallel dispatch.

    Args:
        params_model_data: Tuple ``(params, model_name, X, y, kf)``.

    Returns:
        ``(score, params)`` where ``score`` is the value of ``objective``.
    """
    hyperparams, name, features, target, folds = params_model_data
    score = objective(hyperparams, name, features, target, folds)
    return score, hyperparams

def tune_model(model_name, X, y, random_state=None):
    """Random-search one model and return its out-of-fold predictions.

    Args:
        model_name: Key into ``param_spaces`` and identifier for ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        random_state: Optional seed for the hyperparameter sampling. The
            default ``None`` keeps the sampling unseeded.

    Returns:
        ``(resultados, best_params, best_score)``: a Series indexed like ``X``
        holding the out-of-fold predictions made with the best configuration,
        that configuration, and its score from ``objective_global``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    resultados = pd.Series(index=X.index, dtype=float)

    search = RandomSearch(
        param_spaces[model_name], max_iter=200, n_jobs=-1, random_state=random_state
    )

    # Each job bundles one sampled configuration with the fixed data and splitter.
    param_list = [search.sample_params() for _ in range(search.max_iter)]
    param_jobs = [(p, model_name, X, y, kf) for p in param_list]

    # joblib reads negative n_jobs (here -1) as a request for parallel workers,
    # so only 1 and None mean serial. A "> 1" test would send -1 down the
    # serial branch and never parallelise.
    # NOTE(review): the estimator may use several threads inside each worker,
    # which can oversubscribe CPUs — consider limiting inner threads.
    run_parallel = search.n_jobs is not None and search.n_jobs != 1
    if run_parallel:
        results = parallel_evaluate_objectives(param_jobs, search.n_jobs)
    else:
        results = [objective_global(job) for job in param_jobs]

    # Ascending by score: the first entry is the lowest error.
    results = sorted(results, key=lambda x: x[0])
    best_score, best_params = results[0]

    # Refit the best configuration on each fold to fill the out-of-fold predictions.
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]
        model = get_model(model_name, best_params)
        model.fit(X_train, y_train)
        resultados.iloc[val_idx] = model.predict(X_val)

    return resultados, best_params, best_score



# --- Executar tuning para todos os modelos ---
X = df.drop(columns="RefractiveIndex")  # features: every column except the target
y = df["RefractiveIndex"]  # target column
resultados = pd.DataFrame(index=df.index)  # one out-of-fold prediction column per model

modelos = ["HistGradientBoosting"]  # only HistGradientBoosting is tuned in this cell

# Tune each listed model, store its predictions and report the best configuration.
for nome in modelos:
    print(f"Tunando {nome}...")
    preds, best_params, score = tune_model(nome, X, y)
    resultados[nome + "_pred"] = preds
    print(f"Melhores parâmetros de {nome}: {best_params}")
    print(f"RRMSE: {score:.5f}\n")


# LightGBM

In [None]:
def parallel_evaluate_objectives(param_jobs, n_jobs):
    """Evaluate every job tuple with ``objective_global`` through joblib.

    Args:
        param_jobs: Iterable of job tuples; each one is passed unchanged to
            ``objective_global``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.

    Returns:
        The list returned by ``joblib.Parallel``: one ``objective_global``
        result per job.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(objective_global)(job) for job in param_jobs)
    return runner(tasks)


warnings.filterwarnings("ignore")  # silence every warning from here on
def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is a dict that is
            expanded into keyword arguments.

    Returns:
        ``(score, param)`` where ``score`` is what the objective returned.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs


# --- Métrica RRMSE ---
def rrmse(y_true, y_pred):
    """Relative RMSE: RMSE of the residuals divided by ``np.std(y_true)``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, subtracted element-wise from ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2)) / np.std(y_true)``.
    """
    residuals = y_true - y_pred
    root_mse = np.sqrt(np.mean(residuals ** 2))
    return root_mse / np.std(y_true)

# --- RandomSearch Customizado ---
from joblib import Parallel, delayed

def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    This redefines the identically named helper that appears earlier in this
    cell; this later definition is the one in effect afterwards.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is expanded into
            keyword arguments.

    Returns:
        ``(score, param)``.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs

class RandomSearch:
    """Random search over a discrete hyperparameter grid, evaluated with joblib.

    Each hyperparameter is drawn independently from its list of candidates
    with ``numpy.random.Generator.choice``.
    """

    def __init__(self, param_space, max_iter=200, n_jobs=-1, random_state=None):
        """Store the search configuration.

        Args:
            param_space: Dict mapping a hyperparameter name to its candidates.
            max_iter: Number of random configurations drawn by ``fmin``.
            n_jobs: Worker count forwarded to ``joblib.Parallel``. ``1`` or
                ``None`` evaluates serially in the current process.
            random_state: Seed forwarded to ``numpy.random.default_rng``.
        """
        self.param_space = param_space
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.rng = np.random.default_rng(random_state)

    def _parallel_enabled(self):
        """Return True when evaluations should be dispatched through joblib.

        joblib reads negative ``n_jobs`` values (such as the default -1) as a
        request for parallel workers, so only ``1`` and ``None`` mean serial.
        """
        return self.n_jobs is not None and self.n_jobs != 1

    def sample_params(self):
        """Return a dict with one randomly chosen value per hyperparameter."""
        return {
            key: self.rng.choice(values) for key, values in self.param_space.items()
        }

    def fmin(self, objective, **kwargs):
        """Evaluate ``max_iter`` random configurations and return the best.

        Args:
            objective: Callable invoked as ``objective(**params)``; lower
                return values are better.
            **kwargs: Accepted but not used.

        Returns:
            ``(score, params)`` of the configuration with the lowest score.
        """
        param_list = [self.sample_params() for _ in range(self.max_iter)]
        param_jobs = [(objective, p) for p in param_list]

        if self._parallel_enabled():
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(evaluate_param_objective)(job) for job in param_jobs
            )
        else:
            results = [evaluate_param_objective(job) for job in param_jobs]

        # Ascending by score: the first entry has the lowest error.
        results = sorted(results, key=lambda x: x[0])
        return results[0]



# --- Parâmetros de busca para cada modelo ---

num = 100  # Number of points generated for the linspace-based grids

# Candidate values per hyperparameter, keyed by model name. Each inner dict is a
# discrete grid from which RandomSearch.sample_params draws one value per key.
param_spaces = {
    "LightGBM": {
        "n_estimators": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "max_depth": list(range(3, 17, 2)),
        "num_leaves": list(range(20, 130, 11)),  # [20, 31, 42, ..., 119]
        # NOTE(review): LightGBM only applies subsample when subsample_freq > 0,
        # which is not set here — confirm whether row bagging is intended.
        "subsample": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "colsample_bytree": np.round(np.linspace(0.5, 1.0, num=6), 4).tolist(),
        "reg_lambda": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
        "reg_alpha": np.round(np.linspace(0.0, 5.0, num=6), 4).tolist(),
    }
}


# --- Modelos ---
def get_model(name, params):
    """Build an unfitted LightGBM regressor configured with ``params``.

    Args:
        name: Must be "LightGBM".
        params: Dict of keyword arguments forwarded to ``LGBMRegressor``.

    Returns:
        A new ``LGBMRegressor`` instance.

    Raises:
        ValueError: If ``name`` is anything other than "LightGBM".
    """
    if name != "LightGBM":
        raise ValueError("Modelo desconhecido")
    return LGBMRegressor(n_jobs=-1, **params)

# --- Tuning com validação cruzada ---
def objective(params, model_name, X, y, kf):
    """Mean cross-validated RRMSE of one hyperparameter configuration.

    Args:
        params: Hyperparameter dict passed to ``get_model``.
        model_name: Model identifier passed to ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        kf: Splitter whose ``split(X)`` yields (train, validation) indices.

    Returns:
        The mean of the per-fold ``rrmse`` values.
    """
    fold_scores = []
    for fit_rows, eval_rows in kf.split(X):
        # A fresh estimator per fold, trained only on that fold's training rows.
        estimator = get_model(model_name, params)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = estimator.predict(X.iloc[eval_rows])
        fold_scores.append(rrmse(y.iloc[eval_rows], predicted))
    return np.mean(fold_scores)

def objective_global(params_model_data):
    """Single-argument adapter around ``objective`` for parallel dispatch.

    Args:
        params_model_data: Tuple ``(params, model_name, X, y, kf)``.

    Returns:
        ``(score, params)`` where ``score`` is the value of ``objective``.
    """
    hyperparams, name, features, target, folds = params_model_data
    score = objective(hyperparams, name, features, target, folds)
    return score, hyperparams

def tune_model(model_name, X, y, random_state=None):
    """Random-search one model and return its out-of-fold predictions.

    Args:
        model_name: Key into ``param_spaces`` and identifier for ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        random_state: Optional seed for the hyperparameter sampling. The
            default ``None`` keeps the sampling unseeded.

    Returns:
        ``(resultados, best_params, best_score)``: a Series indexed like ``X``
        holding the out-of-fold predictions made with the best configuration,
        that configuration, and its score from ``objective_global``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    resultados = pd.Series(index=X.index, dtype=float)

    search = RandomSearch(
        param_spaces[model_name], max_iter=200, n_jobs=-1, random_state=random_state
    )

    # Each job bundles one sampled configuration with the fixed data and splitter.
    param_list = [search.sample_params() for _ in range(search.max_iter)]
    param_jobs = [(p, model_name, X, y, kf) for p in param_list]

    # joblib reads negative n_jobs (here -1) as a request for parallel workers,
    # so only 1 and None mean serial. A "> 1" test would send -1 down the
    # serial branch and never parallelise.
    # NOTE(review): estimators that also set n_jobs=-1 run inside each worker,
    # which may oversubscribe CPUs — consider limiting inner threads.
    run_parallel = search.n_jobs is not None and search.n_jobs != 1
    if run_parallel:
        results = parallel_evaluate_objectives(param_jobs, search.n_jobs)
    else:
        results = [objective_global(job) for job in param_jobs]

    # Ascending by score: the first entry is the lowest error.
    results = sorted(results, key=lambda x: x[0])
    best_score, best_params = results[0]

    # Refit the best configuration on each fold to fill the out-of-fold predictions.
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train = y.iloc[train_idx]
        model = get_model(model_name, best_params)
        model.fit(X_train, y_train)
        resultados.iloc[val_idx] = model.predict(X_val)

    return resultados, best_params, best_score



# --- Executar tuning para todos os modelos ---
X = df.drop(columns="RefractiveIndex")  # features: every column except the target
y = df["RefractiveIndex"]  # target column
resultados = pd.DataFrame(index=df.index)  # one out-of-fold prediction column per model

modelos = ["LightGBM"]  # only LightGBM is tuned in this cell

# Tune each listed model, store its predictions and report the best configuration.
for nome in modelos:
    print(f"Tunando {nome}...")
    preds, best_params, score = tune_model(nome, X, y)
    resultados[nome + "_pred"] = preds
    print(f"Melhores parâmetros de {nome}: {best_params}")
    print(f"RRMSE: {score:.5f}\n")


# CatBoost

In [None]:
n = 200  # number of random-search iterations used by this cell (RandomSearch max_iter)
def parallel_evaluate_objectives(param_jobs, n_jobs):
    """Evaluate every job tuple with ``objective_global`` through joblib.

    Args:
        param_jobs: Iterable of job tuples; each one is passed unchanged to
            ``objective_global``.
        n_jobs: Worker count forwarded to ``joblib.Parallel``.

    Returns:
        The list returned by ``joblib.Parallel``: one ``objective_global``
        result per job.
    """
    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(objective_global)(job) for job in param_jobs)
    return runner(tasks)


warnings.filterwarnings("ignore")  # silence every warning from here on
def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is a dict that is
            expanded into keyword arguments.

    Returns:
        ``(score, param)`` where ``score`` is what the objective returned.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs


# --- Métrica RRMSE ---
def rrmse(y_true, y_pred):
    """Relative RMSE: RMSE of the residuals divided by ``np.std(y_true)``.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, subtracted element-wise from ``y_true``.

    Returns:
        ``sqrt(mean((y_true - y_pred) ** 2)) / np.std(y_true)``.
    """
    residuals = y_true - y_pred
    root_mse = np.sqrt(np.mean(residuals ** 2))
    return root_mse / np.std(y_true)

# --- RandomSearch Customizado ---
from joblib import Parallel, delayed

def evaluate_param_objective(args):
    """Call an objective with one sampled parameter set.

    This redefines the identically named helper that appears earlier in this
    cell; this later definition is the one in effect afterwards.

    Args:
        args: ``(objective_func, param)`` pair; ``param`` is expanded into
            keyword arguments.

    Returns:
        ``(score, param)``.
    """
    func, kwargs = args
    score = func(**kwargs)
    return score, kwargs

class RandomSearch:
    """Random search over a discrete hyperparameter grid, evaluated with joblib.

    Each hyperparameter is drawn independently from its list of candidates
    with ``numpy.random.Generator.choice``.
    """

    def __init__(self, param_space, max_iter=n, n_jobs=-1, random_state=None):
        """Store the search configuration.

        Args:
            param_space: Dict mapping a hyperparameter name to its candidates.
            max_iter: Number of random configurations drawn by ``fmin``; the
                default is the module-level ``n`` at class-definition time.
            n_jobs: Worker count forwarded to ``joblib.Parallel``. ``1`` or
                ``None`` evaluates serially in the current process.
            random_state: Seed forwarded to ``numpy.random.default_rng``.
        """
        self.param_space = param_space
        self.max_iter = max_iter
        self.n_jobs = n_jobs
        self.rng = np.random.default_rng(random_state)

    def _parallel_enabled(self):
        """Return True when evaluations should be dispatched through joblib.

        joblib reads negative ``n_jobs`` values (such as the default -1) as a
        request for parallel workers, so only ``1`` and ``None`` mean serial.
        """
        return self.n_jobs is not None and self.n_jobs != 1

    def sample_params(self):
        """Return a dict with one randomly chosen value per hyperparameter."""
        return {
            key: self.rng.choice(values) for key, values in self.param_space.items()
        }

    def fmin(self, objective, **kwargs):
        """Evaluate ``max_iter`` random configurations and return the best.

        Args:
            objective: Callable invoked as ``objective(**params)``; lower
                return values are better.
            **kwargs: Accepted but not used.

        Returns:
            ``(score, params)`` of the configuration with the lowest score.
        """
        param_list = [self.sample_params() for _ in range(self.max_iter)]
        param_jobs = [(objective, p) for p in param_list]

        if self._parallel_enabled():
            results = Parallel(n_jobs=self.n_jobs)(
                delayed(evaluate_param_objective)(job) for job in param_jobs
            )
        else:
            results = [evaluate_param_objective(job) for job in param_jobs]

        # Ascending by score: the first entry has the lowest error.
        results = sorted(results, key=lambda x: x[0])
        return results[0]



# --- Parâmetros de busca para cada modelo ---

num = 100  # Number of points generated for the linspace-based grids

# Candidate values per hyperparameter, keyed by model name. Each inner dict is a
# discrete grid from which RandomSearch.sample_params draws one value per key.
param_spaces = {
    "CatBoost": {
        "iterations": np.linspace(100, 1000, num=num, dtype=int).tolist(),
        "learning_rate": np.round(np.linspace(0.01, 0.40, num=num), 4).tolist(),
        "depth": list(range(1, 15, 2)),  # odd depths 1, 3, ..., 13
        "random_strength": np.round(np.linspace(0.0, 1.0, num=num), 4).tolist(),
        "bagging_temperature": np.round(np.linspace(0.0, 1.5, num=num), 4).tolist(),
        "border_count": list(range(128, 255, 14)),
    }
}


# --- Modelos ---
def get_model(name, params):
    """Build an unfitted CatBoost regressor configured with ``params``.

    Args:
        name: Must be "CatBoost".
        params: Dict of keyword arguments forwarded to ``CatBoostRegressor``.

    Returns:
        A new ``CatBoostRegressor`` instance.

    Raises:
        ValueError: If ``name`` is anything other than "CatBoost".
    """
    if name != "CatBoost":
        raise ValueError("Modelo desconhecido")
    return CatBoostRegressor(verbose=0, thread_count=-1, **params)

# --- Tuning com validação cruzada ---
def objective(params, model_name, X, y, kf):
    """Mean cross-validated RRMSE of one hyperparameter configuration.

    Args:
        params: Hyperparameter dict passed to ``get_model``.
        model_name: Model identifier passed to ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.
        kf: Splitter whose ``split(X)`` yields (train, validation) indices.

    Returns:
        The mean of the per-fold ``rrmse`` values.
    """
    fold_scores = []
    for fit_rows, eval_rows in kf.split(X):
        # A fresh estimator per fold, trained only on that fold's training rows.
        estimator = get_model(model_name, params)
        estimator.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predicted = estimator.predict(X.iloc[eval_rows])
        fold_scores.append(rrmse(y.iloc[eval_rows], predicted))
    return np.mean(fold_scores)

def objective_global(params_model_data):
    """Single-argument adapter around ``objective`` for parallel dispatch.

    Args:
        params_model_data: Tuple ``(params, model_name, X, y, kf)``.

    Returns:
        ``(score, params)`` where ``score`` is the value of ``objective``.
    """
    hyperparams, name, features, target, folds = params_model_data
    score = objective(hyperparams, name, features, target, folds)
    return score, hyperparams

def tune_model(model_name, X, y):
    """Random-search one model sequentially, reporting every improvement.

    Configurations are evaluated one at a time; whenever a score is strictly
    lower than the best so far, the iteration number, score and
    hyperparameters are printed.

    Args:
        model_name: Key into ``param_spaces`` and identifier for ``get_model``.
        X: Feature table, indexed positionally with ``.iloc``.
        y: Target values, indexed positionally with ``.iloc``.

    Returns:
        ``(resultados, best_params, best_score)``: a Series indexed like ``X``
        holding the out-of-fold predictions made with the best configuration,
        that configuration, and its score from ``objective_global``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    resultados = pd.Series(index=X.index, dtype=float)

    search = RandomSearch(param_spaces[model_name], max_iter=n, n_jobs=-1)

    # Each job bundles one sampled configuration with the fixed data and splitter.
    param_jobs = [
        (search.sample_params(), model_name, X, y, kf)
        for _ in range(search.max_iter)
    ]

    best_score, best_params = np.inf, None

    for i, job in enumerate(param_jobs):
        # One job is dispatched at a time so progress can be printed as it goes.
        if search.n_jobs > 1:
            score, params = parallel_evaluate_objectives([job], search.n_jobs)[0]
        else:
            score, params = objective_global(job)

        if not (score < best_score):
            continue

        best_score, best_params = score, params
        print(f"[Iteração {i+1}] Novo melhor RRMSE: {best_score:.5f}")
        print(f"Hiperparâmetros: {best_params}\n")

    # Refit the best configuration on each fold to fill the out-of-fold predictions.
    for train_idx, val_idx in kf.split(X):
        model = get_model(model_name, best_params)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        resultados.iloc[val_idx] = model.predict(X.iloc[val_idx])

    return resultados, best_params, best_score




# --- Executar tuning para todos os modelos ---
X = df.drop(columns="RefractiveIndex")  # features: every column except the target
y = df["RefractiveIndex"]  # target column
resultados = pd.DataFrame(index=df.index)  # one out-of-fold prediction column per model

modelos = ["CatBoost"]  # only CatBoost is tuned in this cell

# Tune each listed model, store its predictions and report the best configuration.
for nome in modelos:
    print(f"Tunando {nome}...")
    preds, best_params, score = tune_model(nome, X, y)
    resultados[nome + "_pred"] = preds
    print(f"Melhores parâmetros de {nome}: {best_params}")
    print(f"RRMSE: {score:.5f}\n")


# Boostings mais std

In [None]:
X = df.drop(columns=['RefractiveIndex'])
y = df['RefractiveIndex']

# 10-fold shuffled cross-validation with a fixed seed, shared by every model.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Models with fixed hyperparameters.
# NOTE(review): presumably the best configurations found by the tuning cells — confirm.
modelos = {
    "XGBoost": XGBRegressor(
        n_estimators=581, learning_rate=0.0376, max_depth=13, subsample=0.8,
        colsample_bytree=1.0, reg_lambda=5.0, reg_alpha=0.0,
        random_state=42, tree_method="hist", verbosity=0,
    ),
    "HistGradientBoosting": HistGradientBoostingRegressor(
        max_iter=936, learning_rate=0.1518, max_depth=13, l2_regularization=2.1212,
        max_bins=198, min_samples_leaf=10, random_state=42,
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=600, learning_rate=0.0927, max_depth=15, num_leaves=108,
        subsample=0.6, colsample_bytree=0.7, reg_lambda=3.0, reg_alpha=0.0,
        random_state=42,
    ),
    "CatBoost": CatBoostRegressor(
        iterations=936, learning_rate=0.337, random_strength=0.1313,
        bagging_temperature=0.5455, border_count=254, random_state=42, verbose=0,
    ),
}

# Per-model lists that receive one value per fold for each metric.
metricas_por_fold = {nome: {"RD": [], "R2": [], "RMSE": [], "RRMSE": []} for nome in modelos}

# Cross-validation loop: every model is refit on each fold's training rows.
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    for nome, modelo in modelos.items():
        modelo.fit(X_train, y_train)
        y_pred = modelo.predict(X_val)

        # Fold metrics use *_fold names on purpose: binding `rrmse` here would
        # overwrite the notebook's rrmse() function used by the tuning code.
        rd_fold = np.mean(np.abs(y_val - y_pred) / np.abs(y_val)) * 100  # mean relative deviation, in %
        r2_fold = r2_score(y_val, y_pred)
        rmse_fold = np.sqrt(np.mean((y_val - y_pred) ** 2))
        rrmse_fold = rmse_fold / np.std(y_val)

        metricas_por_fold[nome]["RD"].append(rd_fold)
        metricas_por_fold[nome]["R2"].append(r2_fold)
        metricas_por_fold[nome]["RMSE"].append(rmse_fold)
        metricas_por_fold[nome]["RRMSE"].append(rrmse_fold)

# Mean and sample standard deviation (ddof=1) of every metric across folds.
estatisticas_metricas = {}

for nome, metricas in metricas_por_fold.items():
    estatisticas_metricas[nome] = {
        "RD médio": np.mean(metricas["RD"]),
        "RD std": np.std(metricas["RD"], ddof=1),
        "R^2 médio": np.mean(metricas["R2"]),
        "R^2 std": np.std(metricas["R2"], ddof=1),
        "RMSE médio": np.mean(metricas["RMSE"]),
        "RMSE std": np.std(metricas["RMSE"], ddof=1),
        "RRMSE médio": np.mean(metricas["RRMSE"]),
        "RRMSE std": np.std(metricas["RRMSE"], ddof=1)
    }

# One row per model, one column per statistic.
df_estatisticas = pd.DataFrame(estatisticas_metricas).T
print(df_estatisticas)


In [None]:
import requests

def notificar_telegram(token, chat_id, mensagem, timeout=10):
    """Send a text message through the Telegram Bot API ``sendMessage`` endpoint.

    The notification is best-effort: failures are printed and never raised,
    so a network problem cannot abort the notebook at its last step.

    Args:
        token: Bot token; it is embedded in the request URL.
        chat_id: Identifier of the destination chat.
        mensagem: Text of the message.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": mensagem}
    try:
        # requests has no default timeout; without one a stalled connection blocks forever.
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the bot token.
        print("Erro ao enviar mensagem:", type(exc).__name__)
        return
    if response.status_code != 200:
        print("Erro ao enviar mensagem:", response.text)

# Exemplo de uso no final do seu script
if __name__ == "__main__":
    # ... main code goes here ...

    import os

    # Credentials come from the environment so they are never stored in the
    # notebook. A token that was ever committed in plain text should be revoked.
    TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
    CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID")
    MENSAGEM = " -> Boosting terminou de rodar com sucesso!"

    # Notify Telegram at the end of the run, if credentials are configured.
    if TOKEN and CHAT_ID:
        notificar_telegram(TOKEN, CHAT_ID, MENSAGEM)
    else:
        print("TELEGRAM_BOT_TOKEN/TELEGRAM_CHAT_ID não definidos; notificação ignorada.")


# Rodando para dados limpos

In [None]:
# NOTE(review): absolute, machine-specific path — this only runs on the original
# author's machine; consider a configurable data directory.
caminho_arquivo = "/Users/joao altarugio/Desktop/Projeto LaMav/data/df_filtrado.pkl"
# NOTE(review): pickle.load can execute arbitrary code while loading — only
# open pickle files you produced yourself.
with open(caminho_arquivo, "rb") as f:
   df_filtrado = pickle.load(f)

In [None]:
X = df_filtrado.drop(columns=['RefractiveIndex'])
y = df_filtrado['RefractiveIndex']

# 10-fold shuffled cross-validation with a fixed seed, shared by every model.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Models with fixed hyperparameters.
# NOTE(review): presumably the best configurations found by the tuning cells — confirm.
modelos = {
    "XGBoost": XGBRegressor(
        n_estimators=581, learning_rate=0.0376, max_depth=13, subsample=0.8,
        colsample_bytree=1.0, reg_lambda=5.0, reg_alpha=0.0,
        random_state=42, tree_method="hist", verbosity=0,
    ),
    "HistGradientBoosting": HistGradientBoostingRegressor(
        max_iter=936, learning_rate=0.1518, max_depth=13, l2_regularization=2.1212,
        max_bins=198, min_samples_leaf=10, random_state=42,
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=600, learning_rate=0.0927, max_depth=15, num_leaves=108,
        subsample=0.6, colsample_bytree=0.7, reg_lambda=3.0, reg_alpha=0.0,
        random_state=42,
    ),
    "CatBoost": CatBoostRegressor(
        iterations=936, learning_rate=0.337, random_strength=0.1313,
        bagging_temperature=0.5455, border_count=254, random_state=42, verbose=0,
    ),
}

# Per-model lists that receive one value per fold for each metric.
metricas_por_fold = {nome: {"RD": [], "R2": [], "RMSE": [], "RRMSE": []} for nome in modelos}

# Cross-validation loop: every model is refit on each fold's training rows.
for train_idx, val_idx in kf.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    for nome, modelo in modelos.items():
        modelo.fit(X_train, y_train)
        y_pred = modelo.predict(X_val)

        # Fold metrics use *_fold names on purpose: binding `rrmse` here would
        # overwrite the notebook's rrmse() function used by the tuning code.
        rd_fold = np.mean(np.abs(y_val - y_pred) / np.abs(y_val)) * 100  # mean relative deviation, in %
        r2_fold = r2_score(y_val, y_pred)
        rmse_fold = np.sqrt(np.mean((y_val - y_pred) ** 2))
        rrmse_fold = rmse_fold / np.std(y_val)

        metricas_por_fold[nome]["RD"].append(rd_fold)
        metricas_por_fold[nome]["R2"].append(r2_fold)
        metricas_por_fold[nome]["RMSE"].append(rmse_fold)
        metricas_por_fold[nome]["RRMSE"].append(rrmse_fold)

# Mean and sample standard deviation (ddof=1) of every metric across folds.
estatisticas_metricas = {}

for nome, metricas in metricas_por_fold.items():
    estatisticas_metricas[nome] = {
        "RD médio": np.mean(metricas["RD"]),
        "RD std": np.std(metricas["RD"], ddof=1),
        "R^2 médio": np.mean(metricas["R2"]),
        "R^2 std": np.std(metricas["R2"], ddof=1),
        "RMSE médio": np.mean(metricas["RMSE"]),
        "RMSE std": np.std(metricas["RMSE"], ddof=1),
        "RRMSE médio": np.mean(metricas["RRMSE"]),
        "RRMSE std": np.std(metricas["RRMSE"], ddof=1)
    }

# One row per model, one column per statistic.
df_estatisticas = pd.DataFrame(estatisticas_metricas).T
print(df_estatisticas)


In [None]:
import requests

def notificar_telegram(token, chat_id, mensagem, timeout=10):
    """Send a text message through the Telegram Bot API ``sendMessage`` endpoint.

    The notification is best-effort: failures are printed and never raised,
    so a network problem cannot abort the notebook at its last step.

    Args:
        token: Bot token; it is embedded in the request URL.
        chat_id: Identifier of the destination chat.
        mensagem: Text of the message.
        timeout: Seconds to wait for the HTTP request before giving up.
    """
    url = f"https://api.telegram.org/bot{token}/sendMessage"
    payload = {"chat_id": chat_id, "text": mensagem}
    try:
        # requests has no default timeout; without one a stalled connection blocks forever.
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can contain the
        # request URL, which includes the bot token.
        print("Erro ao enviar mensagem:", type(exc).__name__)
        return
    if response.status_code != 200:
        print("Erro ao enviar mensagem:", response.text)

# Exemplo de uso no final do seu script
if __name__ == "__main__":
    # ... main code goes here ...

    import os

    # Credentials come from the environment so they are never stored in the
    # notebook. A token that was ever committed in plain text should be revoked.
    TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
    CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID")
    MENSAGEM = " -> Boosting terminou de rodar com sucesso!"

    # Notify Telegram at the end of the run, if credentials are configured.
    if TOKEN and CHAT_ID:
        notificar_telegram(TOKEN, CHAT_ID, MENSAGEM)
    else:
        print("TELEGRAM_BOT_TOKEN/TELEGRAM_CHAT_ID não definidos; notificação ignorada.")
