In [2]:
# Importação de bibliotecas
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV, GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import joblib # Para salvar o melhor modelo
import shap # Mantido para a próxima etapa de análise

# --- Definição de Constantes e Hiperparâmetros ---
# Conjuntos de features
FEATURE_SETS = {
    'DS1': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'column_Male', 'age', 'vo2máx'],
    'DS2': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'age', 'column_Male', 'vo2máx', 'speed'],
    'DS3': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'age', 'column_Male', 'vo2máx', 'speed', 'umidade_absoluta', 'dry_temp', 'wet_temp', 'relative_humidity']
}

# Grades de hiperparâmetros para cada modelo
PARAM_GRIDS = {
    'Lasso': {
        'model__alpha': np.linspace(0.01, 0.1, 10),
    },
    'SVR': {
        'model__C': [0.1, 1.0, 10.0],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'linear'],
    },
    'DecisionTree': {
        'model__max_depth': [5, 10, 15],
        'model__min_samples_leaf': [1, 5],
        'model__min_samples_split': [2, 5],
    },
    # 'RandomForest': {
    #     'model__n_estimators': [100, 200, 400],
    #     'model__max_depth': [5, 10, 15],
    #     'model__min_samples_leaf': [5, 10, 20],
    #     'model__max_features': ['sqrt', 'log2', 0.5, 0.7],
    #     'model__max_samples': [0.7, 0.8, 0.9]
    # },
    # 'XGBoost': {
    #     'model__n_estimators': [300, 500, 700],
    #     'model__max_depth': [3, 5, 7],
    #     'model__learning_rate': [0.01, 0.05, 0.1],
    #     'model__subsample': [0.5, 0.7, 0.9],
    #     'model__colsample_bytree': [0.5, 0.7, 0.9],
    # }
}

# Dicionário para mapear os nomes dos modelos para as suas classes
MODELS_TO_RUN = {
    'Lasso': Lasso(random_state=42, max_iter=20000),
    'SVR': SVR(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
}

# --- Funções para o Fluxo do Modelo ---

def load_data(X_path, y_path, groups_path):
    """
    Carrega os dados e realiza uma verificação inicial.
    """
    try:
        X = pd.read_csv(X_path)
        y = pd.read_csv(y_path)
        groups = pd.read_csv(groups_path)['trial_number']
    except FileNotFoundError as e:
        print(f"Erro: Arquivo não encontrado - {e.filename}.")
        return None, None, None

    # Verificação de valores ausentes
    if X.isnull().sum().sum() > 0 or y.isnull().sum().sum() > 0:
        print("Aviso: Dados contêm valores ausentes (NaN). Considere o pré-processamento para tratá-los.")

    return X, y, groups

def run_nested_cv(X, y, groups, feature_sets, param_grids, models_to_run):
    """
    Executa a Validação Cruzada Aninhada para múltiplos modelos,
    garantindo a separação de grupos e pré-processamento adequado.
    """
    final_results = []

    # Loop Externo (para a avaliação final do desempenho)
    # GroupShuffleSplit divide os dados em folds independentes por grupo.
    outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

    # Loop Interno (para a busca de hiperparâmetros)
    # LeaveOneGroupOut garante que um grupo inteiro seja usado para validação.
    inner_cv = LeaveOneGroupOut()

    for model_name, model_instance in models_to_run.items():
        print(f"\n--- Iniciando Nested CV para o modelo: {model_name} ---")

        for feature_set_name, feature_list in feature_sets.items():
            print(f"\n--- Otimizando com o conjunto de features: {feature_set_name} ---")

            test_rmse_scores = []
            test_r2_scores = []

            # Cria o pré-processador para o pipeline.
            # O ColumnTransformer aplica transformações seletivas por tipo de coluna,
            # evitando vazamento de dados de escalonamento para variáveis não numéricas.
            numeric_features = X[feature_list].select_dtypes(include=np.number).columns
            preprocessor = ColumnTransformer(
                transformers=[
                    ('scaler', StandardScaler(), numeric_features)
                ],
                remainder='passthrough'
            )

            # Configura o pipeline com ou sem o pré-processador, dependendo do modelo.
            if model_name in ['Lasso', 'SVR']:
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('model', model_instance)
                ])
            else:
                pipeline = Pipeline([
                    ('model', model_instance)
                ])

            param_grid = param_grids.get(model_name)
            if not param_grid:
                print(f"Aviso: Grade de hiperparâmetros não encontrada para {model_name}. Pulando este conjunto.")
                continue

            # Loop externo: dividindo o conjunto em treino/validação e teste
            for i, (train_val_idx, test_idx) in enumerate(outer_cv.split(X, y, groups=groups)):
                # Subconjuntos de dados para o fold atual
                X_train_val = X.iloc[train_val_idx][feature_list]
                y_train_val = y.iloc[train_val_idx].values.ravel()
                groups_train_val = groups.iloc[train_val_idx]

                X_test = X.iloc[test_idx][feature_list]
                y_test = y.iloc[test_idx].values.ravel()

                # Loop interno: GridSearchCV
                grid_search = GridSearchCV(
                    pipeline,
                    param_grid=param_grid,
                    cv=inner_cv,
                    # Usando a métrica nativa neg_root_mean_squared_error
                    scoring='neg_root_mean_squared_error',
                    n_jobs=-1,
                    verbose=0
                )

                # Executa a busca em grade no subconjunto de treino/validação
                # O Pipeline dentro do GridSearchCV garante que o fit() do StandardScaler
                # ocorra apenas com os dados de treino de cada fold interno.
                grid_search.fit(X_train_val, y_train_val, groups=groups_train_val)

                # Avalia o melhor modelo do grid search no conjunto de teste externo
                y_pred = grid_search.best_estimator_.predict(X_test)

                rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
                r2_score_val = r2_score(y_test, y_pred)

                test_rmse_scores.append(rmse_score)
                test_r2_scores.append(r2_score_val)

            # Média e desvio padrão das métricas dos folds externos
            mean_rmse = np.mean(test_rmse_scores)
            #std_rmse = np.std(test_rmse_scores)
            mean_r2 = np.mean(test_r2_scores)
            #std_r2 = np.std(test_r2_scores)

            print(f"--- Resultado Final do Nested CV para {model_name} com {feature_set_name} ---")
            print(f"  > RMSE: {mean_rmse:.4f} +/- {std_rmse:.4f}")
            print(f"  > R²: {mean_r2:.4f} +/- {std_r2:.4f}")

            final_results.append({
                'model': model_name,
                'feature_set': feature_set_name,
                'mean_rmse': mean_rmse,
                'std_rmse': std_rmse,
                'mean_r2': mean_r2,
                'std_r2': std_r2
            })

    return pd.DataFrame(final_results)


# --- Função Principal de Execução ---

if __name__ == '__main__':
    # 1. Carregar os dados
    print("--- Carregando os dados ---")
    X, y, groups = load_data(
        X_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/X-data1-1km.csv',
        y_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/y-data1-1km.csv',
        groups_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/groups-data1-1km.csv'
    )
    if X is None:
        exit()

    print("Dados carregados com sucesso.")

    # 2. Executar o Nested CV
    print("\n--- Iniciando o Nested Cross-Validation para todos os algoritmos ---")
    results_df = run_nested_cv(
        X,
        y,
        groups,
        FEATURE_SETS,
        PARAM_GRIDS,
        MODELS_TO_RUN
    )

    # 3. Exibir os resultados finais
    print("\n--- Resumo dos Resultados do Nested CV ---")
    display(results_df.round(4))

--- Carregando os dados ---
Dados carregados com sucesso.

--- Iniciando o Nested Cross-Validation para todos os algoritmos ---

--- Iniciando Nested CV para o modelo: Lasso ---

--- Otimizando com o conjunto de features: DS1 ---
--- Resultado Final do Nested CV para Lasso com DS1 ---


NameError: name 'std_rmse' is not defined

In [1]:
# Importação de bibliotecas
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV, GroupShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import joblib # Para salvar o melhor modelo
import shap # Mantido para a próxima etapa de análise

# --- Definição de Constantes e Hiperparâmetros ---
# Conjuntos de features
FEATURE_SETS = {
    'DS1': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'column_Male', 'age', 'vo2máx'],
    'DS2': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'age', 'column_Male', 'vo2máx', 'speed'],
    'DS3': ['kilometer', 'wbgt', 'skin_temp', 'heart_rate', 'age', 'column_Male', 'vo2máx', 'speed', 'umidade_absoluta', 'dry_temp', 'wet_temp', 'relative_humidity']
}

# Grades de hiperparâmetros para cada modelo
PARAM_GRIDS = {
    'Lasso': {
        'model__alpha': np.linspace(0.01, 0.1, 10),
    },
    'SVR': {
        'model__C': [0.1, 1.0, 10.0],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf', 'linear'],
    },
    'DecisionTree': {
        'model__max_depth': [5, 10, 15],
        'model__min_samples_leaf': [1, 5, 8, 10],
        'model__min_samples_split': [2, 5, 8, 10],
    },
    'RandomForest': {
        'model__n_estimators': [100, 200, 400],
        'model__max_depth': [5, 10, 15],
        'model__min_samples_leaf': [5, 10, 20],
        'model__max_features': ['sqrt', 'log2', 0.5, 0.7],
        'model__max_samples': [0.7, 0.8, 0.9]
    },
    'XGBoost': {
        'model__n_estimators': [300, 500, 700],
        'model__max_depth': [3, 5, 7],
        'model__learning_rate': [0.01, 0.05, 0.1],
        'model__subsample': [0.5, 0.7, 0.9],
        'model__colsample_bytree': [0.5, 0.7, 0.9],
    }
}

# Dicionário para mapear os nomes dos modelos para as suas classes
MODELS_TO_RUN = {
    'Lasso': Lasso(random_state=42, max_iter=20000),
    'SVR': SVR(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(random_state=42),
    'XGBoost': xgb.XGBRegressor(random_state=42, objective='reg:squarederror')
}

# --- Funções para o Fluxo do Modelo ---

def load_data(X_path, y_path, groups_path):
    """
    Carrega os dados e realiza uma verificação inicial.
    """
    try:
        X = pd.read_csv(X_path)
        y = pd.read_csv(y_path)
        groups = pd.read_csv(groups_path)['trial_number']
    except FileNotFoundError as e:
        print(f"Erro: Arquivo não encontrado - {e.filename}.")
        return None, None, None

    # Verificação de valores ausentes
    if X.isnull().sum().sum() > 0 or y.isnull().sum().sum() > 0:
        print("Aviso: Dados contêm valores ausentes (NaN). Considere o pré-processamento para tratá-los.")

    return X, y, groups

def run_nested_cv(X, y, groups, feature_sets, param_grids, models_to_run):
    """
    Executa a Validação Cruzada Aninhada para múltiplos modelos,
    incluindo a análise de scores de treino para identificar overfitting.
    """
    final_results = []

    outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    inner_cv = LeaveOneGroupOut()

    for model_name, model_instance in models_to_run.items():
        print(f"\n--- Iniciando Nested CV para o modelo: {model_name} ---")

        for feature_set_name, feature_list in feature_sets.items():
            print(f"\n--- Otimizando com o conjunto de features: {feature_set_name} ---")

            # Listas para armazenar as métricas de cada fold externo
            test_rmse_scores = []
            test_r2_scores = []
            train_rmse_scores = [] # Adicionada para os scores de treino
            train_r2_scores = [] # Adicionada para os scores de treino

            # Cria o pré-processador para o pipeline.
            numeric_features = X[feature_list].select_dtypes(include=np.number).columns
            preprocessor = ColumnTransformer(
                transformers=[
                    ('scaler', StandardScaler(), numeric_features)
                ],
                remainder='passthrough'
            )

            if model_name in ['Lasso', 'SVR']:
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('model', model_instance)
                ])
            else:
                pipeline = Pipeline([
                    ('model', model_instance)
                ])

            param_grid = param_grids.get(model_name)
            if not param_grid:
                print(f"Aviso: Grade de hiperparâmetros não encontrada para {model_name}. Pulando este conjunto.")
                continue

            # Loop externo: dividindo o conjunto em treino/validação e teste
            for i, (train_val_idx, test_idx) in enumerate(outer_cv.split(X, y, groups=groups)):
                # Subconjuntos de dados para o fold atual
                X_train_val = X.iloc[train_val_idx][feature_list]
                y_train_val = y.iloc[train_val_idx].values.ravel()
                groups_train_val = groups.iloc[train_val_idx]

                X_test = X.iloc[test_idx][feature_list]
                y_test = y.iloc[test_idx].values.ravel()

                # Loop interno: GridSearchCV
                grid_search = GridSearchCV(
                    pipeline,
                    param_grid=param_grid,
                    cv=inner_cv,
                    # Usando a métrica nativa neg_root_mean_squared_error
                    # e configurando para salvar os scores de treino
                    scoring='neg_root_mean_squared_error',
                    return_train_score=True, # Importante: Habilita a coleta de scores de treino
                    n_jobs=-1,
                    verbose=0
                )

                # Executa a busca em grade no subconjunto de treino/validação
                grid_search.fit(X_train_val, y_train_val, groups=groups_train_val)

                # Avalia o melhor modelo no conjunto de teste externo
                y_pred = grid_search.best_estimator_.predict(X_test)

                rmse_score = np.sqrt(mean_squared_error(y_test, y_pred))
                r2_score_val = r2_score(y_test, y_pred)

                test_rmse_scores.append(rmse_score)
                test_r2_scores.append(r2_score_val)

                # Acessa os scores de treino e converte de negativo para positivo
                best_index = grid_search.best_index_
                train_rmse_val = -grid_search.cv_results_['mean_train_score'][best_index]
                # Scikit-learn não oferece R² de treino diretamente em cv_results_, então
                # para maior rigor, precisaríamos calcular, mas o RMSE já é suficiente para a análise de overfitting.
                # Para simplificar, vamos usar uma aproximação.
                train_rmse_scores.append(train_rmse_val)
                train_r2_scores.append(grid_search.best_estimator_.score(X_train_val, y_train_val))

            # Média e desvio padrão das métricas dos folds externos
            mean_rmse_test = np.mean(test_rmse_scores)
            std_rmse_test = np.std(test_rmse_scores)
            mean_r2_test = np.mean(test_r2_scores)
            std_r2_test = np.std(test_r2_scores)

            mean_rmse_train = np.mean(train_rmse_scores)
            std_rmse_train = np.std(train_rmse_scores)
            mean_r2_train = np.mean(train_r2_scores)
            std_r2_train = np.std(train_r2_scores)

            print(f"--- Resultado Final do Nested CV para {model_name} com {feature_set_name} ---")
            print(f"  > RMSE Teste: {mean_rmse_test:.4f} +/- {std_rmse_test:.4f}")
            print(f"  > RMSE Treino: {mean_rmse_train:.4f} +/- {std_rmse_train:.4f}")
            print(f"  > R² Teste: {mean_r2_test:.4f} +/- {std_r2_test:.4f}")
            print(f"  > R² Treino: {mean_r2_train:.4f} +/- {std_r2_train:.4f}")

            final_results.append({
                'model': model_name,
                'feature_set': feature_set_name,
                'mean_rmse_test': mean_rmse_test,
                'std_rmse_test': std_rmse_test,
                'mean_rmse_train': mean_rmse_train,
                'std_rmse_train': std_rmse_train,
                'mean_r2_test': mean_r2_test,
                'std_r2_test': std_r2_test,
                'mean_r2_train': mean_r2_train,
                'std_r2_train': std_r2_train
            })

    return pd.DataFrame(final_results)


# --- Função Principal de Execução ---

if __name__ == '__main__':
    # 1. Carregar os dados
    print("--- Carregando os dados ---")
    X, y, groups = load_data(
        X_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/X-data1-1km.csv',
        y_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/y-data1-1km.csv',
        groups_path='/home/usuario-leticia/Desktop/Samuel/leticiaag/tcore/data/processed-data/groups-data1-1km.csv'
    )
    if X is None:
        exit()

    print("Dados carregados com sucesso.")

    # 2. Executar o Nested CV
    print("\n--- Iniciando o Nested Cross-Validation para todos os algoritmos ---")
    results_df = run_nested_cv(
        X,
        y,
        groups,
        FEATURE_SETS,
        PARAM_GRIDS,
        MODELS_TO_RUN
    )

    # 3. Exibir os resultados finais
    print("\n--- Resumo dos Resultados do Nested CV ---")
    display(results_df.round(4))


--- Carregando os dados ---
Dados carregados com sucesso.

--- Iniciando o Nested Cross-Validation para todos os algoritmos ---

--- Iniciando Nested CV para o modelo: Lasso ---

--- Otimizando com o conjunto de features: DS1 ---
--- Resultado Final do Nested CV para Lasso com DS1 ---
  > RMSE Teste: 0.4523 +/- 0.0392
  > RMSE Treino: 0.4017 +/- 0.0055
  > R² Teste: 0.7882 +/- 0.0371
  > R² Treino: 0.8193 +/- 0.0040

--- Otimizando com o conjunto de features: DS2 ---
--- Resultado Final do Nested CV para Lasso com DS2 ---
  > RMSE Teste: 0.4440 +/- 0.0449
  > RMSE Treino: 0.4030 +/- 0.0081
  > R² Teste: 0.7959 +/- 0.0386
  > R² Treino: 0.8180 +/- 0.0085

--- Otimizando com o conjunto de features: DS3 ---
--- Resultado Final do Nested CV para Lasso com DS3 ---
  > RMSE Teste: 0.4441 +/- 0.0449
  > RMSE Treino: 0.4028 +/- 0.0082
  > R² Teste: 0.7958 +/- 0.0386
  > R² Treino: 0.8182 +/- 0.0087

--- Iniciando Nested CV para o modelo: SVR ---

--- Otimizando com o conjunto de features: DS1 

Unnamed: 0,model,feature_set,mean_rmse_test,std_rmse_test,mean_rmse_train,std_rmse_train,mean_r2_test,std_r2_test,mean_r2_train,std_r2_train
0,Lasso,DS1,0.4523,0.0392,0.4017,0.0055,0.7882,0.0371,0.8193,0.004
1,Lasso,DS2,0.444,0.0449,0.403,0.0081,0.7959,0.0386,0.818,0.0085
2,Lasso,DS3,0.4441,0.0449,0.4028,0.0082,0.7958,0.0386,0.8182,0.0087
3,SVR,DS1,0.4575,0.0389,0.3987,0.0066,0.7846,0.0294,0.8216,0.0069
4,SVR,DS2,0.4362,0.0374,0.3941,0.0062,0.8038,0.0287,0.826,0.0057
5,SVR,DS3,0.4405,0.0387,0.3931,0.0066,0.7999,0.0297,0.8269,0.0067
6,DecisionTree,DS1,0.4843,0.0221,0.344,0.0164,0.7579,0.0293,0.8674,0.0122
7,DecisionTree,DS2,0.4764,0.0259,0.3454,0.0195,0.7649,0.0354,0.8653,0.0166
8,DecisionTree,DS3,0.4814,0.0288,0.3523,0.0102,0.7598,0.0374,0.8602,0.0099
9,RandomForest,DS1,0.4085,0.0358,0.2669,0.0097,0.8279,0.0265,0.9206,0.0062
