In [4]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # RandomForestRegressor

# --- Load data -------------------------------------------------------------
# The first CSV column becomes the row index.
data = pd.read_csv('TrainDataset2024.csv', index_col=0)

# --- Handle missing data ---------------------------------------------------
# 999 is the missing-value marker in this file; turn it into NaN so the
# imputer can replace it with the per-column median.
# NOTE(review): the imputer is fitted on every row and on every column,
# including the two outcome columns, before any cross-validation split is
# made -- confirm that this is intended.
data = data.replace(999, np.nan)
imputer = SimpleImputer(strategy='median')
data = pd.DataFrame(
    imputer.fit_transform(data),
    columns=data.columns,
    index=data.index,
)

# --- Target and features ---------------------------------------------------
# Keep the regression target as a one-column frame, then strip both outcome
# columns from the feature table.
target = data[['RelapseFreeSurvival (outcome)']]
data = data.drop(columns=['pCR (outcome)', 'RelapseFreeSurvival (outcome)'])

# ER / HER2 / Gene are set aside so they bypass normalisation and PCA.
key_features = data[['ER', 'HER2', 'Gene']]
data = data.drop(columns=['ER', 'HER2', 'Gene'])

# --- Normalise and reduce --------------------------------------------------
# Normalizer rescales each row to unit norm; PCA with a float n_components
# keeps as many components as are needed to reach that fraction of variance.
# NOTE(review): PCA is fitted on all rows here, i.e. before train_model
# creates its cross-validation folds.
normalizer = Normalizer()
vector_normalized_data = normalizer.fit_transform(data)
pca = PCA(n_components=0.95)
data_reduced = pca.fit_transform(vector_normalized_data)

# Re-attach the original row index to the components, then append the key
# features column-wise. The component columns carry integer labels while the
# key-feature columns carry string labels.
pca_complete = pd.DataFrame(data_reduced, index=data.index)
pca_complete = pd.concat([pca_complete, key_features], axis=1)

# --- Hyperparameter grid for RandomForestRegressor -------------------------
param_grid = dict(
    n_estimators=[50, 100, 200],       # number of trees in the forest
    max_depth=[5, 10, 20, None],       # maximum depth of each tree
    min_samples_split=[2, 5, 10],      # minimum samples required to split a node
    min_samples_leaf=[1, 2, 5, 10],    # minimum samples required in a leaf
)

# --- Estimator and fold count ----------------------------------------------
# Fixed seed so repeated runs build the same forest.
model = RandomForestRegressor(random_state=42)

# Used for both the outer and the inner K-fold split in train_model.
n_folds = 4

def train_model(data):
    """Run nested K-fold cross-validation and print regression metrics.

    The outer loop splits ``data`` and the module-level ``target`` into
    ``n_folds`` shuffled folds. Within each outer training split, a
    ``GridSearchCV`` over the module-level ``param_grid`` tunes the
    module-level ``model`` using its own ``n_folds``-fold split; the best
    estimator found is then evaluated on the held-out outer fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. Rows are paired with ``target`` by position, and the
        column labels are converted to ``str`` before fitting.

    Returns
    -------
    None
        Per-fold results and the mean MSE / R² / MAE are printed only.
    """
    # Integer and string column labels may be mixed; make them all strings.
    features = data.rename(str, axis="columns")

    outer_cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Same configuration for every outer fold, so one splitter is enough.
    inner_cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    outer_results = []
    for fit_rows, holdout_rows in outer_cv.split(features, target):
        # Positional split of features and (flattened) target values.
        X_fit = features.iloc[fit_rows]
        X_holdout = features.iloc[holdout_rows]
        y_fit = np.ravel(target.iloc[fit_rows])
        y_holdout = np.ravel(target.iloc[holdout_rows])

        # Inner loop: exhaustive grid search on the outer training split.
        search = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv)
        search.fit(X_fit, y_fit)

        # Score the tuned estimator on the untouched outer fold.
        tuned = search.best_estimator_
        predictions = tuned.predict(X_holdout)

        outer_results.append({
            "best_params": search.best_params_,
            "mse": mean_squared_error(y_holdout, predictions),
            "r2": r2_score(y_holdout, predictions),
            "mae": mean_absolute_error(y_holdout, predictions),
            "model_score": tuned.score(X_holdout, y_holdout),
        })

    # Per-fold report.
    for i, result in enumerate(outer_results, 1):
        print(f"Fold {i}")
        print(f"Best Parameters: {result['best_params']}")
        print(f"Mean Squared Error (MSE): {result['mse']:.4f}")
        print(f"R-squared (R²): {result['r2']:.4f}")
        print(f"Mean Absolute Error (MAE): {result['mae']:.4f}")
        print(f"Model Score (R² on test set): {result['model_score']:.4f}")
        print("-" * 40)

    def _mean_of(metric):
        # Average one metric across all outer folds.
        return np.mean([fold[metric] for fold in outer_results])

    # Summary across the outer folds.
    mean_mse = _mean_of("mse")
    mean_r2 = _mean_of("r2")
    mean_mae = _mean_of("mae")

    print(f"Mean MSE: {mean_mse:.4f}")
    print(f"Mean R²: {mean_r2:.4f}")
    print(f"Mean MAE: {mean_mae:.4f}")


In [6]:
# Run the nested cross-validation on the PCA components joined with the ER/HER2/Gene columns.
train_model(pca_complete)

Fold 1
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error (MSE): 816.9092
R-squared (R²): -0.0672
Mean Absolute Error (MAE): 22.4601
Model Score (R² on test set): -0.0672
----------------------------------------
Fold 2
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}
Mean Squared Error (MSE): 767.8209
R-squared (R²): -0.0510
Mean Absolute Error (MAE): 21.6029
Model Score (R² on test set): -0.0510
----------------------------------------
Fold 3
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error (MSE): 677.5240
R-squared (R²): -0.0177
Mean Absolute Error (MAE): 20.8123
Model Score (R² on test set): -0.0177
----------------------------------------
Fold 4
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 200}
Mean Squared Error (MSE): 782.2355
R-squa