# Imports

In [22]:
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Data Prep


In [23]:
# Column groups used below. The two outcome columns are set aside BEFORE
# imputation so that a missing label is never replaced by a fabricated median.
OUTCOME_COLUMNS = ['pCR (outcome)', 'RelapseFreeSurvival (outcome)']
TARGET_COLUMN = 'RelapseFreeSurvival (outcome)'
KEY_FEATURE_COLUMNS = ['ER', 'HER2', 'Gene']

raw_data = pd.read_csv('../TrainDataset2024.csv', index_col=0)

# MISSING DATA
# 999 is the missing-value sentinel used throughout the file; turn it into NaN.
raw_data = raw_data.replace(999, np.nan)

# A row without a regression target cannot be used for training or scoring.
# Previously such rows silently received the median outcome as their label.
raw_data = raw_data.dropna(subset=[TARGET_COLUMN])

# Single-column frame (not a Series), cast to float as the imputer used to do.
target = raw_data[[TARGET_COLUMN]].astype(float)  # alternative outcome: 'pCR (outcome)'

# Median-impute the predictor columns only. SimpleImputer works column by
# column, so the feature medians are the same as before whenever no rows
# were dropped above.
features = raw_data.drop(columns=OUTCOME_COLUMNS)
imputer = SimpleImputer(strategy='median')
features = pd.DataFrame(
    imputer.fit_transform(features),
    columns=features.columns,
    index=features.index,
)

# These three columns bypass normalisation/PCA and are re-attached afterwards.
key_features = features[KEY_FEATURE_COLUMNS]
data = features.drop(columns=KEY_FEATURE_COLUMNS)

# Downstream cells index `target` positionally against `data`.
assert data.index.equals(target.index), "features and target are misaligned"


In [24]:

# NORMALISATION
# Rescale every row of `data` to unit L2 norm (the Normalizer default).
# NOTE(review): Normalizer works per sample (row), not per feature (column);
# if per-feature scaling was intended, StandardScaler is the usual choice - confirm.
normalizer = Normalizer(norm='l2')
vector_normalized_data = normalizer.fit(data).transform(data)


In [25]:

# FEATURE REDUCTION
# Keep as many principal components as are needed to explain 95% of the
# variance of the normalised feature matrix.
pca = PCA(n_components=0.95)
data_reduced = pca.fit_transform(vector_normalized_data)

# Put the components back on the original row index and append the key
# feature columns that were held out of the PCA.
pca_complete = pd.concat(
    [pd.DataFrame(data_reduced, index=data.index), key_features],
    axis=1,
)


# Model Training (Nested Cross-Validation)

In [26]:
# Hyperparameter grid explored by the inner cross-validation loop.
param_grid = dict(
    C=[0.1, 1, 10, 100],        # regularization parameter
    epsilon=[0.01, 0.1, 0.2],   # epsilon values (larger values will ignore smaller errors)
    kernel=['rbf'],             # RBF only; 'linear' and 'poly' are candidates to test as well
)

# CHANGE ME: estimator that is tuned and evaluated below.
model = SVR()


In [None]:
n_folds = 4


def _regression_metrics(y_true, y_pred):
    """Return the MSE, R² and MAE of one set of predictions as a dict."""
    return {
        "mse": mean_squared_error(y_true, y_pred),
        "r2": r2_score(y_true, y_pred),
        "mae": mean_absolute_error(y_true, y_pred),
    }


def _print_nested_cv_report(outer_results):
    """Print the per-fold metrics followed by their means across folds."""
    for i, result in enumerate(outer_results, 1):
        print(f"Fold {i}")
        print(f"Best Parameters: {result['best_params']}")
        print(f"Mean Squared Error (MSE): {result['mse']:.4f}")
        print(f"R-squared (R²): {result['r2']:.4f}")
        print(f"Mean Absolute Error (MAE): {result['mae']:.4f}")
        print(f"Model Score (R² on test set): {result['model_score']:.4f}")
        print("-" * 40)

    mean_mse = np.mean([res["mse"] for res in outer_results])
    mean_r2 = np.mean([res["r2"] for res in outer_results])
    mean_mae = np.mean([res["mae"] for res in outer_results])

    print(f"Mean MSE: {mean_mse:.4f}")
    print(f"Mean R²: {mean_r2:.4f}")
    print(f"Mean MAE: {mean_mae:.4f}")


def train_model(data, y=None, estimator=None, grid=None, n_splits=None, random_state=42):
    """Run nested K-fold cross-validation and report regression metrics.

    The outer loop splits ``data`` into train/test folds. Inside every outer
    training fold a ``GridSearchCV`` (inner K-fold) selects hyperparameters,
    and the refitted best estimator is then scored on the outer test fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per sample. Column labels are converted to
        ``str`` because the frame mixes integer (PCA) and string labels.
    y : array-like, optional
        Regression target aligned row-for-row with ``data``. Defaults to the
        notebook-level ``target``.
    estimator : estimator, optional
        Model to tune. Defaults to the notebook-level ``model``.
    grid : dict, optional
        Hyperparameter grid. Defaults to the notebook-level ``param_grid``.
    n_splits : int, optional
        Number of folds for both loops. Defaults to the notebook-level
        ``n_folds``.
    random_state : int, default 42
        Seed used to shuffle both the outer and the inner folds.

    Returns
    -------
    list of dict
        One entry per outer fold with keys ``best_params``, ``mse``, ``r2``,
        ``mae`` and ``model_score``.

    Raises
    ------
    ValueError
        If ``data`` and the target do not have the same number of rows.
    """
    # Fall back to the notebook globals so existing `train_model(frame)` calls
    # keep working, while making the dependencies explicit and overridable.
    y = target if y is None else y
    estimator = model if estimator is None else estimator
    grid = param_grid if grid is None else grid
    n_splits = n_folds if n_splits is None else n_splits

    features = data.rename(str, axis="columns")
    y_all = np.ravel(np.asarray(y))

    # Rows are matched by position, so a length mismatch would silently
    # misalign samples and labels (or fail later with an IndexError).
    if len(y_all) != len(features):
        raise ValueError(
            f"data has {len(features)} rows but the target has {len(y_all)} values"
        )

    outer_cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    outer_results = []
    for train_idx, test_idx in outer_cv.split(features):
        X_train, X_test = features.iloc[train_idx], features.iloc[test_idx]
        y_train, y_test = y_all[train_idx], y_all[test_idx]

        # Inner loop: hyperparameter tuning on the outer training fold only.
        inner_cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        grid_search = GridSearchCV(estimator=estimator, param_grid=grid, cv=inner_cv)
        grid_search.fit(X_train, y_train)

        # Evaluate the refitted best estimator on the held-out outer fold.
        best_model = grid_search.best_estimator_
        fold_result = {"best_params": grid_search.best_params_}
        fold_result.update(_regression_metrics(y_test, best_model.predict(X_test)))
        fold_result["model_score"] = best_model.score(X_test, y_test)
        outer_results.append(fold_result)

    _print_nested_cv_report(outer_results)

    # Returned so later cells can pick the best hyperparameters programmatically.
    return outer_results


In [28]:
# Nested cross-validation on the PCA components plus the ER/HER2/Gene columns.
train_model(data=pca_complete)

Fold 1
Best Parameters: {'C': 0.1, 'epsilon': 0.01, 'kernel': 'rbf'}
Mean Squared Error (MSE): 747.2811
R-squared (R²): 0.0018
Mean Absolute Error (MAE): 21.4990
Model Score (R² on test set): 0.0018
----------------------------------------
Fold 2
Best Parameters: {'C': 10, 'epsilon': 0.2, 'kernel': 'rbf'}
Mean Squared Error (MSE): 778.2653
R-squared (R²): -0.0803
Mean Absolute Error (MAE): 21.5004
Model Score (R² on test set): -0.0803
----------------------------------------
Mean MSE: 762.7732
Mean R²: -0.0393
Mean MAE: 21.4997


# Re-Train Single Final Model on entire dataset using best-performing hyperparameters