In [1]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import Normalizer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('TrainDataset2024.csv', index_col=0)

from sklearn.impute import KNNImputer

# Replace missing values (999) with NaN
data.replace(999, np.nan, inplace=True)

# Apply KNN imputation
knn_imputer = KNNImputer(n_neighbors=5, weights="uniform")  # Use 5 nearest neighbors
data_imputed = pd.DataFrame(knn_imputer.fit_transform(data), columns=data.columns, index=data.index)

# Post-process to restore binary columns (if necessary)
binary_columns = ['ER', 'HER2']  # Add any binary columns here
for col in binary_columns:
    data_imputed[col] = (data_imputed[col] >= 0.5).astype(int)  # Thresholding to restore binary nature

# Update data with imputed values
data = data_imputed

# Prepare the dataset as before
target = data[['RelapseFreeSurvival (outcome)']]
data.drop(columns=['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], axis=1, inplace=True)
key_features = data[['ER', 'HER2', 'Gene']]
data.drop(columns=['ER', 'HER2', 'Gene'], axis=1, inplace=True)


In [3]:
# NORMALIZATION
normalizer = Normalizer()
vector_normalized_data = normalizer.fit_transform(data)

In [4]:
# FEATURE REDUCTION
pca = PCA(n_components=0.95)
data_reduced = pca.fit_transform(vector_normalized_data)

pca_complete = pd.DataFrame(data_reduced, index=data.index)
pca_complete = pd.concat([pca_complete, key_features], axis=1)


In [5]:
# LINEAR REGRESSION MODEL
model = LinearRegression()
n_folds = 4

In [6]:
def train_model(data):
    # Outer K-fold cross-validation
    outer_cv = KFold(n_splits=n_folds, shuffle=True, random_state=42)

    data = data.rename(str, axis="columns") 

    # Perform nested cross-validation
    outer_results = []
    for train_idx, test_idx in outer_cv.split(data, target):
        # Split data
        X_train, X_test = data.iloc[train_idx], data.iloc[test_idx]
        y_train, y_test = np.ravel(target.iloc[train_idx]), np.ravel(target.iloc[test_idx])

        # Train and evaluate the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Collect results using regression metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)

        # Collect results
        outer_results.append({
            "mse": mse,
            "r2": r2,
            "mae": mae,
            "model_score": model.score(X_test, y_test)
        })

    # Print the results for each fold
    for i, result in enumerate(outer_results, 1):
        print(f"Fold {i}")
        print(f"Mean Squared Error (MSE): {result['mse']:.4f}")
        print(f"R-squared (R²): {result['r2']:.4f}")
        print(f"Mean Absolute Error (MAE): {result['mae']:.4f}")
        print(f"Model Score (R² on test set): {result['model_score']:.4f}")
        print("-" * 40)

    # Overall results
    mean_mse = np.mean([res["mse"] for res in outer_results])
    mean_r2 = np.mean([res["r2"] for res in outer_results])
    mean_mae = np.mean([res["mae"] for res in outer_results])

    print(f"Mean MSE: {mean_mse:.4f}")
    print(f"Mean R²: {mean_r2:.4f}")
    print(f"Mean MAE: {mean_mae:.4f}")

In [7]:
train_model(pca_complete)

Fold 1
Mean Squared Error (MSE): 784.0274
R-squared (R²): -0.0243
Mean Absolute Error (MAE): 22.0130
Model Score (R² on test set): -0.0243
----------------------------------------
Fold 2
Mean Squared Error (MSE): 728.6782
R-squared (R²): 0.0026
Mean Absolute Error (MAE): 21.6288
Model Score (R² on test set): 0.0026
----------------------------------------
Fold 3
Mean Squared Error (MSE): 737.7183
R-squared (R²): -0.1081
Mean Absolute Error (MAE): 21.7586
Model Score (R² on test set): -0.1081
----------------------------------------
Fold 4
Mean Squared Error (MSE): 821.7749
R-squared (R²): -0.0627
Mean Absolute Error (MAE): 21.8359
Model Score (R² on test set): -0.0627
----------------------------------------
Mean MSE: 768.0497
Mean R²: -0.0481
Mean MAE: 21.8091


In [8]:
def train_model_single(data):
    data = data.rename(str, axis="columns") 

    # Split data into 80% training and 20% testing
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

    # Train the model on the training data
    model.fit(X_train, np.ravel(y_train))

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate the regression metrics
    mse = mean_squared_error(np.ravel(y_test), y_pred)
    r2 = r2_score(np.ravel(y_test), y_pred)
    mae = mean_absolute_error(np.ravel(y_test), y_pred)

    # Print the results
    print("Mean Squared Error (MSE):", mse)
    print(f"R-squared (R²): {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Model Score (R² on test set): {model.score(X_test, np.ravel(y_test)):.4f}")

In [9]:
train_model_single(pca_complete)

Mean Squared Error (MSE): 807.666093291214
R-squared (R²): -0.0120
Mean Absolute Error (MAE): 21.9301
Model Score (R² on test set): -0.0120


In [10]:
def displayDatasets(train_data):
    #Print the current datasets
    print("Training Dataset:")
    print(data.head())
displayDatasets(data)

Training Dataset:
            Age  PgR  TrippleNegative  ChemoGrade  Proliferation  \
ID                                                                 
TRG002174  41.0  0.0              1.0         3.0            3.0   
TRG002178  39.0  1.0              0.0         3.0            3.0   
TRG002204  31.0  0.0              1.0         2.0            1.0   
TRG002206  35.0  0.0              1.0         3.0            3.0   
TRG002210  61.0  0.0              0.0         2.0            1.0   

           HistologyType  LNStatus  TumourStage  original_shape_Elongation  \
ID                                                                           
TRG002174            1.0       1.0          2.0                   0.813912   
TRG002178            1.0       1.0          2.0                   0.666118   
TRG002204            1.0       0.0          2.0                   0.645083   
TRG002206            1.0       1.0          3.0                   0.770842   
TRG002210            1.0       0.0   