In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
print('Done')

Done


In [2]:
# --- Block 1: Data Loading and Initial Cleaning ---
print('Starting data loading and initial cleaning...')
df = pd.read_csv(r'C:\Users\Maverick\Downloads\brca_metabric_clinical_data.tsv', sep='\t')
print(f"Dataset shape before initial cleaning: {df.shape}")

# Drop irrelevant ID columns
df = df.drop(columns=['Study ID', 'Patient ID', 'Sample ID', 'Oncotree Code'], axis=1)
print(f"Dataset shape after dropping ID columns: {df.shape}")

# Handle missing target values
df = df[~df['Overall Survival Status'].isnull()]
print(f"Dataset shape after handling missing target values: {df.shape}")

# Create derived feature 'Survival_Clinical'
def categorize_by_clinical(x):
    if x < 12:
        return "Very Short (<12)"
    elif x < 36:
        return "Intermediate (12–36)"
    else:
        return "Long (>36)"

df["Survival_Clinical"] = df["Overall Survival (Months)"].apply(categorize_by_clinical)

Starting data loading and initial cleaning...
Dataset shape before initial cleaning: (2509, 39)
Dataset shape after dropping ID columns: (2509, 35)
Dataset shape after handling missing target values: (1981, 35)


In [3]:
# --- Block 2: Train-Test Split ---
X = df.drop(columns=[
    'Overall Survival Status',
    'Survival_Clinical',
    'Overall Survival (Months)'
])
y = df['Overall Survival Status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("Train-test split completed.")
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train-test split completed.
Train shape: (1584, 33), Test shape: (397, 33)


In [4]:
# --- Block 3: Custom Transformer (No Column Name Dependency) ---
class CustomNumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.quantiles_ = {}
        self.cols_to_transform_idx = []  # Store indices, not names
    
    def fit(self, X, y=None):
        # Identify indices of columns to transform
        if isinstance(X, pd.DataFrame):
            self.cols_to_transform_idx = [i for i, col in enumerate(X.columns) if col in [
                'Cohort', 'Number of Samples Per Patient', 'TMB (nonsynonymous)', 'Tumor Size'
            ]]
            X = X.values
        else:
            # Assume columns are in original order
            self.cols_to_transform_idx = [i for i in range(X.shape[1]) if i < 4]  # Adjust based on your data
        
        # Compute quantiles
        for idx in self.cols_to_transform_idx:
            self.quantiles_[idx] = np.quantile(X[:, idx], 0.99)
        return self
    
    def transform(self, X):
        X = np.array(X)
        for idx in self.cols_to_transform_idx:
            if idx < X.shape[1]:
                X[:, idx] = np.clip(X[:, idx], None, self.quantiles_.get(idx, np.inf))
                X[:, idx] = np.log1p(X[:, idx])
        return X

In [5]:
# --- Block 4: Define Features and Preprocessor ---
numerical_features = [
    'Age at Diagnosis', 'Neoplasm Histologic Grade', 'Tumor Stage', 'Lymph nodes examined positive',
    'Mutation Count', 'Nottingham prognostic index', 'Relapse Free Status (Months)',
    'Cohort', 'Number of Samples Per Patient', 'TMB (nonsynonymous)', 'Tumor Size'
]
categorical_features = [
    col for col in X.select_dtypes(include='object').columns.tolist()
    if col not in ['Overall Survival Status', 'Survival_Clinical', 'Overall Survival (Months)']
]

# Numerical Pipeline
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('custom_transform', CustomNumericalTransformer()),
    ('scaler', StandardScaler())
])

# Categorical Pipeline
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Full Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ],
    remainder='drop'
)

In [6]:
# --- Block 5: Unified Evaluation Function ---
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
positive_class_label = label_encoder.inverse_transform([1])[0]

def evaluate_model(model_pipeline, name, X_train, y_train, X_test, y_test, positive_class_label, strategy_name):
    print(f"\n--- Training and Evaluating: {name} ({strategy_name}) ---")
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    
    # Handle probability prediction
    y_prob = None
    try:
        preprocessor = model_pipeline.named_steps.get('preprocessor', None)
        if preprocessor:
            X_test_transformed = preprocessor.transform(X_test)
        else:
            X_test_transformed = X_test
        final_estimator = model_pipeline.named_steps.get('classifier', model_pipeline)
        if hasattr(final_estimator, 'predict_proba'):
            y_prob = final_estimator.predict_proba(X_test_transformed)[:, 1]
        elif hasattr(final_estimator, 'decision_function'):
            y_prob = final_estimator.decision_function(X_test_transformed)
    except Exception as e:
        print(f"Warning: Probability prediction failed for {name}. Error: {e}")

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, pos_label=positive_class_label, zero_division=0)
    recall = recall_score(y_test, y_pred, pos_label=positive_class_label, zero_division=0)
    f1 = f1_score(y_test, y_pred, pos_label=positive_class_label, zero_division=0)
    roc_auc = roc_auc_score(label_encoder.transform(y_test), y_prob) if y_prob is not None else np.nan
    cm = confusion_matrix(y_test, y_pred, labels=label_encoder.classes_)
    false_negatives = cm[1, 0]

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print(f"False Negatives: {false_negatives}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return {
        'Model': name,
        'Strategy': strategy_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC AUC': roc_auc,
        'False Negatives': false_negatives
    }

In [7]:
# --- Block 6: Baseline + Resampling Pipelines ---
base_models = {
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
}

pipelines = {}
for name, model in base_models.items():
    pipelines[f"{name}_Baseline"] = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipelines[f"{name}_SMOTE"] = ImbPipeline([
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('classifier', model)
    ])


In [8]:
# --- Block 7: Run Evaluation ---
results = []
for name, pipeline in pipelines.items():
    model_name, strategy = name.split('_', 1)
    result = evaluate_model(
        pipeline, model_name, X_train, y_train, X_test, y_test, positive_class_label, strategy
    )
    results.append(result)

# --- Block 8: Final Summary ---
final_summary_df = pd.DataFrame(results).set_index(['Strategy', 'Model'])


--- Training and Evaluating: Logistic Regression (Baseline) ---
Accuracy: 0.9975
Precision: 1.0000
Recall: 0.9956
F1-Score: 0.9978
ROC AUC Score: 0.9969
False Negatives: 1

Classification Report:
              precision    recall  f1-score   support

    0:LIVING       0.99      1.00      1.00       168
  1:DECEASED       1.00      1.00      1.00       229

    accuracy                           1.00       397
   macro avg       1.00      1.00      1.00       397
weighted avg       1.00      1.00      1.00       397


--- Training and Evaluating: Logistic Regression (SMOTE) ---
Accuracy: 0.9975
Precision: 1.0000
Recall: 0.9956
F1-Score: 0.9978
ROC AUC Score: 0.9971
False Negatives: 1

Classification Report:
              precision    recall  f1-score   support

    0:LIVING       0.99      1.00      1.00       168
  1:DECEASED       1.00      1.00      1.00       229

    accuracy                           1.00       397
   macro avg       1.00      1.00      1.00       397
weighted a

In [9]:
final_summary_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,Precision,Recall,F1-Score,ROC AUC,False Negatives
Strategy,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Baseline,Logistic Regression,0.997481,1.0,0.995633,0.997812,0.996933,1
SMOTE,Logistic Regression,0.997481,1.0,0.995633,0.997812,0.997089,1
