In [1]:
# %% [markdown]
# ## Breast Cancer Survival Prediction Project

# %% [markdown]
# ### 1. Environment Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lifelines import KaplanMeierFitter, CoxPHFitter
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, classification_report, RocCurveDisplay
from imblearn.over_sampling import SMOTE
import warnings
# NOTE(review): with no category/message/module filter this ignores every
# warning raised from this point on (unless a later cell resets the filters),
# which also hides deprecation warnings from the libraries imported above.
# Consider narrowing it to the specific categories that are known noise.
warnings.filterwarnings('ignore')



In [4]:
# Load dataset
# Read the 'Sheet1' worksheet of 'METABRIC_Data.xlsx' into the working frame.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_excel(
    io='METABRIC_Data.xlsx',
    sheet_name='Sheet1',
)


In [5]:
# Clean column names
# Normalise every header in three steps: trim surrounding whitespace,
# lower-case the text, then replace each space with an underscore.
# Other punctuation (parentheses, '+', '-', apostrophes) is left as-is.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
)

In [13]:
# Print the column labels so the normalised names can be checked by eye.
print(df.columns)

Index(['patient_id', 'age_at_diagnosis', 'type_of_breast_surgery',
       'cancer_type', 'cancer_type_detailed', 'cellularity', 'chemotherapy',
       'pam50_+_claudin-low_subtype', 'cohort', 'er_status_measured_by_ihc',
       'er_status', 'neoplasm_histologic_grade',
       'her2_status_measured_by_snp6', 'her2_status',
       'tumor_other_histologic_subtype', 'hormone_therapy',
       'inferred_menopausal_state', 'integrative_cluster',
       'primary_tumor_laterality', 'lymph_nodes_examined_positive',
       'mutation_count', 'nottingham_prognostic_index', 'oncotree_code',
       'overall_survival_(months)', 'overall_survival_status', 'pr_status',
       'radio_therapy', 'relapse_free_status_(months)', 'relapse_free_status',
       'sex', '3-gene_classifier_subtype', 'tumor_size', 'tumor_stage',
       'patient's_vital_status'],
      dtype='object')


In [15]:
# Handle missing values
# Rows missing either survival field cannot be labelled, so they are dropped first.
df = df.dropna(subset=['overall_survival_(months)', 'overall_survival_status'])

# %% [markdown]
# ### 3. Target Variable Engineering
# Prediction horizon in months (120 months = 10 years); both rules below use it.
HORIZON_MONTHS = 120

# Handle right-censored data
# Patients recorded as 'Living' with follow-up shorter than the horizon have no
# known 10-year outcome, so they are excluded instead of being labelled 0.
# They are removed with a boolean mask *before* the target is built: writing NaN
# into an integer column and dropping it again relies on implicit int -> float
# upcasting on assignment, which recent pandas versions deprecate, and it leaves
# the target stored as floats.
is_censored = (
    (df['overall_survival_(months)'] < HORIZON_MONTHS)
    & (df['overall_survival_status'] == 'Living')
)
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the unfiltered frame.
df = df.loc[~is_censored].copy()

# Create 10-year mortality target
# 1 = recorded as 'Deceased' with survival time at or below the horizon;
# 0 = every other remaining row.
# The column holds integer class labels (the earlier NaN round-trip produced floats).
died_within_horizon = (
    (df['overall_survival_(months)'] <= HORIZON_MONTHS)
    & (df['overall_survival_status'] == 'Deceased')
)
df['10_year_mortality'] = died_within_horizon.astype(int)
