# In [None]:
# breast_cancer_prediction.ipynb
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
import joblib

# Read the METABRIC clinical table from the 'Sheet1' worksheet of the workbook.
# The path is relative, so the workbook is looked up in the current working directory.
df = pd.read_excel(io='METABRIC_Data.xlsx', sheet_name='Sheet1')

# Outcome column the model is trained to predict (survival status).
target = 'Overall Survival Status'

# Predictor columns, selected for clinical relevance. The list order is the
# column order of the feature matrix built from it.
features = [
    # patient and treatment
    'Age at Diagnosis', 'Type of Breast Surgery',
    # tumour characteristics and receptor status
    'Cellularity', 'ER Status', 'HER2 Status',
    # size, staging and grading
    'Tumor Size', 'Tumor Stage', 'Neoplasm Histologic Grade',
    # nodal involvement and prognostic score
    'Lymph nodes examined positive', 'Nottingham prognostic index',
]

# Preprocessing pipeline
# Each selected column is assigned to exactly one of two groups; the group
# decides which preprocessing branch handles it.

# Columns treated as continuous measurements.
numeric_features = [
    'Age at Diagnosis',
    'Tumor Size',
    'Lymph nodes examined positive',
    'Nottingham prognostic index',
]

# Columns treated as discrete categories (stage and grade included).
categorical_features = [
    'Type of Breast Surgery',
    'Cellularity',
    'ER Status',
    'HER2 Status',
    'Tumor Stage',
    'Neoplasm Histologic Grade',
]

# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' lets categories that were not seen
# during fitting pass through transform without raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# End-to-end model: preprocessing followed by a random forest, so the
# preprocessing is fitted together with the classifier.
# class_weight='balanced' reweights the two outcome classes, and the fixed
# random_state keeps the forest reproducible between runs.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42)),
])

# Prepare data
# Rows with no recorded survival status have no label. Mapping them to 0 would
# train and score the model on invented "survivor" outcomes, so they are dropped
# first and both X and y are taken from the same remaining rows.
labelled = df.dropna(subset=[target])
X = labelled[features]
# Binary target: 1 for 'Deceased', 0 for any other recorded status.
y = (labelled[target] == 'Deceased').astype('int64')

# Train-test split
# Hold out 20% of the labelled rows for evaluation. Stratifying on y keeps the
# class proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Fit the preprocessing steps and the classifier on the training portion only.
pipeline.fit(X_train, y_train)

# Score the held-out rows. The second predict_proba column is the probability
# of class 1 ('Deceased'), which is what the ROC AUC is computed from.
y_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred = pipeline.predict(X_test)

# Report headline metrics, then the per-class breakdown.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_proba):.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Persist the fitted pipeline (preprocessing + classifier) as a single file in
# the current working directory so it can be reloaded for prediction later.
joblib.dump(value=pipeline, filename='breast_cancer_survival_predictor.pkl')
