In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt


In [2]:
pd.read_csv("combined_breast_cancer_validated.csv").head()  #load the combined datasets


Unnamed: 0,symmetry_mean,target,compactness_mean,A Stage,perimeter_worst,smoothness_se,Progesterone Status,radius_se,Survival Months,compactness_worst,...,Tumor Size,fractal_dimension_se,smoothness_mean,compactness_se,concavity_se,area_mean,symmetry_worst,texture_mean,area_worst,source
0,,0,,Regional,,,Positive,,60.0,,...,4.0,,,,,,,,,clinical
1,,0,,Regional,,,Positive,,62.0,,...,35.0,,,,,,,,,clinical
2,,0,,Regional,,,Positive,,75.0,,...,63.0,,,,,,,,,clinical
3,,0,,Regional,,,Positive,,84.0,,...,18.0,,,,,,,,,clinical
4,,0,,Regional,,,Positive,,50.0,,...,41.0,,,,,,,,,clinical


In [3]:
if 'Unnamed: 32' in combined.columns:     #drop junk columns
    combined = combined.drop(columns=['Unnamed: 32'])


NameError: name 'combined' is not defined

In [None]:
X = combined.drop(columns=['target']) #separate features and target
y = combined['target']

In [None]:
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist() #identify column types
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()


In [None]:
from sklearn.impute import SimpleImputer   #mode imputation and one-hot encoding

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)


In [None]:
models = {                                   #define models to compare
    "Logistic Regression": LogisticRegression(max_iter=2000),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier()
}


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
results = {}          #train and evaluate each model

for name, model in models.items():
    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else None
    
    results[name] = {
        "accuracy": accuracy_score(y_test, y_pred),
        "report": classification_report(y_test, y_pred),
        "confusion": confusion_matrix(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob) if y_prob is not None else None
    }

    print(f"\n===== {name} =====")
    print("Accuracy:", results[name]["accuracy"])
    print("ROC AUC:", results[name]["roc_auc"])
    print(results[name]["report"])


In [None]:
best_model = Pipeline(steps=[        #plot ROC curve for best model(RandomForest)
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=300, random_state=42))
])

best_model.fit(X_train, y_train)

RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.title("ROC Curve - Best Model")
plt.show()


In [None]:
import joblib
joblib.dump(best_model, "breast_cancer_unified_model.pkl")


In [None]:
import seaborn as sns           #distribution of the target variable
import matplotlib.pyplot as plt

sns.countplot(data=combined, x='target')
plt.title("Target Distribution (Benign vs Malignant)")
plt.xlabel("Target (0 = Benign, 1 = Malignant)")
plt.ylabel("Count")
plt.show()


In [None]:
plt.figure(figsize=(12,6))
sns.heatmap(combined.isna(), cbar=False)
plt.title("Missing Values Heatmap Across Combined Dataset")
plt.show()


In [None]:
diagnostic_only = combined[[col for col in combined.columns if 'mean' in col or 'worst' in col or 'se' in col]]
plt.figure(figsize=(14,10))
sns.heatmap(diagnostic_only.corr(), cmap='coolwarm')
plt.title("Correlation Heatmap of Diagnostic Features")
plt.show()


In [None]:
key_features = ['radius_mean', 'texture_mean', 'area_mean', 'smoothness_mean']
combined[key_features].plot(kind='box', figsize=(10,6))
plt.title("Distribution of Selected Diagnostic Features")
plt.show()


In [None]:
# Fit the best model again
best_model.fit(X_train, y_train)

# Extract feature names after preprocessing
encoded_features = best_model.named_steps['preprocessor'].transformers_[1][1] \
    .named_steps['encoder'].get_feature_names_out(categorical_cols)

all_feature_names = numeric_cols + list(encoded_features)

# Extract importances
importances = best_model.named_steps['model'].feature_importances_

# Create dataframe
fi_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False).head(20)

# Plot
plt.figure(figsize=(10,6))
sns.barplot(data=fi_df, x='importance', y='feature')
plt.title("Top 20 Most Important Features (Random Forest)")
plt.show()


In [1]:
gb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", GradientBoostingClassifier())
])

gb_model.fit(X_train, y_train)

importances = gb_model.named_steps['model'].feature_importances_

fi_df = pd.DataFrame({
    'feature': all_feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False).head(20)

plt.figure(figsize=(10,6))
sns.barplot(data=fi_df, x='importance', y='feature')
plt.title("Top 20 Most Important Features (Gradient Boosting)")
plt.show()


NameError: name 'Pipeline' is not defined