In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [10]:
#1. Load Dataset
df = pd.read_csv("Breast_Cancer.csv")

In [11]:
#2. printing from the dataset
print(df.head())

   Age   Race Marital Status T Stage  N Stage 6th Stage  \
0   68  White        Married       T1      N1       IIA   
1   50  White        Married       T2      N2      IIIA   
2   58  White       Divorced       T3      N3      IIIC   
3   58  White        Married       T1      N1       IIA   
4   47  White        Married       T2      N1       IIB   

               differentiate Grade   A Stage  Tumor Size Estrogen Status  \
0      Poorly differentiated     3  Regional           4        Positive   
1  Moderately differentiated     2  Regional          35        Positive   
2  Moderately differentiated     2  Regional          63        Positive   
3      Poorly differentiated     3  Regional          18        Positive   
4      Poorly differentiated     3  Regional          41        Positive   

  Progesterone Status  Regional Node Examined  Reginol Node Positive  \
0            Positive                      24                      1   
1            Positive                      1

In [12]:
#3. checking for any missing values
print("\nMissing values per column:\n", df.isnull().sum())


Missing values per column:
 Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64


In [16]:
# 4. Outlier handling
numeric_outlier_cols = ["Age", "Tumor Size",
                        "Regional Node Examined",
                        "Reginol Node Positive",
                        "Survival Months"]

for col in numeric_outlier_cols:
    if col in df.columns:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Clip values outside [lower, upper] to reduce undue influence of extreme values
        df[col] = df[col].clip(lower=lower, upper=upper)

In [17]:
# 5. Define Features and Target
y = df["Status"].map({"Alive": 0, "Dead": 1})  # binary target
X = df.drop(columns=["Status"])

numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()


In [18]:
# 6. Preprocessing Pipelines
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [19]:
# 7. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [21]:
# 8. Decision Tree + Grid Search
dt_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=42))
    ]
)

dt_param_grid = {
    "clf__criterion": ["gini", "entropy"],
    "clf__max_depth": [None, 5, 10, 20],
    "clf__min_samples_split": [2, 5, 10]
}

dt_grid = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)
dt_grid.fit(X_train, y_train)

dt_best = dt_grid.best_estimator_
y_pred_dt = dt_best.predict(X_test)

print("\n=== Decision Tree ===")
print("Accuracy :", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall   :", recall_score(y_test, y_pred_dt))
print("F1-score :", f1_score(y_test, y_pred_dt))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification report (DT):\n",
      classification_report(y_test, y_pred_dt, target_names=["Alive", "Dead"]))



=== Decision Tree ===
Accuracy : 0.8881987577639752
Precision: 0.7323943661971831
Recall   : 0.42276422764227645
F1-score : 0.5360824742268041
Confusion matrix:
 [[663  19]
 [ 71  52]]

Classification report (DT):
               precision    recall  f1-score   support

       Alive       0.90      0.97      0.94       682
        Dead       0.73      0.42      0.54       123

    accuracy                           0.89       805
   macro avg       0.82      0.70      0.74       805
weighted avg       0.88      0.89      0.88       805



In [23]:
# 9. KNN + Grid Search
knn_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", KNeighborsClassifier())
    ]
)

knn_param_grid = {
    "clf__n_neighbors": [3, 5, 7, 9],
    "clf__metric": ["minkowski", "euclidean"]
}

knn_grid = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=knn_param_grid,
    cv=5,
    scoring="f1",
    n_jobs=-1
)
knn_grid.fit(X_train, y_train)

knn_best = knn_grid.best_estimator_
y_pred_knn = knn_best.predict(X_test)


print("\n=== KNN ===")
print("Accuracy :", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn))
print("Recall   :", recall_score(y_test, y_pred_knn))
print("F1-score :", f1_score(y_test, y_pred_knn))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification report (KNN):\n",
      classification_report(y_test, y_pred_knn, target_names=["Alive", "Dead"]))


=== KNN ===
Accuracy : 0.8720496894409938
Precision: 0.6190476190476191
Recall   : 0.42276422764227645
F1-score : 0.5024154589371981
Confusion matrix:
 [[650  32]
 [ 71  52]]

Classification report (KNN):
               precision    recall  f1-score   support

       Alive       0.90      0.95      0.93       682
        Dead       0.62      0.42      0.50       123

    accuracy                           0.87       805
   macro avg       0.76      0.69      0.71       805
weighted avg       0.86      0.87      0.86       805

