In [47]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Single seed reused by the train/test split, the estimators, and the CV splitter
# in the cells below, so that repeated runs use the same random configuration.
RANDOM_STATE = 42
# Text file that receives the per-setting evaluation blocks written by write_block().
RESULTS_PATH = Path("results.txt")


In [63]:
# Load the dataset from the notebook's working directory and take a first look.
df = pd.read_csv("breast-cancer.csv")
print(df.shape)
df.head()  # last expression of the cell -> rendered as the cell output


(569, 32)


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,84232,M,17.99,1.38,122.8,11.0,0.1184,0.2776,0.31,0.1471,...,25.38,17.33,184.6,219.0,0.1622,0.6656,0.7119,0.2654,0.461,0.1189
1,842517,M,2.57,17.77,132.9,1326.0,0.8474,0.7864,0.869,0.717,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.892
2,84393,M,19.69,21.25,13.0,123.0,0.196,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,179.0,0.1444,0.4245,0.454,0.243,0.3613,0.8758
3,8434831,M,11.42,2.38,77.58,386.1,0.1425,0.2839,0.2414,0.152,...,14.91,26.5,98.87,567.7,0.298,0.8663,0.6869,0.2575,0.6638,0.173
4,8435842,M,2.29,14.34,135.1,1297.0,0.13,0.1328,0.198,0.143,...,22.54,16.67,152.2,1575.0,0.1374,0.25,0.4,0.1625,0.2364,0.7678


In [66]:
# --- Locate the label column ------------------------------------------------
# Use the first column whose name matches a known label name; if none matches,
# fall back to the last column of the frame.
possible_targets = ["Class", "class", "Diagnosis", "diagnosis", "target", "Target"]
target_column = next((c for c in df.columns if c in possible_targets), None)
if target_column is None:
    target_column = df.columns[-1]

# --- Coerce every feature column to numeric ---------------------------------
# "?" is treated as a missing-value marker; anything else that cannot be parsed
# as a number also becomes NaN (errors="coerce").
df = df.replace("?", np.nan)
for col in df.columns:
    if col != target_column:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# --- Split into label and feature matrix ------------------------------------
y_raw = df[target_column]
# Fix: the original dropped `target_col`, a name that is never defined in this
# notebook, so the cell raised NameError on a fresh kernel (Restart & Run All).
X = df.drop(columns=[target_column])

# Keep numeric columns only, impute remaining gaps with the column median,
# and discard columns that carry a single distinct value.
X = X.select_dtypes(include=[np.number])
X = X.fillna(X.median(numeric_only=True))
nunique = X.nunique()
X = X.loc[:, nunique[nunique > 1].index]
# NOTE(review): identifier-like columns (the displayed frame contains `id`) are
# still part of X at this point — confirm whether they should be used as features.

# --- Encode the label as integers -------------------------------------------
if y_raw.dtype == object:
    # String labels -> integer codes (LabelEncoder assigns codes in sorted label order).
    y = LabelEncoder().fit_transform(y_raw)
else:
    uniq = sorted(pd.Series(y_raw).dropna().unique().tolist())
    if set(uniq) == {2, 4}:
        # Labels coded exactly as {2, 4}: map 4 -> 1 and 2 -> 0.
        y = (y_raw == 4).astype(int).values
    else:
        y = y_raw.values

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Class distribution:", pd.Series(y).value_counts().to_dict())
X.head()


X shape: (569, 31)
y shape: (569,)
Class distribution: {0: 357, 1: 212}


Unnamed: 0,id,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,84232,17.99,1.38,122.8,11.0,0.1184,0.2776,0.31,0.1471,0.2419,...,25.38,17.33,184.6,219.0,0.1622,0.6656,0.7119,0.2654,0.461,0.1189
1,842517,2.57,17.77,132.9,1326.0,0.8474,0.7864,0.869,0.717,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.892
2,84393,19.69,21.25,13.0,123.0,0.196,0.1599,0.1974,0.1279,0.269,...,23.57,25.53,152.5,179.0,0.1444,0.4245,0.454,0.243,0.3613,0.8758
3,8434831,11.42,2.38,77.58,386.1,0.1425,0.2839,0.2414,0.152,0.2597,...,14.91,26.5,98.87,567.7,0.298,0.8663,0.6869,0.2575,0.6638,0.173
4,8435842,2.29,14.34,135.1,1297.0,0.13,0.1328,0.198,0.143,0.189,...,22.54,16.67,152.2,1575.0,0.1374,0.25,0.4,0.1625,0.2364,0.7678


In [78]:
def train_and_evaluate(X_data, y_data, title="Model", model=None, test_size=0.2):
    """Fit a classifier on a stratified hold-out split and report test metrics.

    Parameters
    ----------
    X_data : array-like
        Feature matrix handed to ``train_test_split``.
    y_data : array-like
        Labels; also used as the stratification key for the split.
    title : str
        Label printed in front of the accuracy line.
    model : estimator or None
        Object exposing ``fit`` / ``predict``. When ``None``, a 300-tree
        ``RandomForestClassifier`` seeded with ``RANDOM_STATE`` is used.
    test_size : float
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(accuracy, classification_report_text, confusion_matrix)`` computed
        on the held-out part of the split.
    """
    estimator = model
    if estimator is None:
        estimator = RandomForestClassifier(
            n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1
        )

    # Same seed on every call, so identical inputs give an identical split.
    split = train_test_split(
        X_data,
        y_data,
        test_size=test_size,
        random_state=RANDOM_STATE,
        stratify=y_data,
    )
    X_train, X_test, y_train, y_test = split

    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    results = (
        accuracy_score(y_test, y_pred),
        classification_report(y_test, y_pred, digits=4),
        confusion_matrix(y_test, y_pred),
    )
    acc, report, cm = results

    print(f"{title} | Accuracy: {acc:.4f}")
    print("\nClassification report:\n", report)
    print("Confusion matrix:\n", cm)
    return results


In [69]:
# Baseline: random forest (train_and_evaluate's default model) on all standardized features.
# NOTE(review): the scaler is fit on every row before train_and_evaluate() makes
# its train/test split, so test-row statistics are part of the scaling.
X_all_std = StandardScaler().fit_transform(X)
baseline_acc, baseline_report, baseline_cm = train_and_evaluate(X_all_std, y, "Baseline RF (All features)")


Baseline RF (All features) | Accuracy: 0.9474

Classification report:
               precision    recall  f1-score   support

           0     0.9231    1.0000    0.9600        72
           1     1.0000    0.8571    0.9231        42

    accuracy                         0.9474       114
   macro avg     0.9615    0.9286    0.9415       114
weighted avg     0.9514    0.9474    0.9464       114

Confusion matrix:
 [[72  0]
 [ 6 36]]


In [70]:
# Filter method: rank features with the chi-squared statistic and keep the top k.
# Features are min-max scaled first — presumably because chi2 expects
# non-negative inputs (confirm against the scikit-learn documentation).
X_pos = MinMaxScaler().fit_transform(X)

# `k` is also read by the RFE and L1 cells further down.
k = min(10, X.shape[1])  # choose up to 10 features (or fewer if not available)
chi = SelectKBest(score_func=chi2, k=k)
# NOTE(review): the selector is fit on all rows and labels, i.e. before the
# hold-out split inside train_and_evaluate(); the reported accuracy may be optimistic.
X_chi = chi.fit_transform(X_pos, y)
# Map the selected column positions back to the column names of X.
chi_feature_idx = chi.get_support(indices=True)
chi_feature_names = X.columns[chi_feature_idx].tolist()

print("FILTER (Chi2) selected features:", chi_feature_names)
chi_acc, chi_report, chi_cm = train_and_evaluate(X_chi, y, "RF on FILTER (Chi2) features")


FILTER (Chi2) selected features: ['area_mean', 'smoothness_mean', 'compactness_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'concave points_se', 'area_worst', 'concave points_worst']
RF on FILTER (Chi2) features | Accuracy: 0.9298

Classification report:
               precision    recall  f1-score   support

           0     0.9444    0.9444    0.9444        72
           1     0.9048    0.9048    0.9048        42

    accuracy                         0.9298       114
   macro avg     0.9246    0.9246    0.9246       114
weighted avg     0.9298    0.9298    0.9298       114

Confusion matrix:
 [[68  4]
 [ 4 38]]


In [71]:
# Wrapper method: recursive feature elimination driven by an L2 logistic regression.
# X_std is reused by the L1 (embedded) cell that follows.
X_std = StandardScaler().fit_transform(X)
lr_base = LogisticRegression(max_iter=2000, penalty="l2", solver="liblinear", random_state=RANDOM_STATE)

# `k` comes from the Chi2 cell, so that cell must have been run first.
rfe = RFE(estimator=lr_base, n_features_to_select=k)
# NOTE(review): RFE is fit on all rows and labels, before the hold-out split
# performed inside train_and_evaluate().
X_rfe = rfe.fit_transform(X_std, y)
# Map the selected column positions back to the column names of X.
rfe_feature_idx = rfe.get_support(indices=True)
rfe_feature_names = X.columns[rfe_feature_idx].tolist()

print("WRAPPER (RFE) selected features:", rfe_feature_names)
rfe_acc, rfe_report, rfe_cm = train_and_evaluate(X_rfe, y, "RF on WRAPPER (RFE) features")


WRAPPER (RFE) selected features: ['radius_mean', 'texture_mean', 'concave points_mean', 'radius_se', 'perimeter_se', 'area_se', 'symmetry_se', 'radius_worst', 'area_worst', 'symmetry_worst']
RF on WRAPPER (RFE) features | Accuracy: 0.9474

Classification report:
               precision    recall  f1-score   support

           0     0.9342    0.9861    0.9595        72
           1     0.9737    0.8810    0.9250        42

    accuracy                         0.9474       114
   macro avg     0.9539    0.9335    0.9422       114
weighted avg     0.9488    0.9474    0.9468       114

Confusion matrix:
 [[71  1]
 [ 5 37]]


In [72]:

# Embedded method: fit an L1-penalized logistic regression and rank features
# by the absolute value of their coefficients.
# Relies on X_std (RFE cell) and k (Chi2 cell) already being defined.
l1 = LogisticRegression(max_iter=3000, penalty="l1", solver="liblinear", random_state=RANDOM_STATE)
# NOTE(review): fit on all rows and labels, before the hold-out split inside
# train_and_evaluate().
l1.fit(X_std, y)
coef_abs = np.abs(l1.coef_).ravel()
# Feature positions sorted by descending |coefficient|; the first k are kept.
# NOTE(review): the top k are taken regardless of magnitude, so features whose
# coefficient is exactly zero can be included if fewer than k are non-zero.
order = np.argsort(coef_abs)[::-1]
top_idx = order[:k]

X_l1 = X_std[:, top_idx]
l1_feature_names = X.columns[top_idx].tolist()

print("EMBEDDED (L1-LogReg) selected features:", l1_feature_names)
l1_acc, l1_report, l1_cm = train_and_evaluate(X_l1, y, "RF on EMBEDDED (L1) features")


EMBEDDED (L1-LogReg) selected features: ['symmetry_worst', 'radius_se', 'area_se', 'perimeter_se', 'radius_mean', 'radius_worst', 'area_worst', 'concave points_worst', 'symmetry_se', 'texture_mean']
RF on EMBEDDED (L1) features | Accuracy: 0.9386

Classification report:
               precision    recall  f1-score   support

           0     0.9333    0.9722    0.9524        72
           1     0.9487    0.8810    0.9136        42

    accuracy                         0.9386       114
   macro avg     0.9410    0.9266    0.9330       114
weighted avg     0.9390    0.9386    0.9381       114

Confusion matrix:
 [[70  2]
 [ 5 37]]


In [73]:
# Collect the hold-out accuracy of each setting, best first.
# Row 0 of this frame is what the final cell treats as the "best" setting.
summary = pd.DataFrame({
    "Setting": ["Baseline (All)", "Filter (Chi2)", "Wrapper (RFE)", "Embedded (L1)"],
    "Accuracy": [baseline_acc, chi_acc, rfe_acc, l1_acc]
}).sort_values("Accuracy", ascending=False).reset_index(drop=True)

summary


Unnamed: 0,Setting,Accuracy
0,Baseline (All),0.947368
1,Wrapper (RFE),0.947368
2,Embedded (L1),0.938596
3,Filter (Chi2),0.929825


In [79]:
# Start from a clean file: write_block() opens the file in append mode, so a
# results file left over from an earlier run would otherwise keep growing.
if RESULTS_PATH.exists():
    RESULTS_PATH.unlink()

def write_block(header, acc, report, cm, names=None):
    """Append one formatted evaluation section to ``RESULTS_PATH``.

    Parameters
    ----------
    header : str
        Title line of the section.
    acc : float
        Accuracy, written with four decimals.
    report : str
        Classification report text.
    cm : object
        Confusion matrix; written via ``str(cm)``.
    names : sequence or None
        Selected feature names. When given, a line with their count and
        values is written right after the header.
    """
    section = ["\n" + "="*70 + "\n", header + "\n"]
    if names is not None:
        section.append(f"Selected features ({len(names)}): {names}\n")
    section.extend([
        f"Accuracy: {acc:.4f}\n",
        "Classification report:\n",
        report + "\n",
        "Confusion matrix:\n",
        str(cm) + "\n",
    ])

    # Append mode: each call adds a section after whatever is already in the file.
    with open(RESULTS_PATH, "a", encoding="utf-8") as f:
        f.writelines(section)

# One entry per evaluated setting:
# (header, accuracy, report, confusion matrix, selected feature names or None).
# The baseline uses every feature, so it has no selected-feature list.
result_sections = [
    ("BASELINE: RandomForest on ALL standardized features",
     baseline_acc, baseline_report, baseline_cm, None),
    ("FILTER: RandomForest on Chi2-selected features",
     chi_acc, chi_report, chi_cm, chi_feature_names),
    ("WRAPPER: RandomForest on RFE-selected features",
     rfe_acc, rfe_report, rfe_cm, rfe_feature_names),
    ("EMBEDDED: RandomForest on L1-selected features",
     l1_acc, l1_report, l1_cm, l1_feature_names),
]

# Sections are appended to the results file in the order listed above.
for section_args in result_sections:
    write_block(*section_args)
print("Saved to results.txt")


Saved to results.txt


In [80]:

# Pick the top-ranked setting from the summary table and re-score it with
# 5-fold stratified cross-validation.
best = summary.iloc[0]["Setting"]
print("Best setting:", best)

# Map the setting label back to the feature matrix it was evaluated on;
# anything that is not Filter/Wrapper/Embedded falls back to all features.
if "Filter" in best:
    X_best = X_chi
elif "Wrapper" in best:
    X_best = X_rfe
elif "Embedded" in best:
    X_best = X_l1
else:
    X_best = X_all_std

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
rf = RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)
# Fix: the splitter built above was never handed to cross_val_score, so the
# shuffle and seed were ignored and the library default splitter was used.
# NOTE(review): X_best was scaled/selected using all rows, so each CV test fold
# has already influenced preprocessing; treat this score as optimistic.
scores = cross_val_score(rf, X_best, y, cv=cv, scoring="accuracy")
print("accuracy:", scores.mean().round(3))
      


Best setting: Baseline (All)
accuracy: 0.947
