In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [15]:
def split_by_indices(X_data, y_data, idx_train, idx_val, idx_test):
    X_train = X_data.iloc[idx_train]
    X_val   = X_data.iloc[idx_val]
    X_test  = X_data.iloc[idx_test]
    y_train = y_data.iloc[idx_train]
    y_val   = y_data.iloc[idx_val]
    y_test  = y_data.iloc[idx_test]
    return X_train, X_val, X_test, y_train, y_val, y_test
def train_and_evaluate_clf(model_name, clf, X_train, y_train, X_val, y_val, X_test, y_test):
    X_train_final = pd.concat([X_train, X_val])
    y_train_final = pd.concat([y_train, y_val])
    clf.fit(X_train_final, y_train_final)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(f"=== {model_name} ===")
    print(f"Test Accuracy: {acc:.4f}")
    print("Confusion Matrix (rows = true, cols = predicted):")
    print(cm, "\n")
    return acc, cm
def tune_k_knn(X_train, y_train):
    pipe_knn = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier())])
    param_grid = {"knn__n_neighbors": [3, 5, 7, 9, 11]}
    grid_search = GridSearchCV(estimator=pipe_knn, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best k for KNN:", grid_search.best_params_["knn__n_neighbors"])
    print("Best CV accuracy for KNN:", grid_search.best_score_, "\n")
    return grid_search.best_params_["knn__n_neighbors"]
def get_models(best_k):
    models = {}
    models["KNN"] = Pipeline(steps=[("scaler", StandardScaler()), ("knn", KNeighborsClassifier(n_neighbors=best_k))])
    models["RandomForest"] = Pipeline(steps=[("scaler", StandardScaler()), ("rf", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1))])
    models["SVC"] = Pipeline(steps=[("scaler", StandardScaler()), ("svc", SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42))])
    return models

In [4]:
df = pd.read_csv("datasets/data_refined.csv")
print("Data shape:", df.shape)
print("Columns:", df.columns.tolist(), "\n")
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

Data shape: (569, 10)
Columns: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'symmetry_mean', 'fractal_dimension_mean', 'diagnosis'] 



In [7]:
y_raw = df["diagnosis"]
X = df.drop(columns=["diagnosis"])
y = y_raw
print("Target distribution:")
print(y.value_counts(), "\n")

Target distribution:
diagnosis
0    357
1    212
Name: count, dtype: int64 



In [8]:
numeric_features = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_features]
correlations = X_numeric.corrwith(y)
corr_sorted = correlations.abs().sort_values(ascending=False)
print("Correlation with target (sorted by |corr|):")
print(corr_sorted, "\n")
threshold = 0.3
important_features = corr_sorted[corr_sorted >= threshold].index.tolist()
print(f"Selected important features (|corr| >= {threshold}):")
print(important_features, "\n")
X_full = X_numeric.copy()
X_reduced_corr = X_numeric[important_features].copy()
print("Full feature set shape:", X_full.shape)
print("Reduced (correlation-based) feature set shape:", X_reduced_corr.shape, "\n")

Correlation with target (sorted by |corr|):
perimeter_mean            0.742636
radius_mean               0.730029
area_mean                 0.708984
concavity_mean            0.696360
compactness_mean          0.596534
texture_mean              0.415185
smoothness_mean           0.358560
symmetry_mean             0.330499
fractal_dimension_mean    0.012838
dtype: float64 

Selected important features (|corr| >= 0.3):
['perimeter_mean', 'radius_mean', 'area_mean', 'concavity_mean', 'compactness_mean', 'texture_mean', 'smoothness_mean', 'symmetry_mean'] 

Full feature set shape: (569, 9)
Reduced (correlation-based) feature set shape: (569, 8) 



In [9]:
indices = np.arange(len(df))
idx_train_full, idx_temp = train_test_split(indices, test_size=0.2, random_state=42, stratify=y)
idx_val, idx_test = train_test_split(idx_temp, test_size=0.5, random_state=42, stratify=y.iloc[idx_temp])
Xf_train, Xf_val, Xf_test, yf_train, yf_val, yf_test = split_by_indices(X_full, y, idx_train_full, idx_val, idx_test)
Xr_train, Xr_val, Xr_test, yr_train, yr_val, yr_test = split_by_indices(X_reduced_corr, y, idx_train_full, idx_val, idx_test)
print("Full features - Train/Val/Test shapes:")
print(Xf_train.shape, Xf_val.shape, Xf_test.shape)
print("Reduced features - Train/Val/Test shapes:")
print(Xr_train.shape, Xr_val.shape, Xr_test.shape, "\n")

Full features - Train/Val/Test shapes:
(455, 9) (57, 9) (57, 9)
Reduced features - Train/Val/Test shapes:
(455, 8) (57, 8) (57, 8) 



In [16]:
print("Tuning KNN (full features)...")
best_k_full = tune_k_knn(Xf_train, yf_train)
print("Tuning KNN (reduced features)...")
best_k_reduced = tune_k_knn(Xr_train, yr_train)

Tuning KNN (full features)...
Best k for KNN: 11
Best CV accuracy for KNN: 0.9384615384615385 

Tuning KNN (reduced features)...
Best k for KNN: 9
Best CV accuracy for KNN: 0.945054945054945 



In [17]:
models_full = get_models(best_k_full)
models_reduced = get_models(best_k_reduced)

In [18]:
print("========== FULL FEATURE SET ==========\n")
results_full = {}
for name, clf in models_full.items():
    acc, cm = train_and_evaluate_clf(
        f"{name} (Full Features)",
        clf,
        Xf_train, yf_train,
        Xf_val,   yf_val,
        Xf_test,  yf_test
    )
    results_full[name] = acc


=== KNN (Full Features) ===
Test Accuracy: 0.9474
Confusion Matrix (rows = true, cols = predicted):
[[34  2]
 [ 1 20]] 

=== RandomForest (Full Features) ===
Test Accuracy: 0.9649
Confusion Matrix (rows = true, cols = predicted):
[[35  1]
 [ 1 20]] 

=== SVC (Full Features) ===
Test Accuracy: 1.0000
Confusion Matrix (rows = true, cols = predicted):
[[36  0]
 [ 0 21]] 



In [19]:
print("========== REDUCED FEATURE SET (Correlation) ==========\n")
results_reduced = {}
for name, clf in models_reduced.items():
    acc, cm = train_and_evaluate_clf(f"{name} (Reduced - Corr)", clf, Xr_train, yr_train, Xr_val,   yr_val, Xr_test,  yr_test)
    results_reduced[name] = acc
print("Accuracy summary (Full vs Reduced by correlation):")
for name in models_full.keys():
    print(f"{name:12s} - Full: {results_full[name]:.4f} | Reduced: {results_reduced[name]:.4f}")


=== KNN (Reduced - Corr) ===
Test Accuracy: 0.9649
Confusion Matrix (rows = true, cols = predicted):
[[35  1]
 [ 1 20]] 

=== RandomForest (Reduced - Corr) ===
Test Accuracy: 0.9474
Confusion Matrix (rows = true, cols = predicted):
[[35  1]
 [ 2 19]] 

=== SVC (Reduced - Corr) ===
Test Accuracy: 1.0000
Confusion Matrix (rows = true, cols = predicted):
[[36  0]
 [ 0 21]] 

Accuracy summary (Full vs Reduced by correlation):
KNN          - Full: 0.9474 | Reduced: 0.9649
RandomForest - Full: 0.9649 | Reduced: 0.9474
SVC          - Full: 1.0000 | Reduced: 1.0000
