In [1]:
# 1. Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# 2. Load and prepare the dataset
# One seed shared by every stochastic step in this cell, so that a fresh
# Restart & Run All reproduces the same split and the same baseline forest.
RANDOM_STATE = 42

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Train-test split: 20% of the rows are held out for testing.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# balance of the held-out set matters — left unchanged to keep the same split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Scaling: the scaler is fitted on the training rows only and the same
# fitted transform is then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Define models
# The random forest is seeded explicitly: left unseeded, its baseline scores
# change from run to run, which makes the comparison tables irreproducible.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "SVM": SVC()
}

# 4. Evaluate base models
def evaluate_model(model, X_test, y_test):
    """Score an already-fitted model on a held-out set.

    Parameters
    ----------
    model : object exposing ``predict(X)``
        The fitted estimator to evaluate.
    X_test : feature matrix passed unchanged to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    dict
        Maps "Accuracy", "Precision", "Recall" and "F1-Score" (in that
        order) to the value of the corresponding scikit-learn metric,
        each called with its default arguments.
    """
    predictions = model.predict(X_test)

    # (label, scoring function) pairs; the order fixes the column order of
    # any table built from the returned dict.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

# Fit every base model on the scaled training data, then score it on the
# scaled test data. `fit` returns the estimator itself, so the fitted model
# is handed straight to `evaluate_model`.
results = {}
for name, model in models.items():
    metrics = evaluate_model(model.fit(X_train_scaled, y_train), X_test_scaled, y_test)
    results[name] = metrics

# Display initial evaluation: one row per model, one column per metric
print("Base Model Performance:")
pd.DataFrame(results).transpose()


Base Model Performance:


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Logistic Regression,0.973684,0.972222,0.985915,0.979021
Random Forest,0.95614,0.958333,0.971831,0.965035
SVM,0.982456,0.972603,1.0,0.986111


In [2]:
# Hyperparameter grid for the random forest: 3 x 3 x 3 = 27 candidates,
# each scored with 5-fold cross-validation on the training data.
param_grid_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# The forest is seeded (same value as the train/test split seed) so that the
# cross-validation scores, the selected parameters and the refitted best
# estimator are identical on every run. Left unseeded, `best_params_` can
# change between runs purely because of the forest's internal randomness.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)
grid_rf.fit(X_train_scaled, y_train)
print("Best RF Params:", grid_rf.best_params_)

# Best candidate, kept for evaluation on the held-out test set.
rf_best = grid_rf.best_estimator_


Best RF Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 150}


In [3]:
# Candidate values for the SVM search: 4 x 2 x 2 = 16 possible combinations,
# of which the randomized search below samples 10.
param_dist_svc = dict(
    C=[0.1, 1, 10, 100],
    gamma=['scale', 'auto'],
    kernel=['rbf', 'linear'],
)

# Seeded randomized search with 5-fold cross-validation, ranked by F1.
# `fit` returns the search object itself, so construction and fitting chain.
random_svc = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist_svc,
    n_iter=10,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

print("Best SVC Params:", random_svc.best_params_)

# Best candidate, kept for evaluation on the held-out test set.
svc_best = random_svc.best_estimator_


Best SVC Params: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}


In [4]:
# Score both tuned estimators on the same scaled test set that was used for
# the base models, so the two tables are directly comparable.
tuned_results = {
    label: evaluate_model(estimator, X_test_scaled, y_test)
    for label, estimator in (
        ("Tuned Random Forest", rf_best),
        ("Tuned SVM", svc_best),
    )
}

print("Tuned Model Performance:")
pd.DataFrame(tuned_results).transpose()


Tuned Model Performance:


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Tuned Random Forest,0.964912,0.958904,0.985915,0.972222
Tuned SVM,0.982456,0.972603,1.0,0.986111


In [5]:
# Merge the base and tuned scores into a single table (base models first;
# a tuned entry would overwrite a base entry that shared its name).
combined_results = dict(results)
combined_results.update(tuned_results)
df_results = pd.DataFrame(combined_results).transpose()

# Pick the row with the highest F1-Score. When several rows tie, the first
# one in table order is reported.
f1_scores = df_results['F1-Score']
best_model_name = f1_scores.idxmax()
print(f"\n✅ Best Model based on F1-Score: {best_model_name}")
df_results



✅ Best Model based on F1-Score: SVM


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Logistic Regression,0.973684,0.972222,0.985915,0.979021
Random Forest,0.95614,0.958333,0.971831,0.965035
SVM,0.982456,0.972603,1.0,0.986111
Tuned Random Forest,0.964912,0.958904,0.985915,0.972222
Tuned SVM,0.982456,0.972603,1.0,0.986111
