In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import learning_curve, validation_curve, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Set plotting style
# NOTE: the 'seaborn-v0_8-*' style sheet names are the ones shipped with
# matplotlib >= 3.6 (older releases used 'seaborn-whitegrid') -- confirm the
# installed version if this line raises. The whitegrid style enables axes grid
# lines by default, so an argument-less `grid()` call toggles them off.
plt.style.use('seaborn-v0_8-whitegrid')
# Use the viridis palette as the default color cycle for subsequent plots
sns.set_palette('viridis')


In [None]:
# Load the breast-cancer dataset into a feature frame and a label series
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

# Hold out 25% of the rows for testing; stratify so both parts keep the
# same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Standardize features: statistics are learned from the training split only
# and then applied unchanged to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the overall data shape, class balance, and split sizes
summary_lines = [
    f"Dataset shape: {X.shape}",
    f"Number of classes: {len(np.unique(y))}",
    f"Class distribution:\n{y.value_counts()}",
    f"Training set shape: {X_train.shape}",
    f"Testing set shape: {X_test.shape}",
]
print("\n".join(summary_lines))


In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=5,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to ``learning_curve``.
    title : str
        Title of the first (learning curve) panel.
    X, y : array-like
        Training data and labels, passed unchanged to ``learning_curve``.
    axes : sequence of 3 matplotlib axes, optional
        Axes to draw on. When None, a new 1x3 figure (20x5 inches) is created.
    ylim : tuple, optional
        Unpacked into ``set_ylim`` on the first panel only.
    cv, n_jobs, train_sizes
        Forwarded to ``learning_curve``.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so callers can chain e.g. ``.show()``.
    """
    if axes is None:
        _, axes = plt.subplots(1, 3, figsize=(20, 5))

    axes[0].set_title(title)
    if ylim is not None:
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")
    axes[0].set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    # Aggregate across CV folds (axis=1): one mean/std per training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    # grid(True) rather than grid(): the argument-less form *toggles* the grid,
    # which switches it off when the active style already enables grid lines.
    axes[0].grid(True)
    axes[0].fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes[0].fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")

    # Plot n_samples vs fit_times
    axes[1].grid(True)
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].fill_between(train_sizes, fit_times_mean - fit_times_std,
                         fit_times_mean + fit_times_std, alpha=0.1)
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit_times")
    axes[1].set_title("Scalability of the model")

    # Plot fit_time vs score
    axes[2].grid(True)
    axes[2].plot(fit_times_mean, test_scores_mean, 'o-')
    axes[2].fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1)
    axes[2].set_xlabel("fit_times")
    axes[2].set_ylabel("Score")
    axes[2].set_title("Performance of the model")

    return plt


In [None]:
# Create examples of underfitting, overfitting, and well-balanced models.
# The tree-based estimators are seeded (like every other stochastic estimator
# in this notebook) so the curves are reproducible after a kernel restart.
underfit_model = LogisticRegression(C=0.001, max_iter=1000)  # High regularization = underfitting
overfit_model = DecisionTreeClassifier(max_depth=None, random_state=42)  # No depth limit = potential overfitting
balanced_model = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)  # Balanced complexity

# One row of three panels per model
fig, axes = plt.subplots(3, 3, figsize=(20, 15))

# Plot learning curves for underfit model (linear model -> standardized features)
plot_learning_curve(underfit_model, "Learning Curves (Logistic Regression with high regularization - Underfitting)",
                    X_train_scaled, y_train, axes=axes[0], ylim=(0.7, 1.01))

# Plot learning curves for overfit model (tree models use the unscaled features)
plot_learning_curve(overfit_model, "Learning Curves (Decision Tree with no max_depth - Overfitting)",
                    X_train, y_train, axes=axes[1], ylim=(0.7, 1.01))

# Plot learning curves for balanced model
plot_learning_curve(balanced_model, "Learning Curves (Random Forest with max_depth=3 - Balanced)",
                    X_train, y_train, axes=axes[2], ylim=(0.7, 1.01))

plt.tight_layout()
plt.show()


In [None]:
# Candidate settings for logistic regression: every combination of
# regularization strength, solver, and penalty type is tried
param_grid_log = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
    penalty=['l1', 'l2'],
)

# Exhaustive 5-fold cross-validated search, scored by accuracy, using all cores
grid_search_log = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=42),
    param_grid=param_grid_log,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

# Fit on the standardized training data and measure wall-clock time
start_time = time()
grid_search_log.fit(X_train_scaled, y_train)
end_time = time()

print(f"GridSearchCV took {end_time - start_time:.2f} seconds for {len(grid_search_log.cv_results_['params'])} candidates parameter settings.")

# Winning configuration and its mean cross-validation accuracy
best_params_log = grid_search_log.best_params_
best_score_log = grid_search_log.best_score_

print(f"Best parameters: {best_params_log}")
print(f"Best cross-validation score: {best_score_log:.4f}")

# Score the refit best estimator on the held-out test split
best_model_log = grid_search_log.best_estimator_
y_pred_log = best_model_log.predict(X_test_scaled)
test_accuracy_log = accuracy_score(y_true=y_test, y_pred=y_pred_log)

print(f"Test accuracy with best model: {test_accuracy_log:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_log))


In [None]:
# Tabulate the grid search results: one row per candidate setting
results = pd.DataFrame(grid_search_log.cv_results_)

# One line per (solver, penalty) pair: mean CV accuracy as a function of C
plt.figure(figsize=(12, 8))
combos = [(s, p) for s in ('liblinear', 'saga') for p in ('l1', 'l2')]
for solver, penalty in combos:
    is_solver = results['param_solver'] == solver
    is_penalty = results['param_penalty'] == penalty
    rows = results[is_solver & is_penalty]
    plt.plot(rows['param_C'], rows['mean_test_score'],
             marker='o', label=f"{solver}, {penalty}")

# C spans several orders of magnitude, hence the logarithmic x axis
grid_ax = plt.gca()
grid_ax.set_xscale('log')
grid_ax.set_xlabel('C (regularization parameter)')
grid_ax.set_ylabel('Mean test accuracy')
grid_ax.set_title('Grid Search Results: Logistic Regression')
grid_ax.legend(title='Solver, Penalty')
grid_ax.grid(True)
plt.show()


In [None]:
# Search space for the random forest; candidates are sampled from these lists
param_dist_rf = dict(
    n_estimators=[10, 50, 100, 200],
    max_depth=[None, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Randomized 5-fold cross-validated search over 20 sampled settings,
# seeded so the same candidates are drawn on every run
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
    random_state=42,
)

# Fit on the unscaled training data and measure wall-clock time
start_time = time()
random_search_rf.fit(X_train, y_train)
end_time = time()

print(f"RandomizedSearchCV took {end_time - start_time:.2f} seconds for {random_search_rf.n_iter} candidates parameter settings.")

# Winning configuration and its mean cross-validation accuracy
best_params_rf = random_search_rf.best_params_
best_score_rf = random_search_rf.best_score_

print(f"Best parameters: {best_params_rf}")
print(f"Best cross-validation score: {best_score_rf:.4f}")

# Score the refit best estimator on the held-out test split
best_model_rf = random_search_rf.best_estimator_
y_pred_rf = best_model_rf.predict(X_test)
test_accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)

print(f"Test accuracy with best model: {test_accuracy_rf:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


In [None]:
# Visualize random search results
results_rf = pd.DataFrame(random_search_rf.cv_results_)

# Sort by test score
results_rf = results_rf.sort_values('mean_test_score', ascending=False)

# Plot top 10 results
plt.figure(figsize=(15, 8))
top_results = results_rf.head(10).reset_index(drop=True)

# Tick labels: only n_estimators and max_depth are shown to keep them short,
# so two bars can share a label when they differ in the other parameters
param_text = [
    ", ".join(f"{k}={v}" for k, v in params.items()
              if k in ['n_estimators', 'max_depth'])
    for params in top_results['params']
]

# Horizontal bars of mean CV accuracy with +/- one std as error bars
plt.barh(range(len(top_results)), top_results['mean_test_score'], xerr=top_results['std_test_score'])
plt.yticks(range(len(top_results)), param_text)
plt.xlabel('Mean Test Accuracy')
plt.title('Top 10 Parameter Combinations (Random Search)')
plt.xlim(0.9, 1.0)
plt.gca().invert_yaxis()  # Highest score at the top
# Show vertical (x) grid lines only. Visibility is passed explicitly because
# grid(axis='x') without it *toggles* the x grid, which hides it when the
# active style already enables grid lines.
plt.grid(False, axis='y')
plt.grid(True, axis='x')
plt.tight_layout()
plt.show()


In [None]:
# Create validation curve for SVM's C parameter
# Seven log-spaced values of C from 1e-3 to 1e3
param_range = np.logspace(-3, 3, 7)
train_scores, test_scores = validation_curve(
    SVC(kernel='rbf', gamma='scale', random_state=42),
    X_train_scaled, y_train,
    param_name="C",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Calculate mean and std of training and test scores
# (axis=1 aggregates across CV folds: one value per C)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Plot validation curve
plt.figure(figsize=(10, 6))
plt.title("Validation Curve with SVM (RBF kernel)")
plt.xlabel("C (regularization parameter)")
plt.ylabel("Accuracy")
plt.xscale("log")
# grid(True) rather than grid(): the argument-less form toggles the grid,
# which switches it off when the active style already enables grid lines.
plt.grid(True)

# Shaded bands show +/- one standard deviation around each mean curve
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(param_range, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(param_range, test_scores_mean, 'o-', color="g", label="Cross-validation score")
plt.legend(loc="best")
plt.show()

# Find the optimal C value
# (np.argmax returns the first maximum, i.e. the smallest C among ties)
best_c_index = np.argmax(test_scores_mean)
best_c = param_range[best_c_index]
print(f"Optimal C parameter: {best_c}")
print(f"Corresponding mean test accuracy: {test_scores_mean[best_c_index]:.4f}")


In [None]:
# Train the final RBF-kernel SVM using the C value chosen from the validation curve
optimal_svm = SVC(C=best_c, kernel='rbf', gamma='scale', probability=True, random_state=42)
optimal_svm.fit(X_train_scaled, y_train)

# Predict on the held-out test split and report accuracy plus per-class metrics
y_pred_svm = optimal_svm.predict(X_test_scaled)
test_accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)

print(f"Test accuracy with optimal SVM model: {test_accuracy_svm:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_svm))

# Confusion matrix heatmap: rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred_svm)
class_names = data.target_names
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                         xticklabels=class_names, yticklabels=class_names)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix - Optimal SVM Model')
plt.show()


In [None]:
# Create a dataframe with performance metrics for all optimized models
models = ['Logistic Regression (Grid Search)', 'Random Forest (Random Search)', 'SVM (Validation Curve)']
y_preds = [y_pred_log, y_pred_rf, y_pred_svm]

# Calculate performance metrics
accuracies = [accuracy_score(y_test, y_pred) for y_pred in y_preds]

# Calculate precision, recall, and f1-score for class 1.
# NOTE: per the scikit-learn dataset description, load_breast_cancer labels
# follow data.target_names (['malignant', 'benign']), so class 1 is *benign*;
# use report['0'] instead to obtain the malignant-class metrics.
precisions, recalls, f1_scores = [], [], []

for y_pred in y_preds:
    # output_dict=True returns nested dicts keyed by the label as a string
    report = classification_report(y_test, y_pred, output_dict=True)
    precisions.append(report['1']['precision'])
    recalls.append(report['1']['recall'])
    f1_scores.append(report['1']['f1-score'])

# Create comparison dataframe
comparison_df = pd.DataFrame({
    'Model': models,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores
})

# Display comparison table (best F1 first). It is printed explicitly because
# a bare expression is only rendered when it is the last statement of a cell.
print(comparison_df.sort_values('F1-Score', ascending=False).reset_index(drop=True))

# Visualize model comparison
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
comparison_df_melted = pd.melt(comparison_df, id_vars='Model', value_vars=metrics, var_name='Metric', value_name='Score')

plt.figure(figsize=(14, 8))
sns.barplot(x='Model', y='Score', hue='Metric', data=comparison_df_melted)
plt.title('Performance Comparison of Optimized Models')
# Zoom the y-axis to [0.9, 1.0] for better visualization, but extend the lower
# bound when any score falls below 0.9 so that no bar is hidden
lowest_score = comparison_df[metrics].min().min()
plt.ylim(min(0.9, lowest_score - 0.02), 1.0)
plt.legend(title='Metric', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
