<a href="https://colab.research.google.com/github/KrishnaKarthikReddy/ML_186/blob/main/Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Gradient Boosting tuning experiment (scikit-learn)
# Dataset: sklearn breast cancer
# Requires: scikit-learn, pandas, numpy

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# 1. Load data
# Wrap the bundled breast cancer dataset in pandas objects so that the
# feature names travel with the feature matrix.
data = load_breast_cancer()
feature_columns = data.feature_names
X = pd.DataFrame(data.data, columns=feature_columns)
y = pd.Series(data.target, name='target')

# 2. Split
# Hold out 20% of the rows for the final test; stratifying on y keeps the
# class proportions the same in both partitions, and the fixed seed makes
# the split repeatable.
split_options = dict(test_size=0.20, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 3. Baseline model (default hyperparameters)
# Estimate accuracy with 3-fold cross-validation on the training split only,
# so the test split stays untouched until the final evaluation below.
baseline = GradientBoostingClassifier(random_state=42)
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
cv_scores = cross_val_score(baseline, X_train, y_train, cv=3, scoring='accuracy', n_jobs=1)
# NOTE: this interval covers the cross-validation only, not the fit() below.
baseline_time = time.perf_counter() - start
baseline_cv_mean = cv_scores.mean()

# Fit on the full training split and score once on the held-out test split.
baseline.fit(X_train, y_train)
y_pred_baseline = baseline.predict(X_test)
baseline_test_acc = accuracy_score(y_test, y_pred_baseline)

# 4. Hyperparameter tuning with RandomizedSearchCV (light)
# Discrete candidate values for each hyperparameter; the search samples
# n_iter combinations from this grid rather than trying all of them.
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [1, 2, 3, 4],
    'subsample': [0.6, 0.8, 1.0],
    'max_features': [None, 'sqrt', 'log2', 0.5]
}

gb = GradientBoostingClassifier(random_state=42)
# 20 sampled candidates, each scored by 3-fold CV accuracy on the training split.
rand = RandomizedSearchCV(gb, param_distributions=param_dist, n_iter=20,
                          scoring='accuracy', cv=3, random_state=42, n_jobs=1, verbose=0)

# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()
rand.fit(X_train, y_train)
# The interval spans the whole rand.fit() call (all candidate evaluations
# plus whatever refitting the search performs to produce best_estimator_).
tuned_time = time.perf_counter() - start

best_params = rand.best_params_
best_score_cv = rand.best_score_

# The test split is only used here, after model selection has finished.
best_model = rand.best_estimator_
y_pred_tuned = best_model.predict(X_test)
tuned_test_acc = accuracy_score(y_test, y_pred_tuned)
tuned_classif_report = classification_report(y_test, y_pred_tuned, target_names=data.target_names, digits=4)
tuned_conf_mat = confusion_matrix(y_test, y_pred_tuned)

# 5. Results
# Headline numbers for the baseline and the tuned model, rounded for display.
results_summary = dict(
    baseline_cv_mean=round(baseline_cv_mean, 4),
    baseline_test_acc=round(baseline_test_acc, 4),
    baseline_fit_time_sec=round(baseline_time, 3),
    tuned_cv_mean=round(best_score_cv, 4),
    tuned_test_acc=round(tuned_test_acc, 4),
    tuned_fit_time_sec=round(tuned_time, 3),
    best_params=best_params,
)

print("=== Results summary ===")
for metric_name, metric_value in results_summary.items():
    print(f"{metric_name}: {metric_value}")

# Show the six best-ranked candidates from the randomized search.
print("\n=== Top CV results (first few) ===")
report_columns = ['rank_test_score', 'mean_test_score', 'std_test_score', 'params']
ranked_candidates = pd.DataFrame(rand.cv_results_).sort_values('rank_test_score')
cv_results_df = ranked_candidates.head(6)[report_columns]
print(cv_results_df.to_string(index=False))

print("\n=== Tuned model classification report (on test set) ===")
print(tuned_classif_report)
print("Confusion matrix:\n", tuned_conf_mat)

# save summary to a file
# The file is created (or overwritten) in the current working directory.
# An explicit encoding keeps the output independent of the platform's
# locale default.
with open('gbm_experiment_results.txt', 'w', encoding='utf-8') as f:
    # One "key: value" line per summary entry, written in a single batch.
    f.writelines(f"{k}: {v}\n" for k, v in results_summary.items())
    f.write("\nClassification report:\n")
    f.write(tuned_classif_report + "\n")


=== Results summary ===
baseline_cv_mean: 0.9561
baseline_test_acc: 0.9561
baseline_fit_time_sec: 1.594
tuned_cv_mean: 0.9758
tuned_test_acc: 0.9474
tuned_fit_time_sec: 12.224
best_params: {'subsample': 0.8, 'n_estimators': 100, 'max_features': 'log2', 'max_depth': 1, 'learning_rate': 0.2}

=== Top CV results (first few) ===
 rank_test_score  mean_test_score  std_test_score                                                                                                 params
               1         0.975848        0.011154  {'subsample': 0.8, 'n_estimators': 100, 'max_features': 'log2', 'max_depth': 1, 'learning_rate': 0.2}
               2         0.973655        0.010708  {'subsample': 0.6, 'n_estimators': 300, 'max_features': 'sqrt', 'max_depth': 3, 'learning_rate': 0.1}
               3         0.971462        0.013485     {'subsample': 0.8, 'n_estimators': 200, 'max_features': 0.5, 'max_depth': 2, 'learning_rate': 0.2}
               4         0.971448        0.006141     {'subsa