# Q.6) Write a Python program to: ● Load the Breast Cancer dataset using sklearn.datasets.load_breast_cancer() ● Train a Random Forest Classifier ● Print the top 5 most important features based on feature importance scores.


In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

In [2]:
# Fit a random forest on the full Breast Cancer dataset and rank its features
# by the forest's feature_importances_ scores.
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

# random_state is pinned so repeated runs build the same forest.
model = RandomForestClassifier(random_state=42).fit(X, y)
importances = model.feature_importances_

# One row per feature, ordered from the highest score to the lowest.
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values('Importance', ascending=False)

print("Top 5 Important Features:")
print(importance_df.head(5))

Top 5 Important Features:
                 Feature  Importance
23            worst area    0.139357
27  worst concave points    0.132225
7    mean concave points    0.107046
20          worst radius    0.082848
22       worst perimeter    0.080850


# Q.7) Write a Python program to: ● Train a Bagging Classifier using Decision Trees on the Iris dataset ● Evaluate its accuracy and compare with a single Decision Tree

In [3]:
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Split Iris once so the single tree and the bagged ensemble are scored on
# the same held-out rows.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42
)

# Baseline: one decision tree.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
dt_acc = accuracy_score(y_test, dt.predict(X_test))

# Ensemble: 50 bagged decision trees.
bag = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=50,
    random_state=42,
).fit(X_train, y_train)
bag_acc = accuracy_score(y_test, bag.predict(X_test))

print("Decision Tree Accuracy:", dt_acc)
print("Bagging Accuracy:", bag_acc)

Decision Tree Accuracy: 1.0
Bagging Accuracy: 1.0


# Q.8) Write a Python program to: ● Train a Random Forest Classifier ● Tune hyperparameters max_depth and n_estimators using GridSearchCV ● Print the best parameters and final accuracy

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Tune a random forest on Iris with a 5-fold grid search, then score the
# refitted best estimator on the held-out test split.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(random_state=42)
# 3 x 3 grid: every n_estimators value is tried with every max_depth value.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[3, 5, None],
)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# The test split is used only here, after the search has finished.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Best Parameters:", grid_search.best_params_)
print(f"Final Accuracy: {accuracy:.2f}")

Best Parameters: {'max_depth': 3, 'n_estimators': 50}
Final Accuracy: 1.00


# Q.9) Write a Python program to: ● Train a Bagging Regressor and a Random Forest Regressor on the California Housing dataset ● Compare their Mean Squared Errors (MSE)

In [7]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [8]:
# Compare test-set MSE of a bagging regressor and a random forest regressor
# on the California Housing data, using one shared train/test split.
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(
    housing.data, housing.target, random_state=42
)

# NOTE(review): neither ensemble sets n_estimators, so each uses its own
# library default -- confirm the two defaults match before reading the MSE
# gap as "bagging vs. random forest".
bag = BaggingRegressor(random_state=42).fit(X_train, y_train)
bag_mse = mean_squared_error(y_test, bag.predict(X_test))

rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
rf_mse = mean_squared_error(y_test, rf.predict(X_test))

print("Bagging MSE:", bag_mse)
print("Random Forest MSE:", rf_mse)

Bagging MSE: 0.27872278915343157
Random Forest MSE: 0.25424371393528344


# Q.10) You are working as a data scientist at a financial institution to predict loan default. You have access to customer demographic and transaction history data. You decide to use ensemble techniques to increase model performance. Explain your step-by-step approach to: ● Choose between Bagging or Boosting ● Handle overfitting ● Select base models ● Evaluate performance using cross-validation ● Justify how ensemble learning improves decision-making in this real-world context.

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
import numpy as np

In [10]:
# Read the loan dataset (path is relative to the notebook's working directory)
# and list its columns so the target column can be checked by eye.
data = pd.read_csv("loan_data.csv")
print("Dataset Columns:", data.columns.tolist())

Dataset Columns: ['age', 'income', 'loan_amount', 'credit_score', 'employment_years', 'transaction_count', 'avg_transaction_amount', 'missed_payments', 'loan_default']


In [11]:
# Name of the label column to predict.
target_col = "loan_default"

# Stop early with a readable message if the label column is absent.
if target_col not in data.columns:
    raise ValueError(f"Target column '{target_col}' not found. Available columns: {list(data.columns)}")

# Label vector first, then the feature matrix: every column except the label,
# passed through get_dummies with drop_first=True.
y = data[target_col]
X = pd.get_dummies(data.drop(columns=target_col), drop_first=True)

In [12]:
# Hold out 20% of the rows for the final evaluation; stratify=y is passed so
# the split is stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Bagging candidate: a 200-tree random forest with no depth cap (max_depth=None).
bagging_model = RandomForestClassifier(n_estimators=200, max_depth=None, random_state=42)

# Boosting candidate: 200 boosting stages of depth-3 trees, learning rate 0.1.
boosting_model = GradientBoostingClassifier(
    n_estimators=200, learning_rate=0.1, max_depth=3, random_state=42
)

In [14]:
# 5-fold cross-validated ROC AUC for each candidate, computed on the training
# split only so the test split is not used here.
cv_bagging = cross_val_score(
    estimator=bagging_model, X=X_train, y=y_train, scoring='roc_auc', cv=5
)
cv_boosting = cross_val_score(
    estimator=boosting_model, X=X_train, y=y_train, scoring='roc_auc', cv=5
)

# Report the mean across folds for each candidate.
print(f"Bagging CV AUC: {cv_bagging.mean():.4f}")
print(f"Boosting CV AUC: {cv_boosting.mean():.4f}")

Bagging CV AUC: 0.4486
Boosting CV AUC: 0.4568


In [15]:
# Select Best Model & Evaluate
# Keep whichever candidate had the higher mean CV AUC; a tie goes to bagging.
if cv_boosting.mean() > cv_bagging.mean():
    final_model = boosting_model
else:
    final_model = bagging_model

# Refit the chosen model on the whole training split before scoring the test split.
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
# Column index 1 of predict_proba is the score passed to roc_auc_score.
y_pred_prob = final_model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print(f"Test AUC: {roc_auc_score(y_test, y_pred_prob):.4f}")


Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86       159
           1       0.18      0.05      0.08        41

    accuracy                           0.76       200
   macro avg       0.49      0.50      0.47       200
weighted avg       0.67      0.76      0.70       200

Test AUC: 0.4919


In [16]:
# Written answer for Q.10, kept as a bare string so the cell echoes it as its output.
# NOTE(review): the CV and test AUC values printed by the earlier cells are all
# below 0.5, so the benefits listed here are general claims about ensembles,
# not something this particular run demonstrates.
"""
Ensemble learning combines multiple models to achieve better predictive performance
than individual models. In predicting loan defaults:

1. Reduces the risk of relying on a single weak model.
2. Captures complex patterns in both demographic and transaction history data.
3. Improves robustness to noise and outliers.
4. Boosting helps focus on difficult-to-predict customers.
5. Bagging improves stability and reduces overfitting, especially with high variance data.

This translates into fewer false approvals and missed defaulters, directly
impacting financial risk management.
"""

'\nEnsemble learning combines multiple models to achieve better predictive performance\nthan individual models. In predicting loan defaults:\n\n1. Reduces the risk of relying on a single weak model.\n2. Captures complex patterns in both demographic and transaction history data.\n3. Improves robustness to noise and outliers.\n4. Boosting helps focus on difficult-to-predict customers.\n5. Bagging improves stability and reduces overfitting, especially with high variance data.\n\nThis translates into fewer false approvals and missed defaulters, directly\nimpacting financial risk management.\n'