In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [2]:
'''
Question 6: Write a Python program to:
● Load the Breast Cancer dataset using sklearn.datasets.load_breast_cancer()
● Train a Random Forest Classifier
● Print the top 5 most important features based on feature importance scores.
'''

sklearn.datasets.load_breast_cancer()
data = sklearn.datasets.load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target


X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Initialize and train the Random Forest Classifier
# n_estimators: Number of trees in the forest
# random_state: For reproducibility
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=1)
rf_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}\n")

# Get feature importances from the trained model
feature_importances = rf_classifier.feature_importances_

# Create a Pandas Series for better handling of feature names and importances
feature_importance_series = pd.Series(feature_importances, index=X.columns)

# Sort the features by importance in descending order
sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

# Print the top 5 most important features
print("Top 5 Most Important Features:")
print(sorted_feature_importances.head(5))


Accuracy: 0.94

Top 5 Most Important Features:
worst perimeter         0.128273
worst concave points    0.125543
worst area              0.109792
worst radius            0.103148
mean concave points     0.084547
dtype: float64


In [3]:
'''
Question 7: Write a Python program to:
● Train a Bagging Classifier using Decision Trees on the Iris dataset
● Evaluate its accuracy and compare with a single Decision Tree.
'''

# Load the Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# 1. Train and evaluate a single Decision Tree Classifier
dt_classifier = DecisionTreeClassifier(random_state=1)
dt_classifier.fit(X_train, y_train)
dt_predictions = dt_classifier.predict(X_test)
dt_accuracy = accuracy_score(y_test, dt_predictions)
print(f"Accuracy of single Decision Tree: {dt_accuracy:.4f}")

# 2. Train and evaluate a Bagging Classifier with Decision Trees
# n_estimators: number of base estimators (Decision Trees) in the ensemble
# estimator: the base estimator to use (DecisionTreeClassifier in this case)
bagging_classifier = BaggingClassifier(estimator=DecisionTreeClassifier(random_state=1),
                                       n_estimators=10, random_state=1)
bagging_classifier.fit(X_train, y_train)
bagging_predictions = bagging_classifier.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
print(f"Accuracy of Bagging Classifier: {bagging_accuracy:.4f}")

# Compare the accuracies
if bagging_accuracy > dt_accuracy:
    print("\nBagging Classifier performed better than the single Decision Tree.")
elif dt_accuracy > bagging_accuracy:
    print("\nSingle Decision Tree performed better than the Bagging Classifier.")
else:
    print("\nBoth models achieved the same accuracy.")

Accuracy of single Decision Tree: 0.9667
Accuracy of Bagging Classifier: 0.9667

Both models achieved the same accuracy.


In [4]:
'''
Question 8: Write a Python program to:
● Train a Random Forest Classifier
● Tune hyperparameters max_depth and n_estimators using GridSearchCV
● Print the best parameters and final accuracy
'''

# loading tips data for working
df = sns.load_dataset('tips')

X = df.drop('time', axis=1)
y = df['time']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1)



cat_cols = ["sex", "smoker", "day"]
num_cols = ["total_bill", "tip", "size"]

num_pipeline = Pipeline(steps = [('imputation', SimpleImputer(strategy = "median")),
                                ('scaling', StandardScaler())])
cat_pipeline = Pipeline(steps = [('imputation', SimpleImputer(strategy = "most_frequent")),
                                ('encoding', OneHotEncoder())])

preprocessor = ColumnTransformer([("num_pipeline", num_pipeline, num_cols),
                  ("cat_pipeline", cat_pipeline, cat_cols)])

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


rf = RandomForestClassifier()
param_grid = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20,],
    'n_estimators': [10, 25, 50, 100, 200]
}

gs = GridSearchCV(rf, param_grid, cv=5)
gs.fit(X_train, y_train)

print("The best parameter are: ")
print(gs.best_params_)
print(f"\nThe accuracy score is: {gs.best_score_:.4f}")


The best parameter are: 
{'max_depth': 2, 'min_samples_leaf': 5, 'n_estimators': 10}

The accuracy score is: 0.9795


In [5]:
'''
Question 9: Write a Python program to:
● Train a Bagging Regressor and a Random Forest Regressor on the California Housing dataset
● Compare their Mean Squared Errors (MSE)
'''


from sklearn.datasets import fetch_california_housing
# Load the California Housing dataset
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Bagging Regressor
bagging_regressor = BaggingRegressor(random_state=42)
bagging_regressor.fit(X_train, y_train)
y_pred_bagging = bagging_regressor.predict(X_test)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)

# Train a Random Forest Regressor
random_forest_regressor = RandomForestRegressor(random_state=42)
random_forest_regressor.fit(X_train, y_train)
y_pred_rf = random_forest_regressor.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Compare Mean Squared Errors
print(f"Mean Squared Error (Bagging Regressor): {mse_bagging:.3f}")
print(f"\nMean Squared Error (Random Forest Regressor): {mse_rf:.3f}")


Mean Squared Error (Bagging Regressor): 0.282

Mean Squared Error (Random Forest Regressor): 0.255


In [None]:
'''
Question 10: You are working as a data scientist at a financial institution to predict loan default.
You have access to customer demographic and transaction history data. You decide to use ensemble techniques to increase model performance.
Explain your step-by-step approach to:
● Choose between Bagging or Boosting
● Handle overfitting
● Select base models
● Evaluate performance using cross-validation
● Justify how ensemble learning improves decision-making in this real-world context.
'''

