<a href="https://colab.research.google.com/github/NarayaniSuresh/CSE340/blob/main/Lab_4_Bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steps:

In [11]:
# 1) Import the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error

# 2) Load the dataset
diabetes_df = pd.read_csv("diabetes.csv")

# 3) Check for Null and Describe the dataset.
# NOTE(review): isnull() reports no missing values, yet describe() shows a
# minimum of 0 for columns such as Glucose and BloodPressure -- possibly
# placeholder zeros standing in for missing readings; confirm with the data source.
print(diabetes_df.isnull().sum())
print(diabetes_df.describe())

# 4) Remove unnecessary column from the Dataset.
# Rebind instead of inplace=True so the statement reads as a plain transformation.
diabetes_df = diabetes_df.drop(columns='Age')  # Age column seems unnecessary for prediction

# 5) + 6) Split first, then scale.
# The scaler must learn its mean/std from the training rows only; fitting it
# on the full dataset lets test-set statistics leak into preprocessing.
raw_features = diabetes_df.drop(columns='Outcome')
y = diabetes_df['Outcome']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=42
)

scaler = StandardScaler().fit(X_train_raw)

# Scale every row with the training-set statistics. Column names come from the
# feature frame itself rather than assuming 'Outcome' is the last column.
scaled_features = scaler.transform(raw_features)
scaled_df = pd.DataFrame(scaled_features, columns=raw_features.columns, index=raw_features.index)
scaled_df['Outcome'] = y

# X / X_train / X_test keep their original meaning (scaled feature frames).
X = scaled_df.drop(columns='Outcome')
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

# 7) Base learner: a decision tree classifier with default hyperparameters.
base_classifier = DecisionTreeClassifier()

# 8) Wrap the tree in a bagging ensemble of 10 estimators, fit it on the
#    training split, and report accuracy on both splits.
bagging_classifier = BaggingClassifier(
    base_classifier,
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

y_pred_train, y_pred_test = (
    bagging_classifier.predict(split) for split in (X_train, X_test)
)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy on training set:", train_accuracy)
print("Accuracy on test set:", test_accuracy)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedig

In [12]:
# Regression using Bagging
# Reuses X_train / X_test / y_train / y_test produced by the preprocessing
# cell above, so that cell has to run first.

# 7) Base learner: a decision tree regressor with default hyperparameters.
base_regressor = DecisionTreeRegressor()

# 8) Bagging ensemble of 10 regressors; fit on the training split and report
#    the mean squared error (not accuracy) on both splits.
bagging_regressor = BaggingRegressor(
    base_regressor,
    n_estimators=10,
    random_state=42,
).fit(X_train, y_train)

y_pred_train_reg, y_pred_test_reg = (
    bagging_regressor.predict(split) for split in (X_train, X_test)
)

train_mse = mean_squared_error(y_train, y_pred_train_reg)
test_mse = mean_squared_error(y_test, y_pred_test_reg)
print("Mean Squared Error on training set:", train_mse)
print("Mean Squared Error on test set:", test_mse)


Mean Squared Error on training set: 0.033159609120521176
Mean Squared Error on test set: 0.17792207792207793


In [14]:
# Importing libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_squared_error


# 2) Load the dataset
diabetes_df = pd.read_csv("diabetes.csv")
# Check for Null and Describe the dataset
print(diabetes_df.isnull().sum())
print(diabetes_df.describe())

# Remove unnecessary column from the Dataset
# (rebinding rather than inplace=True keeps the step a plain transformation)
diabetes_df = diabetes_df.drop(columns='Age')

# Feature Engineering
# You may add more feature engineering steps here

# Split the dataset into training and test sets BEFORE scaling, so the
# scaler's mean/std are learned from training rows only and no test-set
# statistics leak into preprocessing.
raw_features = diabetes_df.drop(columns='Outcome')
y = diabetes_df['Outcome']
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    raw_features, y, test_size=0.2, random_state=42
)

# Scaling Features
scaler = StandardScaler().fit(X_train_raw)

# Every row is transformed with the training-set statistics; column names are
# taken from the feature frame instead of assuming 'Outcome' is the last column.
scaled_features = scaler.transform(raw_features)
scaled_df = pd.DataFrame(scaled_features, columns=raw_features.columns, index=raw_features.index)
scaled_df['Outcome'] = y

# X / X_train / X_test remain scaled feature frames, as before.
X = scaled_df.drop(columns='Outcome')
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

# scikit-learn 1.2 renamed the bagging constructor argument `base_estimator`
# to `estimator`, and 1.4 removed the old name. Ask the installed version which
# name it exposes so the nested grid key (`<name>__max_depth`) is valid either way.
base_param = (
    "estimator"
    if "estimator" in BaggingClassifier().get_params(deep=False)
    else "base_estimator"
)

# Hyperparameter Tuning for Classification
param_grid_classification = {
    f'{base_param}__max_depth': [3, 5, 7],
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 1.0],
}

base_classifier = DecisionTreeClassifier(random_state=42)
# The wrapped estimator is passed positionally: it is the first parameter
# under both the old and the new keyword name.
bagging_classifier = BaggingClassifier(base_classifier, random_state=42)

# 5-fold cross-validated search over the grid, using the estimator's default scorer.
grid_search_classification = GridSearchCV(estimator=bagging_classifier, param_grid=param_grid_classification, cv=5)
grid_search_classification.fit(X_train, y_train)

print("Best parameters for classification:", grid_search_classification.best_params_)

# Hyperparameter Tuning for Regression
param_grid_regression = {
    f'{base_param}__max_depth': [3, 5, 7],
    'n_estimators': [10, 20, 30],
    'max_samples': [0.5, 0.7, 1.0],
}

base_regressor = DecisionTreeRegressor(random_state=42)
bagging_regressor = BaggingRegressor(base_regressor, random_state=42)

grid_search_regression = GridSearchCV(estimator=bagging_regressor, param_grid=param_grid_regression, cv=5)
grid_search_regression.fit(X_train, y_train)

print("Best parameters for regression:", grid_search_regression.best_params_)

# Evaluation for Classification
# best_estimator_ is the model GridSearchCV selected; score it on both splits.
best_classifier = grid_search_classification.best_estimator_
y_pred_train_cls, y_pred_test_cls = (
    best_classifier.predict(split) for split in (X_train, X_test)
)

train_accuracy_cls = accuracy_score(y_train, y_pred_train_cls)
test_accuracy_cls = accuracy_score(y_test, y_pred_test_cls)
print("Accuracy on training set (classification):", train_accuracy_cls)
print("Accuracy on test set (classification):", test_accuracy_cls)

# Evaluation for Regression
# Same procedure for the tuned regressor, reported as mean squared error.
best_regressor = grid_search_regression.best_estimator_
y_pred_train_reg, y_pred_test_reg = (
    best_regressor.predict(split) for split in (X_train, X_test)
)

train_mse_reg = mean_squared_error(y_train, y_pred_train_reg)
test_mse_reg = mean_squared_error(y_test, y_pred_test_reg)
print("Mean Squared Error on training set (regression):", train_mse_reg)
print("Mean Squared Error on test set (regression):", test_mse_reg)


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedig



Best parameters for classification: {'base_estimator__max_depth': 3, 'max_samples': 0.7, 'n_estimators': 20}




Best parameters for regression: {'base_estimator__max_depth': 5, 'max_samples': 1.0, 'n_estimators': 30}
Accuracy on training set (classification): 0.7801302931596091
Accuracy on test set (classification): 0.7727272727272727
Mean Squared Error on training set (regression): 0.10469211560671388
Mean Squared Error on test set (regression): 0.1564880696982441
