<a href="https://colab.research.google.com/github/Mahwish-source/pydata-lab/blob/main/Bagging_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [15]:
df = pd.read_csv("heart.csv")
print("Original dataset shape:", df.shape)

Original dataset shape: (918, 12)


In [16]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_no_outliers = df[(np.abs(stats.zscore(df[numeric_cols])) < 3).all(axis=1)]
print("Shape after outlier removal:", df_no_outliers.shape)

Shape after outlier removal: (899, 12)


In [17]:
categorical_cols = df_no_outliers.select_dtypes(include=['object', 'category']).columns

In [18]:
df_encoded = pd.get_dummies(df_no_outliers, columns=categorical_cols, drop_first=True)

In [21]:
target_col = 'HeartDisease'
X = df_encoded.drop(columns=[target_col])
y = df_encoded[target_col]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [23]:
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("\nSVM Standalone Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))


SVM Standalone Accuracy: 0.8888888888888888
              precision    recall  f1-score   support

           0       0.97      0.79      0.87        86
           1       0.84      0.98      0.90        94

    accuracy                           0.89       180
   macro avg       0.90      0.88      0.89       180
weighted avg       0.90      0.89      0.89       180



In [26]:
svm = SVC(random_state=42)
svm.fit(X_train, y_train)
svm_preds = svm.predict(X_test)
print("\nSVM Standalone Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))
# %%
# Changed base_estimator to estimator due to scikit-learn version update
svm_bagging = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=42)
svm_bagging.fit(X_train, y_train)
svm_bag_preds = svm_bagging.predict(X_test)
print("SVM Bagging Accuracy:", accuracy_score(y_test, svm_bag_preds))
print(classification_report(y_test, svm_bag_preds))


SVM Standalone Accuracy: 0.8888888888888888
              precision    recall  f1-score   support

           0       0.97      0.79      0.87        86
           1       0.84      0.98      0.90        94

    accuracy                           0.89       180
   macro avg       0.90      0.88      0.89       180
weighted avg       0.90      0.89      0.89       180

SVM Bagging Accuracy: 0.8944444444444445
              precision    recall  f1-score   support

           0       0.95      0.83      0.88        86
           1       0.86      0.96      0.90        94

    accuracy                           0.89       180
   macro avg       0.90      0.89      0.89       180
weighted avg       0.90      0.89      0.89       180



In [27]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_preds = dt.predict(X_test)
print("Decision Tree Standalone Accuracy:", accuracy_score(y_test, dt_preds))
print(classification_report(y_test, dt_preds))

Decision Tree Standalone Accuracy: 0.7777777777777778
              precision    recall  f1-score   support

           0       0.79      0.72      0.76        86
           1       0.76      0.83      0.80        94

    accuracy                           0.78       180
   macro avg       0.78      0.78      0.78       180
weighted avg       0.78      0.78      0.78       180



In [29]:
# Changed base_estimator to estimator due to scikit-learn version update
dt_bagging = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)
dt_bagging.fit(X_train, y_train)
dt_bag_preds = dt_bagging.predict(X_test)
print("Decision Tree Bagging Accuracy:", accuracy_score(y_test, dt_bag_preds))
print(classification_report(y_test, dt_bag_preds))

Decision Tree Bagging Accuracy: 0.85
              precision    recall  f1-score   support

           0       0.87      0.80      0.84        86
           1       0.83      0.89      0.86        94

    accuracy                           0.85       180
   macro avg       0.85      0.85      0.85       180
weighted avg       0.85      0.85      0.85       180

