In [3]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
X, y= make_classification(n_samples=1000, n_features=10, n_informative=5, n_classes=2)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
dtc = DecisionTreeClassifier()
dtc.fit(x_train, y_train)

In [8]:
y_pred = dtc.predict(x_test)

In [9]:
accuracy_score(y_test, y_pred)

0.865

>## Bagging

In [77]:
bag = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100, # means build 100 decision trees
    max_samples=0.3, # means use 25% of the data for each tree
    bootstrap=True, # sampling with replacements
    random_state=2
)

In [78]:
bag.fit(x_train, y_train)

In [79]:
pred = bag.predict(x_test)

In [80]:
accuracy_score(y_test, pred)

0.92

In [84]:
# estimators_samples_ --> Gives --> A list of arrays, where each array contains the 
# indices of the samples that were used to train each base estimator. 
bag.estimators_samples_[0].shape

(240,)

> ## Pasting

In [85]:
pasting = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=100, 
    max_samples=0.3,
    bootstrap=False,
    verbose=1, # it gives all info at the time of training
    n_jobs=1, # used for parallel execution in cpu
)

In [86]:
pasting.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


In [87]:
pred2 = pasting.predict(x_test)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [88]:
accuracy_score(y_test, pred2)

0.89

> ## Random Subspaces

In [97]:
# since we have 10 columns, in this random supspaces we only used 5 columns
randSubs= BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=500, 
    max_samples=1.0,
    bootstrap=False,
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

In [98]:
randSubs.fit(x_train, y_train)

In [99]:
pred3 = randSubs.predict(x_test)
accuracy_score(y_test, pred3)

0.91

In [101]:
randSubs.estimators_samples_[0].shape
randSubs.estimators_features_[0].shape

(5,)

> ## Random Patches

In [105]:
randPatch = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=300, 
    max_samples=0.3, 
    bootstrap=True, 
    max_features=0.5,
    bootstrap_features=True,
    random_state=42
)

In [106]:
randPatch.fit(x_train, y_train)

In [107]:
pred4 = randPatch.predict(x_test)
accuracy_score(y_test, pred4)

0.895

>## OOB Score

In [109]:
bagging_oob = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=300, 
    max_samples=0.3, 
    bootstrap=True, 
    oob_score=True,
    random_state=42
)

In [110]:
bagging_oob.fit(x_train, y_train)

In [113]:
# so now oob values used as testing data, and its accuracy is achieved through this attribute
bagging_oob.oob_score_

0.8925

># Bagging Tips
- Bagging generally gives better results than Pasting
- Good results come around the 25% to 50% row sampling mark
- Random patches and subspaces should be used while dealing with high dimensional data
- To find the correct hyperparameter values we can do GridSearchCV/RandomSearchCV
___

>## Applying GridSearchCV

In [114]:
from sklearn.model_selection import GridSearchCV

In [118]:
parameters = {
    'n_estimators': [50,100,500], 
    'max_samples': [0.1,0.4,0.7,1.0],
    'bootstrap' : [True,False],
    'max_features' : [0.1,0.4,0.7,1.0]
    }
     

In [119]:
search = GridSearchCV(BaggingClassifier(), parameters, cv=5, n_jobs=2)

In [None]:
search.fit(x_train, y_train)

In [None]:
search.best_score_

In [None]:
search.best_score_