In [1]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [2]:
x,y=make_classification(n_samples=10000, n_features=10, n_informative=5)

In [3]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# testing accuracy for decision tree first to see if bagging will improve it or not

In [4]:
dt=DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)
y_pred=dt.predict(x_test)

print(accuracy_score(y_test,y_pred))

0.922


# Bagging

In [5]:
bag=BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=500,
max_samples=0.25,
bootstrap=True,
random_state=42
)

In [6]:
bag.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, random_state=42)

In [7]:
y_pred=bag.predict(x_test)

In [8]:
print(accuracy_score(y_test, y_pred))

0.942


In [9]:
bag.estimators_samples_[0].shape

(2000,)

# Pasting

In [10]:
bag2=BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=500,
max_samples=0.25,
bootstrap=False,
random_state=42,
verbose=1,
n_jobs=-1
)

In [11]:
bag2.fit(x_train, y_train)
y_pred2=bag2.predict(x_test)
print(accuracy_score(y_test, y_pred2))

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    6.8s remaining:   34.3s


0.9455


[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    7.2s finished
[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   2 out of  12 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:    0.0s finished


# Random Subspaces

In [12]:
bag3=BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=500,
max_samples=1.0, 
max_features=0.5,
bootstrap=False,
bootstrap_features=True,
random_state=42
)

In [13]:
bag3.fit(x_train, y_train)
y_pred3=bag3.predict(x_test)
print(accuracy_score(y_test, y_pred3))

0.928


In [14]:
 bag3.estimators_samples_[0].shape

(8000,)

In [15]:
 bag3.estimators_features_[0].shape

(5,)

# Random patches

In [16]:
bag4=BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=500,
max_samples=0.25, 
max_features=0.5,
bootstrap=True,
bootstrap_features=True,
random_state=42
)

In [17]:
bag4.fit(x_train, y_train)
y_pred4=bag4.predict(x_test)
print(accuracy_score(y_test, y_pred4))

0.923


# OOB Score
### OOB is Out Of Bag  score in bagging while bootstraping some rows get multiple times in dataset and some rows never get selected ina dataset so OOB score is created on those rows

In [18]:
bag5=BaggingClassifier(
base_estimator=DecisionTreeClassifier(),
n_estimators=500,
max_samples=0.25,
bootstrap=True,
oob_score=True,
random_state=42
)

In [19]:
bag5.fit(x_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.25,
                  n_estimators=500, oob_score=True, random_state=42)

In [20]:
y_pred5=bag5.predict(x_test)

In [21]:
print(accuracy_score(y_test, y_pred5))

0.942


  # Applying Grid Search CV

In [22]:
 from sklearn.model_selection import GridSearchCV

In [23]:
parameters={
    'n_estimators':[50,100,200,300,500],
    'max_samples':[0.1,0.25,0.50,0.75],
    'bootstrap':[True, False],
    'max_features': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
}

In [24]:
search=GridSearchCV(BaggingClassifier(), parameters, cv=5)

In [25]:
search.fit(x_train,y_train)

GridSearchCV(cv=5, estimator=BaggingClassifier(),
             param_grid={'bootstrap': [True, False],
                         'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7,
                                          0.8, 0.9],
                         'max_samples': [0.1, 0.25, 0.5, 0.75],
                         'n_estimators': [50, 100, 200, 300, 500]})

In [26]:
search.best_score_

0.9588750000000001

In [27]:
search.best_params_

{'bootstrap': False,
 'max_features': 0.8,
 'max_samples': 0.75,
 'n_estimators': 500}