In [2]:
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split


In [3]:
x,y = make_classification(n_samples=10000,n_features=10,n_informative=3)

In [4]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [5]:
x.shape

(10000, 10)

In [6]:
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)

print('Decision Tree Acuuracy',accuracy_score(y_test,y_pred))

Decision Tree Acuuracy 0.9335


# Bagging Classifier
we do row sampling with replacement

In [7]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42

)

In [8]:
bag.fit(x_train,y_train)



In [9]:
y_pred_bag=bag.predict(x_test)

In [10]:
accuracy_score(y_test,y_pred_bag)

0.9535

In [11]:
# This shows the sample rows that are given to each dt(500)
len(bag.estimators_samples_)

500

In [12]:
# these are the 2000 row samples that are given to the one dt
bag.estimators_samples_[0].shape

(2000,)

In [13]:
# to test the no .of features that are given to each model
bag.estimators_features_[0].shape

(10,)

# Bagging using SVM

In [14]:
bag = BaggingClassifier(
    base_estimator=SVC(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    random_state=42
)

In [16]:
bag.fit(x_train,y_train)



In [17]:
y_pred_bag_svc = bag.predict(x_test)

In [19]:
print('bagging using SVM',accuracy_score(y_test,y_pred_bag_svc))

bagging using SVM 0.947


# Pasting
we do row sampling without replacement

In [23]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    max_samples=0.25,
    bootstrap=False,
    random_state=42,
    verbose=0,
    n_jobs=-1
)

In [24]:
bag.fit(x_train,y_train)
y_pred = bag.predict(x_test)
print("Pasting classifier",accuracy_score(y_test,y_pred))



Pasting classifier 0.951


# Random Subspaces
Here we do column sampling

In [27]:
bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    bootstrap_features=True,# with replacement
    max_features=0.5,
    random_state=42
)

In [28]:
bag.fit(x_train,y_train)
y_pred = bag.predict(x_test)
print("Random subspaces classifier",accuracy_score(y_test,y_pred))



Random subspaces classifier 0.951


In [29]:
bag.estimators_samples_[0].shape

(8000,)

columns sampling without replacemnt

In [30]:
bag1 = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=False,
    bootstrap_features=False,# without replacement
    max_features=0.5,
    random_state=42
)

In [31]:
bag1.fit(x_train,y_train)
y_pred = bag1.predict(x_test)
print("Random subspaces classifier",accuracy_score(y_test,y_pred))



Random subspaces classifier 0.957


# Random Patches
we do both row and column with or without replacement

In [35]:
bag2 = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=1.0,
    bootstrap=True,
    bootstrap_features=True,# with replacement
    max_features=0.5,
    random_state=42
)

In [36]:
bag2.fit(x_train,y_train)
y_pred = bag2.predict(x_test)
print("Random patches  classifier",accuracy_score(y_test,y_pred))



Random patches  classifier 0.95


# Out of Box (OOB) Score
* There is a chance the rows of some rows cant even seen the estimator.
* Satstically proven that Only 66% of the data will be seen by the estimators.
* Therefore 37% of the data estimators are not seen yet

In [39]:
bag3= BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.25,
    bootstrap=True,
    oob_score=True,
    #bootstrap_features=True,# with replacement
   #max_features=0.5,
    random_state=42
)

In [40]:
bag3.fit(x_train,y_train)
y_pred = bag3.predict(x_test)
print("Random oob  classifier",accuracy_score(y_test,y_pred))



Random oob  classifier 0.9535


In [42]:
bag3.oob_score_

0.956875

# Bagging Tips

* Bagging generally gives better results than pasting.
* Good results come around the 0.25 to 0.5 row sampling
* Random patches and subspaces should be used while dealing with high dimensional data.
* To find the corret hyperparameter values we can do GridSearchCV/RandomSearchCV

# Applying GridSearchCV


In [43]:
from sklearn.model_selection import GridSearchCV

In [44]:
parameters={
    'n_estimators':[100,300,500],
    'max_samples':[0.1,0.25,0.4,0.7,1.0],
    'bootstrap':[True,False],
    'max_features':[0.1,0.4,0.6,0.8,1.0]
}

In [45]:
search = GridSearchCV(BaggingClassifier(),parameters,cv=5)

In [None]:
search.fit(x_train,y_train)

In [None]:
y_pred=search.predict(x_test)