# Ensemble Learning - Random Forest

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
import time

The DataFrame T contains several predictors and the response variable faultCode.

In [2]:
T = pd.read_csv("./data/pumpFeatures.csv")
X, y = T.iloc[:, :-1], T.faultCode
T.head()

Unnamed: 0,wMotor_mean,wMotor_std,wMotor_fftPeakIdx,wMotor_skewness,wMotor_kurtosis,wMotor_peak2peak,wMotor_peak2rms,wMotor_rms,wMotor_mad,wMotor_csRange,...,pOut_peak2peak,pOut_peak2rms,pOut_rms,pOut_mad,pOut_csRange,pOut_pLow,pOut_pMid,pOut_pHigh,pOut_pKur,faultCode
0,876.235677,6.204772,762,-0.164601,2.881123,39.070726,1.018288,876.257629,5.032113,1225859.0,...,0.1255,1.007635,7.226002,0.019002,10109.108199,9.1e-05,0.007472,0.001348,0.076296,0
1,876.238247,6.077463,762,-0.147077,2.673731,34.570767,1.016957,876.259308,4.982643,1225862.0,...,0.120924,1.007091,7.225622,0.01899,10108.578309,0.000118,0.006954,0.001434,0.076296,0
2,876.031159,6.250861,762,-0.123501,2.551445,35.49146,1.017266,876.053444,5.165914,1225575.0,...,0.131161,1.007494,7.225906,0.019032,10108.963684,6e-05,0.00657,0.001309,0.076296,0
3,876.196665,6.247495,762,-0.134596,2.728913,41.529486,1.018873,876.218921,5.093603,1225799.0,...,0.123254,1.00777,7.225914,0.018717,10108.973834,6.6e-05,0.00719,0.001364,0.076296,0
4,876.199734,6.095278,762,-0.139257,2.705275,36.096733,1.018066,876.220919,4.962708,1225812.0,...,0.123759,1.006735,7.22526,0.019078,10108.065292,5.1e-05,0.007814,0.0016,0.071808,0


This code fits a classification tree and calculates the accuracy.

In [3]:
classifier = DecisionTreeClassifier()
kf = KFold(n_splits=5, shuffle=True)

start_time = time.time()
cv_results = cross_validate(classifier, X, y, cv=kf)
elapsed_time = time.time() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.87  0.875 0.88  0.89  0.84 ]
Mean accuracy: 0.8710000000000001
Cross validation elapsed time: 0.2315 seconds


We can use the [BaggingClassifier()](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn-ensemble-baggingclassifier) class to create the ensemble method Bagging classifier.

In [4]:
classifier = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=0)
classifier.fit(X, y)
classifier.score(X, y)

0.986

Without cross validation, the ensemble was trained to fit the data perfectly. The model is most likely overfit and will not generalize well to new data.

In [5]:
classifier = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10, random_state=0)
kf = KFold(n_splits=5, shuffle=True)

start_time = time.time()
cv_results = cross_validate(classifier, X, y, cv=kf)
elapsed_time = time.time() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.885 0.895 0.89  0.9   0.935]
Mean accuracy: 0.901
Cross validation elapsed time: 1.7223 seconds


We can customize a learner for the BaggingClassifier() class.

In [6]:
estimator = DecisionTreeClassifier(max_features=15)
classifier = BaggingClassifier(estimator=estimator, n_estimators=10, random_state=0)
kf = KFold(n_splits=5, shuffle=True)

start_time = time.time()
cv_results = cross_validate(classifier, X, y, cv=kf)
elapsed_time = time.time() - start_time

print("Accuracies:", cv_results['test_score'])
print("Mean accuracy:", cv_results['test_score'].mean())
print(f"Cross validation elapsed time: {elapsed_time:.4f} seconds")

Accuracies: [0.88  0.935 0.905 0.885 0.905]
Mean accuracy: 0.9019999999999999
Cross validation elapsed time: 0.4097 seconds
