## Random Forest

* Bagging (Bootstrap Aggregation) : Parallel training,  of multiple models then choosing output based on majority vote.

* Boosting: Sequential training, Combines weak learners with strong learners. 

* Row sampling - taken from data with replacement.

Advantages:
* Handles dimentionality
* Diverse trees are generated
* Stable
* Handles overfitting well

Disadvantage:
* Slow due to multiple fit models

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [49]:
from sklearn.datasets import make_classification
# 2 classes
X, y = make_classification(n_features=100, n_classes=10, n_informative=80, n_clusters_per_class=2)
print(X.shape, y.shape)

(100, 100) (100,)


In [50]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42) 
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((80, 100), (20, 100), (80,), (20,))

In [5]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,
                            criterion="gini",
                            max_depth=None,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0,
                            max_features="sqrt",
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            bootstrap=True,
                            oob_score=False,
                            n_jobs=None,
                            random_state=None,
                            verbose=0,
                            warm_start=False, # reuse the solution of the previous call to fit and add more estimators to the ensemble
                            class_weight=None, 
                            ccp_alpha=0.0, # subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen
                            max_samples=None,
                            monotonic_cst=None
)

clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
prediction

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1])

In [8]:
accuracy = np.sum(y_test == prediction) / len(y_test)
accuracy

0.95

In [51]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,
                            criterion="gini",
                            max_depth=None,
                            min_samples_split=2,
                            min_samples_leaf=1,
                            min_weight_fraction_leaf=0.0,
                            max_features=None,
                            max_leaf_nodes=None,
                            min_impurity_decrease=0.0,
                            bootstrap=True,
                            oob_score=True,
                            n_jobs=-1,
                            random_state=None,
                            verbose=0,
                            warm_start=True, # reuse the solution of the previous call to fit and add more estimators to the ensemble
                            class_weight=None, 
                            ccp_alpha=0.0, # subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen
                            max_samples=None,
                            monotonic_cst=None
)

clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
prediction

array([9, 8, 4, 4, 2, 9, 1, 6, 6, 4, 9, 3, 6, 6, 3, 5, 9, 8, 2, 9])

In [52]:
clf.oob_score_

0.0875

In [53]:
accuracy = np.sum(y_test == prediction) / len(y_test)
accuracy

0.25

In [34]:
clf.predict_proba(X_test)

array([[0.05, 0.05, 0.07, 0.06, 0.09, 0.24, 0.11, 0.11, 0.1 , 0.12],
       [0.01, 0.13, 0.14, 0.08, 0.16, 0.09, 0.11, 0.1 , 0.05, 0.13],
       [0.13, 0.04, 0.06, 0.12, 0.11, 0.14, 0.16, 0.14, 0.03, 0.07],
       [0.19, 0.1 , 0.08, 0.11, 0.12, 0.04, 0.06, 0.12, 0.07, 0.11],
       [0.1 , 0.07, 0.25, 0.11, 0.06, 0.06, 0.04, 0.13, 0.05, 0.13],
       [0.17, 0.04, 0.11, 0.02, 0.07, 0.04, 0.09, 0.13, 0.09, 0.24],
       [0.11, 0.2 , 0.07, 0.14, 0.11, 0.04, 0.06, 0.09, 0.06, 0.12],
       [0.06, 0.24, 0.07, 0.08, 0.14, 0.02, 0.07, 0.15, 0.11, 0.06],
       [0.19, 0.05, 0.06, 0.02, 0.06, 0.08, 0.12, 0.16, 0.11, 0.15],
       [0.04, 0.2 , 0.12, 0.11, 0.11, 0.11, 0.09, 0.04, 0.07, 0.11],
       [0.02, 0.11, 0.09, 0.08, 0.08, 0.03, 0.1 , 0.06, 0.22, 0.21],
       [0.09, 0.16, 0.05, 0.17, 0.09, 0.05, 0.1 , 0.1 , 0.06, 0.13],
       [0.14, 0.11, 0.15, 0.11, 0.16, 0.03, 0.07, 0.06, 0.06, 0.11],
       [0.12, 0.08, 0.14, 0.08, 0.08, 0.1 , 0.1 , 0.09, 0.07, 0.14],
       [0.11, 0.1 , 0.09, 0.04, 0.

In [48]:
clf.feature_importances_

array([0.01766151, 0.01019349, 0.00561448, 0.01118886, 0.00919792,
       0.00811215, 0.00727606, 0.01007925, 0.01031755, 0.00925975,
       0.02404884, 0.01003255, 0.01073527, 0.01385061, 0.00955418,
       0.00473085, 0.01079059, 0.00847662, 0.00772717, 0.01703401,
       0.0086725 , 0.00671646, 0.01025203, 0.00423674, 0.00372776,
       0.00950022, 0.00397842, 0.0060015 , 0.01090547, 0.00982236,
       0.00481766, 0.005236  , 0.01080384, 0.0102584 , 0.00793475,
       0.01046419, 0.01281942, 0.01331624, 0.00617849, 0.01690533,
       0.00608743, 0.00246115, 0.01285234, 0.00396996, 0.00810509,
       0.00934045, 0.00894128, 0.0168459 , 0.00817011, 0.00505747,
       0.009146  , 0.00709868, 0.00710193, 0.00514853, 0.01128595,
       0.00360629, 0.0103941 , 0.0057376 , 0.00681693, 0.01042661,
       0.00969956, 0.0026323 , 0.013757  , 0.01143736, 0.01537705,
       0.00557788, 0.02076202, 0.01394456, 0.00280664, 0.00715216,
       0.00674864, 0.01199737, 0.01129382, 0.01521966, 0.01563

### GRIDSEARCHCV

In [44]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

from sklearn.model_selection import GridSearchCV

# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, 
                           verbose=1, 
                           scoring="accuracy")


UsageError: Line magic function `%%time` not found.


In [45]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 180 candidates, totalling 720 fits
CPU times: user 174 ms, sys: 61.8 ms, total: 236 ms
Wall time: 7.28 s


In [46]:
grid_search.cv_results_

{'mean_fit_time': array([0.01702094, 0.03266937, 0.03886247, 0.04286295, 0.08093274,
        0.12401521, 0.02110356, 0.02754778, 0.02562952, 0.03450722,
        0.07455277, 0.12230343, 0.01961476, 0.0253908 , 0.0236305 ,
        0.03924805, 0.06287068, 0.1238876 , 0.02227175, 0.02681899,
        0.02447098, 0.03739524, 0.06617486, 0.1043433 , 0.01619238,
        0.02169436, 0.03061157, 0.0464071 , 0.0805403 , 0.12765896,
        0.01875424, 0.01977849, 0.02251929, 0.03509802, 0.06908727,
        0.12324548, 0.02225077, 0.02475947, 0.02955276, 0.04494697,
        0.07043004, 0.12245196, 0.02285987, 0.03996682, 0.03711873,
        0.0423311 , 0.06970286, 0.12823528, 0.02681923, 0.02410668,
        0.029194  , 0.0451082 , 0.07179248, 0.1427812 , 0.01537782,
        0.02095574, 0.02673423, 0.03668284, 0.0674023 , 0.12138796,
        0.01839542, 0.02549559, 0.03057265, 0.0422045 , 0.0617308 ,
        0.14692825, 0.0212459 , 0.0248881 , 0.02534032, 0.04227972,
        0.07649052, 0.10474795,

In [47]:
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)

RandomForestClassifier(max_depth=2, min_samples_leaf=20, n_estimators=10,
                       n_jobs=-1, random_state=42)
{'max_depth': 2, 'min_samples_leaf': 20, 'n_estimators': 10}
0.15000000000000002


## Using real life sample

In [57]:
from sklearn.datasets import load_iris

data = load_iris()
X = data.data
y = data.target

In [60]:
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42) 
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [68]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10,
                            criterion="gini",
                            n_jobs=-1,
                            verbose=1,
                            warm_start=False, # reuse the solution of the previous call to fit and add more estimators to the ensemble
)

clf.fit(X_train,y_train)
prediction = clf.predict(X_test)
prediction

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [69]:
accuracy = np.sum( y_test == prediction) /len(y_test)
accuracy

1.0