# Random Forest
### Notebook by [Kevin Huang](https://kevin8523.github.io/)

In [78]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# Seed NumPy's global (legacy) random generator so np.random.* draws in this notebook are repeatable
np.random.seed(21)

### Load the dataset into a DataFrame to practice ML

In [79]:
# Load Iris and turn it into a DataFrame: one column per feature plus the integer target
iris = load_iris()  # dictionary-like object; print(iris) to explore it (starts with {)
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [64]:
# Species name for every row, plus a row-position -> name lookup, for reference purposes
# **Optional**
cat_names = pd.Categorical.from_codes(iris.target, iris.target_names)
# Built from the categorical above instead of constructing the same object a second time
index_cat = dict(enumerate(cat_names))
cat_names

[setosa, setosa, setosa, setosa, setosa, ..., virginica, virginica, virginica, virginica, virginica]
Length: 150
Categories (3, object): [setosa, versicolor, virginica]

In [45]:
# Split dataset into train & test (~70% / ~30%) using a random boolean mask.
# The draw comes from a dedicated generator seeded with 21 (the same seed given to
# np.random.seed in the first cell), so this cell is idempotent: re-running it, or
# running it out of order, reproduces the same split instead of pulling the next
# numbers from the advancing global RNG.
split_rng = np.random.RandomState(21)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= .7
train, test = df[df['is_train']], df[~df['is_train']]
# Every row must land in exactly one of the two splits
assert len(train) + len(test) == len(df)
print('# of obs in Train:',len(train))
print('# of obs in Test:',len(test))

# of obs in Train: 96
# of obs in Test: 54


In [74]:
# Quick way to convert categorical names into integer codes
# **Not needed for this example since the target is already numeric
# **Optional**
target_codes, _ = pd.factorize(train['target'])  # y = target_codes
target_codes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

In [109]:
# Keep the feature column labels in an Index for easy referencing;
# the feature columns are the leading columns of df, one per Iris feature name
n_features = len(iris.feature_names)
features = df.columns[:n_features]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [115]:
# One-element Index holding the label column, picked by name rather than by position
target = df.columns[df.columns == 'target']
target

Index(['target'], dtype='object')

### Set up data for ML

In [90]:
# Peek at the first five rows of the training split
train.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,is_train
0,5.1,3.5,1.4,0.2,0,True
1,4.9,3.0,1.4,0.2,0,True
2,4.7,3.2,1.3,0.2,0,True
4,5.0,3.6,1.4,0.2,0,True
5,5.4,3.9,1.7,0.4,0,True


In [96]:
# Feature matrix as a NumPy array, selected by column label instead of by position
X = train[features].values

In [97]:
# Training labels as a NumPy array, taken from the 'target' column by name
y = train['target'].values
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

### Apply ML 

In [73]:
from sklearn.ensemble import RandomForestClassifier

In [106]:
# View the parameters the classifier takes: displaying a default-constructed
# estimator lists each constructor argument together with its current value
RandomForestClassifier()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf='deprecated', min_samples_split=2,
            min_weight_fraction_leaf='deprecated', n_estimators='warn',
            n_jobs=None, oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
# Creating a rf model.
# n_estimators is pinned explicitly: in this scikit-learn version the unset default
# shows up as 'warn' and resolves to 10 trees, and that default changes in later
# releases, so leaving it implicit would silently change the model after an upgrade.
clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=0)
# Train model on the feature columns of the training split
clf.fit(train[features], y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf='deprecated', min_samples_split=2,
            min_weight_fraction_leaf='deprecated', n_estimators=10,
            n_jobs=2, oob_score=False, random_state=0, verbose=0,
            warm_start=False)

In [128]:
# First rows of the held-out feature columns the model will be applied to
test.loc[:, features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
3,4.6,3.1,1.5,0.2
6,4.6,3.4,1.4,0.3
13,4.3,3.0,1.1,0.1
15,5.7,4.4,1.5,0.4
16,5.4,3.9,1.3,0.4


In [127]:
# Apply the fitted model to the feature columns of the test set
test_inputs = test[features]
clf.predict(test_inputs)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 2])

In [133]:
# View per-class probabilities (one column per class) for test rows 50 through 53
test_proba = clf.predict_proba(test[features])
test_proba[50:54]

array([[0. , 0. , 1. ],
       [0. , 0.1, 0.9],
       [0. , 0. , 1. ],
       [0. , 0. , 1. ]])

In [163]:
# Predicted class code for every row of the test split
test_inputs = test[features]
preds = clf.predict(test_inputs)
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 1, 2, 2, 2, 2])

In [182]:
# Predicted class converted to names: the integer codes index into iris.target_names
pred_codes = clf.predict(test[features])
preds = iris.target_names[pred_codes]
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'versicolor', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'versicolor', 'virginica', 'virginica', 'virginica',
       'versicolor', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [183]:
# Actual class codes for the first rows of the test split
test.loc[:, 'target'].head()

3     0
6     0
13    0
15    0
16    0
Name: target, dtype: int64

In [184]:
# Actual class names: look up each true target code in iris.target_names
actual_codes = test['target'].values
iris.target_names[actual_codes]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica', 'virginica', 'virginica'],
      dtype='<U10')

In [185]:
# Comparison of predicted vs actual species names for the first ten test rows
actual_names = iris.target_names[test['target']]
print(preds[:10])
print(actual_names[:10])

['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa']
['setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa' 'setosa'
 'setosa' 'setosa']


In [186]:
# Creating Confusion Matrix: rows are the true species, columns the predicted species
actual_names = iris.target_names[test['target']]
pd.crosstab(
    actual_names,
    preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,18,0,0
versicolor,0,16,0
virginica,0,3,17


In [187]:
# Model Accuracy on the held-out test split.
# Computed from the fitted model instead of being typed in by hand (originally 51/54),
# so the number stays correct whenever the split or the model changes.
clf.score(test[features], test['target'])

0.9444444444444444

In [188]:
# Productionize Model: score new, unseen measurements.
# The samples are wrapped in a DataFrame carrying the same column labels the model
# was fitted on, so each value is explicitly tied to its feature; newer scikit-learn
# releases warn when a model fitted on named columns is given unnamed input.
new_samples = pd.DataFrame(
    [[5.0, 3.6, 1.4, 2.0], [5.0, 2.0, 1.4, 3.0]],
    columns=features,
)
prod_preds = iris.target_names[clf.predict(new_samples)]
prod_preds

array(['versicolor', 'versicolor'], dtype='<U10')