## Random Forests
### Building a Classification Model for Iris Data Set

Lauren Miller

In [77]:
#import libraries
#sklearn popular machine learning python package

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Accessing Dataset

In [78]:
#scikit-learn comes with a few small standard datasets that do not require to download any file from some external website.
#load_iris(*[, return_X_y, as_frame])
#Iris database, first used by Sir R.A. Fisher
#one of the best known database to be found in the pattern recognition literature

iris = datasets.load_iris()

In [79]:
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [80]:
#description of data
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [39]:
#input features

print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [38]:
#output features

print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [26]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [27]:
#data
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [28]:
#assigning input and output variables

X = iris.data
Y = iris.target

In [31]:
X.shape
#150 rows and 4 columns

(150, 4)

In [33]:
Y.shape
#150 rows

(150,)

### Build Classification Model Using Random Forest

In [34]:
clf = RandomForestClassifier()

In [35]:
#call function to fit to X and Y
#Random Forest Classifier is taking in two features, the X input variables, and the Y output variables

clf.fit(X,Y)

In [37]:
print(clf.feature_importances_)
#important in correlation with ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

[0.10571963 0.02599492 0.4203052  0.44798026]


As you can see above the most important feature is the petal width (cm). The second most important feature is the petal length (cm). The least important features are the sepal length (cm) and sepal width (cm). 


In [41]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [48]:
#making prediction with X[0]
#['setosa' 'versicolor' 'virginica']

print(clf.predict(X[[0]]))

['setosa']


In [49]:
#predict probability of X[0]
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [47]:
clf.fit(iris.data, iris.target_names[iris.target])

clf.predicts predicts X[0] as setosa. 

clf.predicts_proba predicts X[0] probability for each of the three classes of flowers. As shown the probability for setosa is 1. The probability for versicolor and virginica is 0. 

### Spliting Data (80/20)

In [53]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
#20% test size 

In [64]:
X_train.shape, Y_train.shape

((120, 4), (120,))

In [65]:
X_test.shape, Y_test.shape
#20% of the data

((30, 4), (30,))

### Rebuilding Model

In [74]:
clf.fit(X_train, Y_train)

In [67]:
print(clf.predict(X[[0]]))

[0]


In [68]:
print(clf.predict_proba(X[[0]]))

[[1. 0. 0.]]


In [75]:
#predicted values
print(clf.predict(X_test))

[2 1 1 1 2 0 1 2 0 2 2 0 0 1 0 1 2 1 1 1 1 1 0 1 0 1 0 2 1 0]


In [70]:
clf.fit(iris.data, iris.target_names[iris.target])

In [72]:
#actual class labels
print(Y_test)

[2 1 1 1 2 0 1 2 0 2 1 0 0 1 0 1 2 1 2 1 1 1 0 1 0 1 0 1 1 0]


In [76]:
#model performance
#accuracy of the model
print(clf.score(X_test, Y_test))

0.9
