# Random Forest Classifier

In [153]:
# Loading Iris dataset
from sklearn.datasets import load_iris
# Loading Random Forest Classifier Library
from sklearn.ensemble import RandomForestClassifier
# Loading Python Libraries
import pandas as pd
import numpy as np
# Setting a random seed
np.random.seed(0)

In [154]:
# Giving it a name
iris = load_iris()
# Creating a dataframe
df = pd.DataFrame(iris.data, columns = iris.feature_names)
# Viewing top 5 rows
df.head()
# print(iris)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [155]:
# No. of rows and no. of columns in the dataset
df.shape

(150, 4)

In [156]:
# Description of the dataset
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [157]:
# Size of the dataset in memory
df.info(verbose = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Columns: 4 entries, sepal length (cm) to petal width (cm)
dtypes: float64(4)
memory usage: 4.8 KB


In [158]:
# Adding a new column
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
# print(iris.target) -> returns an integer array where each integer points to a category in categories (a.k.a. iris.target_names)
# print(iris.target_names) -> returns the categories for the categorical (eg. ['setosa' 'versicolor' 'virginica'])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [170]:
# Splitting the dataset into 75% for training and 25% for testing
df['is_train'] = np.random.uniform(0, 1, len(df)) <= 0.75
# a random number between 0 and 1 is generated for all the rows in the dataset if it is less than 0.75 then true for is_train
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [171]:
train, test = df[df['is_train'] == True], df[df['is_train'] != True]
# train = df[df['is_train'] == True]
# test = df[df['is_train'] == False]
print('Training data: ', len(train))
print('Testing data: ', len(test))

Training data:  112
Testing data:  38


In [172]:
# Creating feature columns
feature_cols = df.columns[:4]
feature_cols

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [173]:
y = pd.factorize(train['species'])[0]
# y.shape -> (112,)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int64)

In [174]:
model = RandomForestClassifier(n_jobs = 2, random_state = 0)
model.fit(train[feature_cols], y) # train[feature_cols] -> actual data, y -> target to be acheived

RandomForestClassifier(n_jobs=2, random_state=0)

In [175]:
x = test[feature_cols]
model.predict(x)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [176]:
# Viewing the predicted probablities
model.predict_proba(x)[12:27] # [starting row: ending row] 

array([[0.  , 1.  , 0.  ],
       [0.  , 0.99, 0.01],
       [0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 0.99, 0.01],
       [0.  , 0.98, 0.02],
       [0.  , 0.23, 0.77],
       [0.  , 0.  , 1.  ],
       [0.  , 0.  , 1.  ],
       [0.  , 0.02, 0.98],
       [0.  , 0.27, 0.73],
       [0.  , 0.01, 0.99],
       [0.  , 0.01, 0.99],
       [0.  , 0.43, 0.57]])

The Output gives an array of three values (Iris Setosa, Iris Verginica, Iris Versicolor) with the probablity of being one of the three.

In [177]:
predictions = iris.target_names[model.predict(x)]
predictions[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [178]:
test['species'].head()

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [180]:
pd.crosstab(test['species'], predictions, rownames = ['Actual Species'], colnames = ['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19


Number of Accurate Predictions => 10 + 9 + 19 = 38, 
Number of Inaccurate Predictions => 0, 
Model Accuracy => 100%

In [187]:
predictions = iris.target_names[model.predict([[5.0, 3.6, 1.4, 2.0], [5.0, 3.6, 1.4, 2.0]])]
predictions



array(['setosa', 'setosa'], dtype='<U10')