In [None]:
# Loading the scikit-learn helper that provides the iris dataset
from sklearn.datasets import load_iris

# Loading scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so random draws are repeatable
np.random.seed(seed=0)

In [None]:
# Load the iris dataset bundle
iris = load_iris()

# Build a DataFrame holding the four feature variables, one named column each
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Attach each row's species name as a categorical column,
# decoding the integer targets through the dataset's target names
df['species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specis,is_train,species
0,5.1,3.5,1.4,0.2,setosa,True,setosa
1,4.9,3.0,1.4,0.2,setosa,True,setosa
2,4.7,3.2,1.3,0.2,setosa,True,setosa
3,4.6,3.1,1.5,0.2,setosa,True,setosa
4,5.0,3.6,1.4,0.2,setosa,True,setosa


In [None]:
# Creating Test and Train Data
# Flag roughly 75% of the rows as training rows (True) and the rest as test rows (False).
# The draw comes from a dedicated generator seeded with 0 rather than NumPy's global
# random state, so re-running this cell always reproduces the same split instead of
# advancing the global stream and producing a different one each time.
df['is_train'] = np.random.RandomState(0).uniform(0, 1, len(df)) <= .75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),specis,is_train,species
0,5.1,3.5,1.4,0.2,setosa,True,setosa
1,4.9,3.0,1.4,0.2,setosa,False,setosa
2,4.7,3.2,1.3,0.2,setosa,True,setosa
3,4.6,3.1,1.5,0.2,setosa,True,setosa
4,5.0,3.6,1.4,0.2,setosa,True,setosa


In [None]:
 # Split the rows into a training frame and a test frame using the boolean `is_train` flag
train, test = df[df['is_train']], df[~df['is_train']]

# Report the size of each partition, labelling the training and test counts separately
print('Number of observations in the training data: ', len(train))
print('Number of observations in the test data: ', len(test))

Number of observations in the training data:  112
Number of observations in the training data:  38


In [None]:
# Create a list of the feature columns' names.
# The feature columns are the leading columns of df (it was built from iris.data),
# so take as many of them as the dataset declares feature names instead of
# hard-coding the count.
features = df.columns[:len(iris.feature_names)]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [None]:
# Converting each species name into its integer class code.
# The codes are the category codes of the `species` column, whose categories are
# iris.target_names, so code k always means iris.target_names[k]. pd.factorize would
# instead number the species by the order in which they first appear in the training
# rows, which matches iris.target_names only while the rows happen to be sorted by species.
y = train['species'].cat.codes.to_numpy(dtype='int64')

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [None]:
# Configure a random forest classifier (fixed random_state, two parallel jobs)
clf = RandomForestClassifier(random_state=0, n_jobs=2)
# Fit the forest on the training rows' feature columns against the encoded species
clf.fit(X=train[features], y=y)

In [None]:
# Predict the encoded species for every row of the test set
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [None]:
# Show the predicted class probabilities for the first 10 test observations
clf.predict_proba(X=test[features])[:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.97, 0.03, 0.  ],
       [0.97, 0.03, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [None]:
# Translate each predicted class code into the corresponding species name
preds = iris.target_names.take(clf.predict(X=test[features]))
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [None]:
# Show the true (not predicted) species of the first five test observations
test['species'].head(n=5)

1     setosa
5     setosa
6     setosa
13    setosa
14    setosa
Name: species, dtype: category
Categories (3, object): ['setosa', 'versicolor', 'virginica']

In [None]:
# Cross-tabulate actual against predicted species to form the confusion matrix
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19


In [None]:
# Classify two hand-written flower measurements, given in the same order as `features`.
# The samples are wrapped in a DataFrame that carries the training column names, so the
# classifier receives the same feature names it was fitted with instead of bare lists.
new_samples = pd.DataFrame(
    [[5.0, 3.6, 1.4, 2.0], [5.0, 3.6, 1.4, 2.0]],
    columns=features,
)
# NOTE(review): this reassigns `preds`, replacing the test-set predictions computed earlier;
# re-run the test-set prediction cell before rebuilding the confusion matrix.
preds = iris.target_names[clf.predict(new_samples)]
preds



array(['setosa', 'setosa'], dtype='<U10')