In [5]:
#loading the needed libraries and dataset
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

#setting random seed
np.random.seed(0)

In [11]:
#loading the data
iris=load_iris()

#creating the dataframe and the features
df=pd.DataFrame(iris.data, columns=iris.feature_names)

#checking the first few rows of data
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [12]:
#checking the datasets dimensions
df.shape

(150, 4)

In [14]:
# adding a new column for the species name
df['species']=pd.Categorical.from_codes(iris.target, iris.target_names)

df.head

<bound method NDFrame.head of      sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                  5.1               3.5                1.4               0.2   
1                  4.9               3.0                1.4               0.2   
2                  4.7               3.2                1.3               0.2   
3                  4.6               3.1                1.5               0.2   
4                  5.0               3.6                1.4               0.2   
..                 ...               ...                ...               ...   
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4               2.3   
149                5.9               3.0                5.1               1.8  

In [16]:
# creating test and train data
df['is_train']=np.random.uniform(0,1,len(df)) <= .75

#so we assigned a random number between 0, and 1 each row in df;
#if it's <= .75 it is true, else is false

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,train,is_train
0,5.1,3.5,1.4,0.2,setosa,True,True
1,4.9,3.0,1.4,0.2,setosa,True,False
2,4.7,3.2,1.3,0.2,setosa,True,True
3,4.6,3.1,1.5,0.2,setosa,True,True
4,5.0,3.6,1.4,0.2,setosa,True,True


In [18]:
train,test=df[df['is_train']==True], df[df['is_train']==False]

print("train dataset:", train.shape)
print("test dataset:", test.shape)

train dataset: (112, 7)
test dataset: (38, 7)


In [20]:
#creaing list of feature columns
features=df.columns[:4]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [22]:
#importing the LabelEncoder helper to transform our target
from sklearn.preprocessing import LabelEncoder

#let's encode our target 
le=LabelEncoder()
y=le.fit_transform(train['species'])
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [24]:
#creating a random forest classifier
clf=RandomForestClassifier(n_jobs=2, random_state=0)
#n_jobs are the number of jobs to run in paraller
#random_state controls both the randomness of the bootstrapping
#of the samples used when building trees
#and the sampling of the features to consider
#when looking for the best split at each node

#fitting the model
clf.fit(train[features],y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [26]:
# now testing the model and getting predictions
predictions=clf.predict(test[features])
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [27]:
#let's explore the predicted probabilities of some of our data
clf.predict_proba(test[features])[0:8]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.97, 0.03, 0.  ],
       [0.97, 0.03, 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ]])

In [28]:
# so the output above shows the probability of each species!

In [34]:
#mapping names for each predicted plant class
preds=iris.target_names[clf.predict(test[features])]

preds[0:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [36]:
#creating a confusion matrix
pd.crosstab(test['species'],preds,rownames=['Actual Species'],colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,0
virginica,0,0,19


In [41]:
#looks like we have a very high accuracy