In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier


# Seed NumPy's legacy global RNG so the random draws made later in the
# notebook start from a known state.
np.random.seed(seed=0)


In [4]:
# Load the iris dataset and place its measurements in a DataFrame whose
# columns are named after the dataset's features.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first rows.
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
# Attach each row's class label as a categorical column, translating the
# integer targets into the corresponding species names.
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# Flag roughly 80% of the rows for training; the remaining rows become the
# test set.
#
# A dedicated RandomState is used instead of the global NumPy RNG so the split
# does not depend on how many random numbers other cells have already drawn,
# and re-running this cell always yields the same flags. RandomState(0)
# produces the same stream as a freshly seeded np.random.seed(0).
df['is_train'] = np.random.RandomState(0).uniform(0, 1, len(df)) <= 0.80
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,train,is_train
0,5.1,3.5,1.4,0.2,setosa,True,True
1,4.9,3.0,1.4,0.2,setosa,True,False
2,4.7,3.2,1.3,0.2,setosa,True,True
3,4.6,3.1,1.5,0.2,setosa,True,True
4,5.0,3.6,1.4,0.2,setosa,True,True


In [10]:
# Partition the rows using the boolean is_train flag: flagged rows go to the
# training frame, all others to the test frame.
train = df[df['is_train']]
test = df[~df['is_train']]

# Report the size of each partition.
print('The number of observation in the training data: ', len(train))
print('The number of observation in the test data: ', len(test))

The number of observation in the training data:  118
The number of observation in the test data:  32


In [11]:
# Predictor columns: the measurement columns, taken by name from the dataset
# rather than by position, so extra or reordered DataFrame columns cannot
# change which features the model sees.
features = pd.Index(iris.feature_names)
features


Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [13]:
# Encode each training label as its category code.
#
# The codes index directly into iris.target_names (the categories the
# 'species' column was built from), which is what the later lookup
# iris.target_names[clf.predict(...)] relies on. pd.factorize would instead
# number the species in order of first appearance in `train`, which matches
# that lookup only by coincidence of row order.
y = train['species'].cat.codes.to_numpy(dtype=np.int64)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [15]:
# Build the random forest classifier (two parallel jobs, fixed random_state)
# and train it on the feature columns of the training rows.
clf = RandomForestClassifier(random_state=0, n_jobs=2)
clf.fit(X=train[features], y=y)

RandomForestClassifier(n_jobs=2, random_state=0)

In [17]:
# Predict a class code for every row of the held-out test set.
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 2, 2, 2, 2], dtype=int64)

In [18]:
# Per-class predicted probabilities for the first ten test rows.
clf.predict_proba(X=test[features])[:10]

array([[1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [1.  , 0.  , 0.  ],
       [0.01, 0.94, 0.05]])

In [23]:
# Translate the predicted class codes into species names by using them as
# positions into iris.target_names, then show the first ten.
preds = iris.target_names.take(clf.predict(test[features]))
preds[:10]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'versicolor'], dtype='<U10')

In [27]:
# Confusion matrix: actual species (rows) against predicted species (columns).
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,7,0
virginica,0,1,15


In [35]:
# Prediction for a single, manually entered flower. The four values follow
# the order of `features`: sepal length, sepal width, petal length, petal
# width (cm).
#
# The values are wrapped in a DataFrame carrying the training column names:
# the classifier was fitted on a DataFrame, and passing a bare nested list
# makes scikit-learn warn that X does not have valid feature names.
preds = iris.target_names[clf.predict(pd.DataFrame([[5.0, 3.6, 1.4, 2.0]], columns=features))]
preds

array(['setosa'], dtype='<U10')

In [37]:
# Prediction for a second manually entered flower. The four values follow the
# order of `features`: sepal length, sepal width, petal length, petal width
# (cm).
#
# The values are wrapped in a DataFrame carrying the training column names:
# the classifier was fitted on a DataFrame, and passing a bare nested list
# makes scikit-learn warn that X does not have valid feature names.
preds = iris.target_names[clf.predict(pd.DataFrame([[6.8, 4.5, 5.3, 3.2]], columns=features))]
preds

array(['virginica'], dtype='<U10')