### Building an Iris Model using Random Forest Algorithm


In [1]:
import pandas as pd

In [2]:
# Read the iris measurements from a CSV file resolved relative to the working directory.
csv_path = 'iris (2).csv'
iris_data = pd.read_csv(csv_path)

In [3]:
# Preview the first 10 rows of the loaded frame.
iris_data.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Preview the last 10 rows of the loaded frame.
iris_data.tail(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
140,6.7,3.1,5.6,2.4,Virginica
141,6.9,3.1,5.1,2.3,Virginica
142,5.8,2.7,5.1,1.9,Virginica
143,6.8,3.2,5.9,2.3,Virginica
144,6.7,3.3,5.7,2.5,Virginica
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica


In [6]:
# Count the missing values in each column.
iris_data.isna().sum()

sepal.length    0
sepal.width     0
petal.length    0
petal.width     0
variety         0
dtype: int64

In [7]:
# Count the rows flagged as duplicates by DataFrame.duplicated().
# NOTE(review): no later cell in this notebook drops the flagged row — confirm that is intended.
iris_data.duplicated().sum()

1

In [8]:
# Pairwise correlation between the numeric measurement columns.
# The string column `variety` is excluded explicitly: recent pandas releases raise
# on non-numeric columns in corr() instead of silently dropping them.
iris_data.select_dtypes(include='number').corr()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
sepal.length,1.0,-0.11757,0.871754,0.817941
sepal.width,-0.11757,1.0,-0.42844,-0.366126
petal.length,0.871754,-0.42844,1.0,0.962865
petal.width,0.817941,-0.366126,0.962865,1.0


In [9]:
# Strip the dots from the four measurement column names; `variety` keeps its name.
column_renames = {
    'sepal.length': 'sepallength',
    'sepal.width': 'sepalwidth',
    'petal.length': 'petallength',
    'petal.width': 'petalwidth',
}
data = iris_data.rename(columns=column_renames)

In [10]:
# Confirm the renamed columns by previewing the first rows.
data.head()

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


### Feature and Target Selection

In [11]:
# Separate the four measurement columns (model inputs) from the label column.
feature_columns = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
x = data.loc[:, feature_columns]
y = data.loc[:, 'variety']

In [12]:
# Show the (rows, columns) size of the renamed frame.
data.shape

(150, 5)

In [13]:
# Count how many rows carry each label value.
y.value_counts()

Setosa        50
Versicolor    50
Virginica     50
Name: variety, dtype: int64

In [14]:
# Integer codes for the three variety labels.
# NOTE(review): the mapped Series is only displayed here, never assigned, so `y`
# still holds the original string labels in the cells that follow.
variety_codes = {'Setosa': 0, 'Versicolor': 1, 'Virginica': 2}

y.map(variety_codes)

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: variety, Length: 150, dtype: int64

In [15]:
# Re-count the label values; `y` itself was not reassigned by the mapping cell above.
y.value_counts()

Setosa        50
Versicolor    50
Virginica     50
Name: variety, dtype: int64

In [16]:
# importing relevant libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Seed the forest as well: without random_state every run grows different trees,
# so the reported accuracies and the saved model would change from run to run.
model = RandomForestClassifier(n_estimators=5, random_state=0)
model.fit(x_train, y_train)

In [17]:
# Predict on the rows the model was fitted on (this gives an optimistic score).
y_train_pred = model.predict(x_train)

# Score the true labels against the predictions; scoring y_train against itself
# would print 1.0 no matter what the model learned.
print(f'The accuracy for the train data {metrics.accuracy_score(y_train, y_train_pred)}')

The accuracy for the train data 1.0


In [17]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [18]:
# Evaluate the fitted model on the held-out test split.
y_pred = model.predict(x_test)

# accuracy_score expects the true labels first, then the predictions.
print(f'The accuracy for the test data {metrics.accuracy_score(y_test, y_pred)}')

The accuracy for the test data 0.9666666666666667


### Cross Validation

In [20]:
# Estimate accuracy with 5-fold cross-validation over the full feature/label set.

from sklearn.model_selection import cross_val_score

# Seeded so the per-fold scores are repeatable between runs.
model_cv = RandomForestClassifier(n_estimators=5, random_state=0)
scores = cross_val_score(model_cv, x, y, cv=5)

print(f'Accuracy on each fold: {scores}')
# Average over however many folds were returned instead of a hard-coded divisor.
print(f'Mean accuracy across folds: {scores.mean()}')

Accuracy on each fold: [0.96666667 0.96666667 0.93333333 0.93333333 1.        ]
Accuracy on each accuracy: 0.96


In [22]:
from joblib import dump

# Persist the fitted classifier to disk; the call's return value is shown as the cell output.
model_path = 'Iris_model.joblib'
dump(model, model_path)

['Iris_model.joblib']