In [None]:
import numpy as np

# Seed NumPy's global random generator once, up front, so that every call
# that falls back on it produces the same random draws on a fresh run.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

When the data is split into training and testing sets, the rows for the test set are selected at random. <br>
Fixing the random seed at 0 makes those random draws repeatable, so the same rows are selected for testing on every run.

In [None]:
from sklearn.datasets import load_iris

# Load the Iris dataset and print the description text that ships with it.
iris = load_iris()
dataset_description = iris.DESCR
print(dataset_description)

In [None]:
# Build a DataFrame of the measurements plus two target columns.
import pandas as pd

# 'species' holds the numeric class id; 'species_name' holds the same target
# as a categorical built from those ids and the dataset's class names.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species=iris.target,
    species_name=pd.Categorical.from_codes(iris.target, iris.target_names),
)
iris_df

In [None]:
# Explore: preview the first five rows of the frame.
iris_df.head(n=5)

In [None]:
# Visualization: one scatter plot per measurement column.

import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look: doubled font scale on a white-grid background.
sns.set(font_scale=2)
sns.set_style('whitegrid')

# Options shared by every plot: species name on the y-axis and as the colour.
scatter_options = dict(y='species_name', hue='species_name',
                       palette='cool', legend=False)

for feature in iris.feature_names:
    # A fresh 16x9 figure per feature; the plot is drawn on its own axes.
    fig, ax = plt.subplots(figsize=(16, 9))
    sns.scatterplot(data=iris_df, x=feature, ax=ax, **scatter_options)

In [None]:
# Split the data into training and testing subsets.

from sklearn.model_selection import train_test_split

# NOTE(review): columns[0:3] keeps only the first three columns of iris_df, so
# the fourth measurement column is never used as a feature - confirm that
# this is intended.
feature_columns = iris_df.columns[0:3]

# test_size=0.3 holds out 30% of the rows for testing and trains on the
# remaining 70%.
# random_state pins the shuffle to this call, so the split no longer depends
# on the current state of NumPy's global generator (i.e. on which cells were
# run, or re-run, beforehand).
X_train, X_test, y_train, y_test = train_test_split(
    iris_df[feature_columns],
    iris_df['species'],
    test_size=0.3,
    random_state=0,
)

# shape gives the number of rows and columns of the train and test datasets.
print(f'Shape of training set X = {X_train.shape}')
print(f'Shape of testing  set X = {X_test.shape}')

print()

print(f'Shape of training set y = {y_train.shape}')
print(f'Shape of testing  set y = {y_test.shape}')

In [None]:
# Train a random-forest classifier on the training split.
from sklearn.ensemble import RandomForestClassifier as rfc

# random_state=0 fixes the forest's own randomness; n_jobs=2 asks for two
# parallel jobs.
random_forest = rfc(random_state=0, n_jobs=2)
random_forest.fit(X=X_train, y=y_train)

In [None]:
# Keep the true test labels next to the model's predictions for the same rows.
expected = y_test
predicted = random_forest.predict(X_test)

print(f'predicted classes : {predicted}')
print(f'expected classes : {expected}')

In [None]:
# Accuracy: fraction of test rows whose predicted class equals the true class.
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(expected, predicted)

# Three decimals are shown because a single decimal hides almost all of the
# information: every score from roughly 0.85 to 0.95 would print as 0.9.
print(f'accuracy score : {accuracy:.3f}')

In [None]:
# F1 score, micro-averaged across the classes.
from sklearn.metrics import f1_score

# NOTE(review): for a single-label multiclass target the micro-averaged F1
# coincides with accuracy; consider average='macro' if a distinct metric is
# wanted.
micro_f1 = f1_score(y_true=expected, y_pred=predicted, average='micro')
print(micro_f1)

In [None]:
# Confusion matrix to evaluate the model.

from sklearn.metrics import confusion_matrix

# Passing the class ids explicitly keeps the matrix square over all classes
# even if one of them is absent from both the expected and predicted labels;
# without it the fixed-size DataFrame below would fail on a shape mismatch.
class_ids = list(range(len(iris.target_names)))
cm = confusion_matrix(y_true=expected, y_pred=predicted, labels=class_ids)

confusion_df = pd.DataFrame(cm, index=class_ids, columns=class_ids)
axes = sns.heatmap(confusion_df, annot=True, cmap='nipy_spectral_r')

# Rows are the expected (true) classes and columns the predicted ones; label
# the axes so the figure can be read on its own.
axes.set(xlabel='predicted class', ylabel='expected class');

The confusion matrix shows how many test samples were predicted correctly and how many were not. <br>
The numbers on the diagonal are the correctly predicted samples: 16 + 16 + 10 = 42 correct predictions. <br>
The numbers off the diagonal are incorrect predictions: the '2' in the 2nd row and the '1' in the 3rd row mean that 3 samples from the testing set were assigned to the wrong class.