In [2]:
# Importing libraries
import numpy as np 
import pandas as pd
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Load the iris dataset and gather features and labels into one DataFrame
iris = load_iris()
iris_df = (
    pd.DataFrame(data=iris.data, columns=iris.feature_names)
    .assign(Species=iris.target)      # integer class label for each flower
)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
# Checking the structure of the data, inconsistencies and data type:
# info() lists every column with its non-null count and dtype, which shows
# whether any values are missing and whether each column has the expected type
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   Species            150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [11]:
# Encode the Species column where setosa becomes 0 and the others become 1.
# 'Species' holds the integer labels from iris.target (setosa is class 0), so the
# comparison has to be against 0; comparing with the string 'Iris-setosa' never
# matches and would label every row as 1.
iris_df['encoded_species'] = (iris_df['Species'] != 0).astype(int)

# Getting the independent variables (features), selected by column name
X = iris_df[list(iris.feature_names)].values

# Getting the dependent variable (target) as a 1-D array of the three species labels.
# scikit-learn estimators expect y with shape (n_samples,); a column vector makes
# fit() emit a DataConversionWarning.
# NOTE(review): the binary 'encoded_species' column is not used as the target here
# because the confusion matrix computed later reports all three species.
y = iris_df['Species'].values

# Scale the data so that it is easier to fit
# NOTE(review): scaling before the split lets test rows influence the scaling
# statistics; consider fitting a scaler on the training rows only.
X = preprocessing.scale(X)

# Split the data into training and test set
# (an integer test_size is an absolute number of test samples: 25 rows)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 25, random_state = 0)

In [12]:
# Fit a model
log_reg = LogisticRegression()
# np.ravel() hands fit() a 1-D label array; passing a column vector makes
# scikit-learn emit a DataConversionWarning. It is a no-op if y_train is already 1-D.
log_reg.fit(X_train, np.ravel(y_train))

# Make a prediction on the test data (kept as a column vector, one row per test sample)
y_pred = log_reg.predict(X_test).reshape(-1, 1)


  y = column_or_1d(y, warn=True)


In [13]:
# Evaluating model performance using a confusion matrix

from sklearn.metrics import confusion_matrix

classes = list(iris.target_names)

# Pin the label order to the class indices so the matrix always has one row and one
# column per species, even if a species is absent from the test split or from the
# predictions; otherwise the matrix shrinks and no longer matches the three names.
conf_mat = confusion_matrix(y_test, y_pred, labels = list(range(len(classes))))

# Rows are the true species, columns are the predicted species
cm_df = pd.DataFrame(conf_mat, columns = classes, index = classes)
cm_df

Unnamed: 0,setosa,versicolor,virginica
setosa,8,0,0
versicolor,0,11,0
virginica,0,0,6


Looking at the confusion matrix above, every test sample lies on the diagonal, so each class has only true positives and there are no false positives or false negatives. The prediction is therefore that the model will have equal precision and recall (both 1.0).

Below are calculations for accuracy, precision and recall for each species group (Iris-setosa and not-Iris-setosa).

In [14]:
# Overall accuracy of the model, from the counts in the confusion matrix

# Diagonal of the confusion matrix: correctly classified samples per species
TP_S, TP_VE, TP_VI = 8, 11, 6        # setosa, versicolor, virginica
TTP = TP_S + TP_VE + TP_VI           # total true positives = 8 + 11 + 6 = 25

# Every off-diagonal cell of the matrix is 0, so there are no errors for any species
FP = 0    # false positives
FN = 0    # false negatives
TN = 0    # true negatives: left at 0 because TTP already counts every correct prediction

# accuracy = correct instances (TP + TN) / all instances (TP + TN + FP + FN)
model_accuracy = (TN + TTP) / (TN + TTP + FN + FP)

# Printing accuracy
print('Model accuracy: ', model_accuracy)

Model accuracy:  1.0


In [15]:
# Precision and recall for the setosa class
# precision: share of setosa predictions that are truly setosa  -> 8 / (0 + 8)
precision_iris_setosa = TP_S / (FP + TP_S)
# recall: share of actual setosa samples that were predicted as setosa  -> 8 / (0 + 8)
recall_iris_setosa = TP_S / (FN + TP_S)

print('Precision for setosa: ', precision_iris_setosa)
print('Recall for setosa: ', recall_iris_setosa)

Precision for setosa:  1.0
Recall for setosa:  1.0


In [16]:
# Precision and recall for the combined not-iris-setosa group
# (iris-versicolor and iris-virginica treated as one class)
tp_not_setosa = TP_VE + TP_VI       # correct predictions across both species

precision_not_iris_setosa = tp_not_setosa / (tp_not_setosa + FP)
recall_not_iris_setosa = tp_not_setosa / (tp_not_setosa + FN)

print('Precision for not-iris-setosa: ', precision_not_iris_setosa)
print('Recall for not-iris-setosa: ', recall_not_iris_setosa)

Precision for not-iris-setosa:  1.0
Recall for not-iris-setosa:  1.0


Looking at the above calculations, my prediction was correct: precision and recall are identical (both 1.0) for Iris-setosa and for not-Iris-setosa.

Commenting on the model accuracy, precision and recall:
Accuracy = 1. This means that the model made zero mistakes (every prediction matched the actual label - 100% correct classification)

Precision = 1. Among all the samples the model predicted as Not-Iris-setosa, all of them were actually not-iris-setosa - there were no false positives.

Recall = 1. Among all the actual not-iris-setosa samples, the model correctly identified all of them - there were no false negatives.

Overall, the model perfectly distinguished Iris-setosa from the other species (no misclassifications at all in the 25-sample test set).
