In [1]:
import math
from copy import deepcopy

import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# let's load iris and have a look at it

iris_data = load_iris()  # returns a dict
print(iris_data.keys(), '\n')

print('given features:', iris_data['feature_names'])
print('flower names (class names):', iris_data['target_names'])

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']) 

given features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
flower names (class names): ['setosa' 'versicolor' 'virginica']


In [3]:
iris_data["data"]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
iris_data["target"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
# Those are the given details. We're  concerned about 'data' (containing features of flowers) and target (containing classes)

X = np.array(iris_data['data'])
y = np.array(iris_data['target'])

# Data is sorted by y. Let's shake things up a bit

shuffled_idxs = list(range(0, len(y)))
np.random.shuffle(shuffled_idxs)

X = X[shuffled_idxs]
y = y[shuffled_idxs]

In [6]:
print(X.shape)
print(y.shape)

print(np.unique(y))

(150, 4)
(150,)
[0 1 2]


In [7]:
# The next step after getting your hands on a dataset might normally be EDA, but we'll skip that.
# we'll make a logistic regression model for class '0' (named setosa). The model returns 1 for flower IS setosa and 0 for flower is NOT setosa

y[y==0]  = -1  # placeholder value
y[y>0]   =  0
y[y==-1] =  1  # Setosa labels are now 1 in 'y' and other flower's are 0

In [8]:
y

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=123, train_size=0.7)

In [10]:
model = LogisticRegression()
model.fit(X_train, y_train)  # we train on training data only

In [11]:
y_pred = model.predict(X_test)  # to check the performance, we use the testing data split

In [12]:
correct_mask = (y_pred == y_test)  # the predictions from testing features is supposed to be compared to labels of testing data
correct_count = correct_mask.sum()
total_count = len(y_pred)

print(f'The model predicted {correct_count} out of {total_count} predictions')
print(f'The Accuracy is {correct_count/total_count} ({correct_count/total_count*100}%)')

The model predicted 45 out of 45 predictions
The Accuracy is 1.0 (100.0%)


In [16]:
confusion_matrix(y_test, y_pred)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [17]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        30
           1       1.00      1.00      1.00        15

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



In [21]:
plt.plot(x[correct_mask][:, 0], x[correct_mask][:, 1], 'go', label='Predicted correctly')
plt.plot(x[~correct_mask][:, 0], x[~correct_mask][:, 1], 'ro', label='Predicted in-correctly')


plt.legend()
plt.show()

NameError: ignored