In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Iris dataset from the seaborn-data repository on GitHub
IRIS_CSV_URL = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
iris = pd.read_csv(IRIS_CSV_URL)


In [33]:
# Preview the first five rows to check that the columns loaded as expected
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [54]:
# Preview the last five rows of the dataset
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [34]:
# Dimensions of the dataset as (rows, columns)
iris.shape

(150, 5)

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


Number of samples for each species

In [38]:
# Count the rows per species to check whether the classes are balanced
iris["species"].value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [39]:
# Mean of each measurement column within each species
iris.groupby("species").mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [40]:
# Separate the target column (species) from the four measurement columns
y = iris['species']
x = iris.drop(columns='species')  # columns= already selects the axis, no axis=1 needed
print(x)  # features
print(y)  # labels

     sepal_length  sepal_width  petal_length  petal_width
0             5.1          3.5           1.4          0.2
1             4.9          3.0           1.4          0.2
2             4.7          3.2           1.3          0.2
3             4.6          3.1           1.5          0.2
4             5.0          3.6           1.4          0.2
..            ...          ...           ...          ...
145           6.7          3.0           5.2          2.3
146           6.3          2.5           5.0          1.9
147           6.5          3.0           5.2          2.0
148           6.2          3.4           5.4          2.3
149           5.9          3.0           5.1          1.8

[150 rows x 4 columns]
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


Training data and Test data

In [42]:
# Hold out 10% of the rows for testing and train on the remaining 90%.
# x_train pairs with y_train, and x_test pairs with y_test.
# stratify=y keeps the species proportions the same in both splits;
# random_state=1 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    stratify=y,
    random_state=1,
)

In [43]:
# Show the training features followed by their matching labels
print(x_train, y_train, sep='\n')

     sepal_length  sepal_width  petal_length  petal_width
77            6.7          3.0           5.0          1.7
114           5.8          2.8           5.1          2.4
110           6.5          3.2           5.1          2.0
139           6.9          3.1           5.4          2.1
39            5.1          3.4           1.5          0.2
..            ...          ...           ...          ...
45            4.8          3.0           1.4          0.3
118           7.7          2.6           6.9          2.3
41            4.5          2.3           1.3          0.3
127           6.1          3.0           4.9          1.8
42            4.4          3.2           1.3          0.2

[135 rows x 4 columns]
77     versicolor
114     virginica
110     virginica
139     virginica
39         setosa
          ...    
45         setosa
118     virginica
41         setosa
127     virginica
42         setosa
Name: species, Length: 135, dtype: object


After we split the data, we need to set up the model

In [45]:
# Create a logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [46]:
# Fit the model on the training features (x_train) and their species labels (y_train)
model.fit(x_train, y_train) 

Once the model has been trained on the features and their targets,
we generate predictions for the x_train data and compare them with y_train to measure the training accuracy

In [47]:
# Predict a species label for every row of the training features
x_train_prediction = model.predict(x_train)

# Accuracy on training data: fraction of rows whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

print('Accuracy on training data : ', training_data_accuracy)

Accuracy on training data :  0.9703703703703703


After running the model on training data, we run it on test data. 
Then we find the accuracy of the model on the test data

In [49]:
# Predict a species label for every row of the held-out test features
x_test_prediction = model.predict(x_test)

# Accuracy on test data: fraction of rows whose predicted label matches the true label.
# accuracy_score expects (y_true, y_pred), so the true labels are passed first.
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

print('Accuracy on test data : ', test_data_accuracy)


Accuracy on test data :  0.9333333333333333


Now we can use the trained model to predict the species of an individual flower

In [59]:
# Measurements of a single flower, in the same column order as the training features
input_data = (5.9, 3.2, 5.4, 2.1)

# Wrap the sample in a one-row DataFrame with the training column names,
# because the model was fitted on a DataFrame
input_data_df = pd.DataFrame(
    [input_data],
    columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
)

# predict returns one label per input row, so this holds a single label
individual_prediction = model.predict(input_data_df)

# Report the predicted label itself rather than branching on hard-coded class
# names, so an unexpected label is never silently reported as "virginica"
print(f'The flower is {individual_prediction[0]}')

The flower is virginica
