In [1]:
import pandas as pd
import numpy as np
from plotly import express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Iris.csv')
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [23]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [21]:
df[df['SepalWidthCm'] > 4]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
15,16,5.7,4.4,1.5,0.4,Iris-setosa
32,33,5.2,4.1,1.5,0.1,Iris-setosa
33,34,5.5,4.2,1.4,0.2,Iris-setosa


In [22]:
df[df['PetalWidthCm'] > 1]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
53,54,5.5,2.3,4.0,1.3,Iris-versicolor
54,55,6.5,2.8,4.6,1.5,Iris-versicolor
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [25]:
df[df['PetalWidthCm'] > 2]

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
100,101,6.3,3.3,6.0,2.5,Iris-virginica
102,103,7.1,3.0,5.9,2.1,Iris-virginica
104,105,6.5,3.0,5.8,2.2,Iris-virginica
105,106,7.6,3.0,6.6,2.1,Iris-virginica
109,110,7.2,3.6,6.1,2.5,Iris-virginica
112,113,6.8,3.0,5.5,2.1,Iris-virginica
114,115,5.8,2.8,5.1,2.4,Iris-virginica
115,116,6.4,3.2,5.3,2.3,Iris-virginica
117,118,7.7,3.8,6.7,2.2,Iris-virginica
118,119,7.7,2.6,6.9,2.3,Iris-virginica


## Model 1

In [4]:
y = df['SepalLengthCm']
x = df[['SepalWidthCm']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)
print(x_train.head(5))
print(x_test.head(5))
print(y_train.head(5))
print(y_test.head(5))

    SepalWidthCm
89           2.5
51           3.2
67           2.7
15           4.4
54           2.8
     SepalWidthCm
38            3.0
19            3.8
142           2.7
128           2.8
46            3.8
89    5.5
51    6.4
67    5.8
15    5.7
54    6.5
Name: SepalLengthCm, dtype: float64
38     4.4
19     5.1
142    5.8
128    6.4
46     5.1
Name: SepalLengthCm, dtype: float64


In [5]:
lr = LinearRegression().fit(x_train, y_train)

In [6]:
y_pred = lr.predict(x_test)

In [7]:
print("Pred-Real")
for pred, real in zip([np.round(y, 1) for y in y_pred[:5]], y_test[:5]):
    print(F"{pred}-{real}")

Pred-Real
5.9-4.4
5.9-5.1
6.0-5.8
6.0-6.4
5.9-5.1


In [8]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.5439681070560553

## Model 2

In [9]:
y = df['SepalLengthCm']
x = df[['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [11]:
lr = LinearRegression().fit(x_train, y_train)

In [12]:
y_pred = lr.predict(x_test)

In [13]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.1234258328359523

## Bonus - Categorize species by petal/sepal characteristics

In [14]:
y = df['Species']
x = df[['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'PetalLengthCm']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [15]:
# Train model to find species using sepal width and length, and petal width and length
log_reg = LogisticRegression().fit(x_train, y_train)

In [16]:
predictions = log_reg.predict(x_test)

In [17]:
print(F"Predictions\t\t-\tReal")
for pred, real in zip(predictions[:10], y_test[:10]):
    print(F"{pred}\t\t-\t{real}")

Predictions		-	Real
Iris-versicolor		-	Iris-versicolor
Iris-setosa		-	Iris-setosa
Iris-setosa		-	Iris-setosa
Iris-versicolor		-	Iris-versicolor
Iris-versicolor		-	Iris-versicolor
Iris-virginica		-	Iris-virginica
Iris-versicolor		-	Iris-versicolor
Iris-setosa		-	Iris-setosa
Iris-setosa		-	Iris-setosa
Iris-virginica		-	Iris-virginica


In [18]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [19]:
# Normalize species into 1, 2, 3 values to find mean_squared_error
from enum import Enum
species = Enum('Species', ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
y_pred_norm = [species[x].value for x in predictions]
y_real_norm = [species[x].value for x in y_test]
mean_squared_error(y_pred=y_pred_norm, y_true=y_real_norm)

0.044444444444444446

In [20]:
# Verify actual mismatches
non_matches = [(x, y) for x, y in zip(predictions, y_test) if x != y]
non_matches

[('Iris-virginica', 'Iris-versicolor'), ('Iris-virginica', 'Iris-versicolor')]