In [38]:
import pandas as pd
import numpy as np
from plotly import express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error

In [39]:
import warnings
warnings.filterwarnings('ignore')

In [40]:
df = pd.read_csv('Iris.csv')
df.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

## Model 1

In [41]:
y = df['SepalLengthCm']
x = df[['SepalWidthCm']]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)
print(x_train.head(5))
print(x_test.head(5))
print(y_train.head(5))
print(y_test.head(5))

     SepalWidthCm
142           2.7
16            3.9
84            3.0
72            2.5
59            2.7
     SepalWidthCm
13            3.0
91            3.0
23            3.3
87            2.3
120           3.2
142    5.8
16     5.4
84     5.4
72     6.3
59     5.2
Name: SepalLengthCm, dtype: float64
13     4.3
91     6.1
23     5.1
87     6.3
120    6.9
Name: SepalLengthCm, dtype: float64


In [42]:
lr = LinearRegression().fit(x_train, y_train)

In [43]:
y_pred = lr.predict(x_test)

In [44]:
print("Pred-Real")
for pred, real in zip([np.round(y, 1) for y in y_pred[:5]], y_test[:5]):
    print(F"{pred}-{real}")

Pred-Real
5.9-4.3
5.9-6.1
5.8-5.1
6.0-6.3
5.8-6.9


In [45]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.6556411229253234

## Model 2

In [46]:
y = df['SepalLengthCm']
x = df[['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

In [47]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [48]:
lr = LinearRegression().fit(x_train, y_train)

In [49]:
y_pred = lr.predict(x_test)

In [50]:
mean_squared_error(y_pred=y_pred, y_true=y_test)

0.07561400421305292

## Bonus - Categorize species by petal/sepal characteristics

In [58]:
y = df['Species']
x = df[['SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'PetalLengthCm']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30)

In [59]:
# Train model to find species using sepal width and length, and petal width and length
log_reg = LogisticRegression().fit(x_train, y_train)

In [60]:
predictions = log_reg.predict(x_test)

In [61]:
print(F"Predictions\t\t-\tReal")
for pred, real in zip(predictions[:10], y_test[:10]):
    print(F"{pred}\t\t-\t{real}")

Predictions		-	Real
Iris-versicolor		-	Iris-versicolor
Iris-setosa		-	Iris-setosa
Iris-virginica		-	Iris-virginica
Iris-setosa		-	Iris-setosa
Iris-virginica		-	Iris-virginica
Iris-versicolor		-	Iris-versicolor
Iris-virginica		-	Iris-virginica
Iris-virginica		-	Iris-versicolor
Iris-setosa		-	Iris-setosa
Iris-setosa		-	Iris-setosa


In [62]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [63]:
# Normalize species into 1, 2, 3 values to find mean_squared_error
from enum import Enum
species = Enum('Species', ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'])
y_pred_norm = [species[x].value for x in predictions]
y_real_norm = [species[x].value for x in y_test]
mean_squared_error(y_pred=y_pred_norm, y_true=y_real_norm)

0.022222222222222223

In [67]:
# Verify actual mismatches
non_matches = [(x, y) for x, y in zip(predictions, y_test) if x != y]
non_matches

[('Iris-virginica', 'Iris-versicolor')]