In [None]:
!/opt/venv/bin/python -m pip install --upgrade pip
# Handling pip upgrades

import pandas as pd
import numpy as np

# Making plotly as the backend for pandas
!pip install plotly
pd.options.plotting.backend = "plotly"

# Setting the theme
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"

Requirement already up-to-date: pip in /opt/venv/lib/python3.7/site-packages (20.2.2)


In [None]:
iris_set = pd.read_csv("./iris/iris.data")
# add the column names, as it doesn't have it
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_set.columns = attributes
iris_set

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


As we can see from the values, they are already pretty normalised, so we won't be needing to normalise them

Let's make some scatter plots to see how the dataset is like

In [None]:
px.scatter(iris_set, x="sepal_width", y="sepal_length", color="species", title="Based on sepal features")

In [None]:
px.scatter(iris_set, x="petal_width", y="petal_length", color="species", title="Based on petal features")

In [None]:
iris_set.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,149.0,149.0,149.0,149.0
mean,5.848322,3.051007,3.774497,1.205369
std,0.828594,0.433499,1.759651,0.761292
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.4,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
print("Any NaN items:", iris_set.isnull().values.any())

Any NaN items: False


It seems like there are no missing values either, in the dataset

Let's try and find some outliers in the dataset now

In [None]:
# let's find the mean for each row

for col in iris_set.columns[:-1]:
    print("\n{} column has the mean {:.4f}".format(col, iris_set[col].mean()))
    print("{} column has the std  {:.4f}".format(col, iris_set[col].std()))


sepal_length column has the mean 5.8483
sepal_length column has the std  0.8286

sepal_width column has the mean 3.0510
sepal_width column has the std  0.4335

petal_length column has the mean 3.7745
petal_length column has the std  1.7597

petal_width column has the mean 1.2054
petal_width column has the std  0.7613


Finding the outliers now:

We use Z-Score for finding it, and set the threshold to 2

In [None]:
threshold = 2
outlier = set()

for col in iris_set.columns[:-1]:
    row = 0
    std = iris_set[col].std()
    mean = iris_set[col].mean()
    for i in iris_set[col]: 
        z = (i-mean)/std
        if z > threshold:
            outlier.add(row)
        row+=1

Let's print the outlier rows:

In [None]:
iris_set[iris_set.index.isin(outlier)]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
13,5.8,4.0,1.2,0.2,Iris-setosa
14,5.7,4.4,1.5,0.4,Iris-setosa
31,5.2,4.1,1.5,0.1,Iris-setosa
32,5.5,4.2,1.4,0.2,Iris-setosa
104,7.6,3.0,6.6,2.1,Iris-virginica
116,7.7,3.8,6.7,2.2,Iris-virginica
117,7.7,2.6,6.9,2.3,Iris-virginica
121,7.7,2.8,6.7,2.0,Iris-virginica
130,7.9,3.8,6.4,2.0,Iris-virginica
134,7.7,3.0,6.1,2.3,Iris-virginica
