#### The aim of this notebook is to get descriptive statistics, search for missing values and check the balance between groups of a target value.

In [1]:
import pandas as pd

In [2]:
input_data = pd.read_csv("../data/raw/Graduate - IRISES dataset (2019-06).csv", sep="|")

In [3]:
# Visualize first 5 rows and get basic information about data

In [4]:
input_data.head()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa


In [5]:
input_data["Species"].value_counts()

virginica     50
setosa        50
versicolor    50
Name: Species, dtype: int64

In [6]:
input_data.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length
count,150.0,149.0,150.0
mean,5.779333,3.061745,3.758
std,1.197721,0.433963,1.765298
min,-4.8,2.0,1.0
25%,5.1,2.8,1.6
50%,5.8,3.0,4.35
75%,6.4,3.3,5.1
max,7.9,4.4,6.9


In [7]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
Sepal.Length    150 non-null float64
Sepal.Width     149 non-null float64
Petal.Length    150 non-null float64
Petal.Width     150 non-null object
Species         150 non-null object
dtypes: float64(3), object(2)
memory usage: 7.0+ KB


In [8]:
input_data.isna().any()

Sepal.Length    False
Sepal.Width      True
Petal.Length    False
Petal.Width     False
Species         False
dtype: bool

In [9]:
# Found not numerical values in Petal.Width
# Found missing values in Sepal.Width column

In [9]:
# At first convert commas to dots, and change type to numeric

In [10]:
input_data["Petal.Width"] = input_data["Petal.Width"].str.replace(",", ".")
pd.to_numeric(input_data["Petal.Width"])

1      0.2
2      0.2
3      0.2
4      0.2
5      0.2
      ... 
146    2.3
147    1.9
148    2.0
149    2.3
150    1.8
Name: Petal.Width, Length: 150, dtype: float64

In [11]:
# Then drop rows with missing values

In [12]:
input_data.dropna(inplace=True)
input_data.isna().any()

Sepal.Length    False
Sepal.Width     False
Petal.Length    False
Petal.Width     False
Species         False
dtype: bool

In [13]:
input_data.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length
count,149.0,149.0,149.0
mean,5.781208,3.061745,3.758389
std,1.20154,0.433963,1.771246
min,-4.8,2.0,1.0
25%,5.1,2.8,1.6
50%,5.8,3.0,4.4
75%,6.4,3.3,5.1
max,7.9,4.4,6.9


In [14]:
input_data["Species"].value_counts()

virginica     50
setosa        50
versicolor    49
Name: Species, dtype: int64

In [15]:
# Groups of target value are still balanced after cleaning

In [16]:
# Save cleaned data

In [17]:
input_data.to_csv("../data/interim/Graduate - IRISES dataset (2019-06)_CLEANED.csv", sep="|", index=False)