# 03 - Deleting & handling missing values - Iris

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the dataset from this [address](https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data). 

### Step 3. Assign it to a variable called iris

In [5]:
iris = pd.read_csv('iris.data')
iris

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa
...,...,...,...,...,...
144,6.7,3.0,5.2,2.3,Iris-virginica
145,6.3,2.5,5.0,1.9,Iris-virginica
146,6.5,3.0,5.2,2.0,Iris-virginica
147,6.2,3.4,5.4,2.3,Iris-virginica


### Step 4. Call the columns for the dataset as follows:
1. sepal_length (in cm)
2. sepal_width (in cm)
3. petal_length (in cm)
4. petal_width (in cm)
5. class

In [8]:
cols = ['sepal_length (cm)', 'sepal_width (cm)', 'petal_length (cm)', 'petal_width (cm)', 'class']
iris = pd.read_csv('iris.data', names=cols)
iris

Unnamed: 0,sepal_length (cm),sepal_width (cm),petal_length (cm),petal_width (cm),class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Step 5.  Is there any missing value in the dataframe?

In [11]:
all(iris.notnull())

True

### Step 6.  Lets set the values of the rows 10 to 29 of the column 'petal_length' to NaN

In [30]:
iris.loc[10:29, 'petal_length (cm)'] = np.nan
iris.loc[9:30, 'petal_length (cm)']

9     1.5
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
20    NaN
21    NaN
22    NaN
23    NaN
24    NaN
25    NaN
26    NaN
27    NaN
28    NaN
29    NaN
30    1.6
Name: petal_length (cm), dtype: float64

### Step 7. Good, now lets substitute the NaN values to 1.0

In [31]:
iris = iris.fillna(1)
iris.loc[10:29, 'petal_length (cm)']

10    1.0
11    1.0
12    1.0
13    1.0
14    1.0
15    1.0
16    1.0
17    1.0
18    1.0
19    1.0
20    1.0
21    1.0
22    1.0
23    1.0
24    1.0
25    1.0
26    1.0
27    1.0
28    1.0
29    1.0
Name: petal_length (cm), dtype: float64

### Step 8. Now let's delete the column class

In [34]:
iris = iris.drop('class', axis=1)
iris

Unnamed: 0,sepal_length (cm),sepal_width (cm),petal_length (cm),petal_width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### Step 9.  Set the first 3 rows as NaN

In [37]:
iris.loc[:2] = np.nan
iris.loc[:2]

Unnamed: 0,sepal_length (cm),sepal_width (cm),petal_length (cm),petal_width (cm)
0,,,,
1,,,,
2,,,,


### Step 10.  Delete the rows that have NaN

In [40]:
iris = iris.dropna()
iris

Unnamed: 0,sepal_length (cm),sepal_width (cm),petal_length (cm),petal_width (cm)
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### Step 11. Reset the index so it begins with 0 again

In [43]:
iris = iris.reset_index()
iris

Unnamed: 0,index,sepal_length (cm),sepal_width (cm),petal_length (cm),petal_width (cm)
0,3,4.6,3.1,1.5,0.2
1,4,5.0,3.6,1.4,0.2
2,5,5.4,3.9,1.7,0.4
3,6,4.6,3.4,1.4,0.3
4,7,5.0,3.4,1.5,0.2
...,...,...,...,...,...
142,145,6.7,3.0,5.2,2.3
143,146,6.3,2.5,5.0,1.9
144,147,6.5,3.0,5.2,2.0
145,148,6.2,3.4,5.4,2.3
