## Method 01 : Removing outliers using z-score

In [1]:
# import libraries
import numpy as np
import pandas as pd

In [3]:
# Sample data: the ten consecutive ages 20-29 followed by one much larger
# value (75).
data = {'age': list(range(20, 30)) + [75]}

In [21]:
# Wrap the raw dictionary in a pandas DataFrame; each dictionary key
# becomes a column.
df = pd.DataFrame(data=data)
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,age,z-score
0,20,-0.615341
1,21,-0.547653
2,22,-0.479966
3,23,-0.412278
4,24,-0.344591
5,25,-0.276903
6,26,-0.209216
7,27,-0.141528
8,28,-0.073841
9,29,-0.006153


In [22]:
# Standardise each age as z = (x - mean) / std, i.e. its distance from the
# mean expressed in standard deviations.
mean = np.mean(df['age'])
# ddof=0 (divide by N, the population standard deviation) is np.std's
# default; it is written out so the choice is visible to the reader.
std = np.std(df['age'], ddof=0)
# Store the z-score next to the value it was computed from.
df['z-score'] = (df['age'] - mean) / std
df

Unnamed: 0,age,z-score
0,20,-0.615341
1,21,-0.547653
2,22,-0.479966
3,23,-0.412278
4,24,-0.344591
5,25,-0.276903
6,26,-0.209216
7,27,-0.141528
8,28,-0.073841
9,29,-0.006153


In [23]:
# Flag outliers: rows whose z-score is 3 or more standard deviations away
# from the mean in EITHER direction, so unusually low values are caught as
# well as unusually high ones.
outliers = df[(df['z-score'] >= 3) | (df['z-score'] <= -3)]
outliers

Unnamed: 0,age,z-score
10,75,3.107472


In [27]:
# Remove outliers: keep only the rows strictly within 3 standard deviations
# of the mean on both sides (-3 < z < 3). Checking a single side would let
# extreme low values slip through.
df = df[(df['z-score'] > -3) & (df['z-score'] < 3)]
df

Unnamed: 0,age,z-score
0,20,-0.615341
1,21,-0.547653
2,22,-0.479966
3,23,-0.412278
4,24,-0.344591
5,25,-0.276903
6,26,-0.209216
7,27,-0.141528
8,28,-0.073841
9,29,-0.006153


## Method 02 : Removing outliers using Interquartile Range (IQR = Q3 - Q1)

In [58]:
# import libraries
import numpy as np
import pandas as pd

In [59]:
# Sample data for the IQR method: ages 20 through 29 plus a single much
# larger value (75).
data = {'age': [*range(20, 30), 75]}

In [60]:
# Build a fresh DataFrame from the dictionary; each key becomes a column.
df = pd.DataFrame(data=data)
# Bare trailing expression so the notebook renders the frame.
df

Unnamed: 0,age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [61]:
# Interquartile range: IQR = Q3 - Q1, the spread of the middle 50% of ages.
# 'midpoint' averages the two neighbouring data points when a quartile
# falls between them.
# np.percentile's `interpolation=` keyword was renamed to `method=` in
# NumPy 1.22 and the old spelling is deprecated, so `method=` is used here.
Q1 = np.percentile(df['age'], 25, method='midpoint')
Q3 = np.percentile(df['age'], 75, method='midpoint')
IQR = Q3 - Q1
IQR

5.0

In [62]:
# Outlier cut-offs: 1.5 * IQR below the first quartile and 1.5 * IQR above
# the third quartile.
fence_width = 1.5 * IQR
lower_bound, upper_bound = Q1 - fence_width, Q3 + fence_width
lower_bound, upper_bound

(15.0, 35.0)

In [63]:
# Display the current contents of `df`; nothing is modified by this cell.
df

Unnamed: 0,age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29


In [65]:
# Outliers are the rows whose age lies strictly outside the interval
# [lower_bound, upper_bound].
below_fence = df['age'] < lower_bound
above_fence = df['age'] > upper_bound
outliers = df[below_fence | above_fence]
outliers

Unnamed: 0,age
10,75


In [45]:
# Remove the outliers by keeping only ages inside the closed interval
# [lower_bound, upper_bound]; Series.between includes both end points by
# default.
df = df[df['age'].between(lower_bound, upper_bound)]
df

Unnamed: 0,age
0,20
1,21
2,22
3,23
4,24
5,25
6,26
7,27
8,28
9,29
