In [1]:
%%writefile outlier.py

import pandas as pd
import numpy as np
# from warnings import warn

# 2018.11.07 Created by Eamon.Zhang

def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
    '''
    Identify outliers based on arbitrary boundaries passed to the function.

    A value is flagged when it is strictly greater than `upper_fence` or
    strictly less than `lower_fence`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the variable to inspect.
    col : column label
        Column of `data` to test.
    upper_fence, lower_fence : scalar
        Upper and lower boundaries.

    Returns
    -------
    outlier_index : pandas.Series of bool
        Mask indexed like `data`; True marks an outlier.
    para : tuple
        `(upper_fence, lower_fence)`, in that order.

    The number and proportion of flagged rows are printed as a side effect.
    '''

    para = (upper_fence, lower_fence)
    values = data[col]
    outlier_index = (values > upper_fence) | (values < lower_fence)

    # Count with sum() instead of value_counts()[1]: the latter raises when
    # no row is flagged because there is then no True entry to look up.
    n_outliers = int(outlier_index.sum())
    n_rows = len(outlier_index)
    print('Num of outlier detected:',n_outliers)
    # Guard the ratio so an empty frame reports 0.0 instead of failing.
    print('Proportion of outlier detected',n_outliers/n_rows if n_rows else 0.0)
    return outlier_index, para



def outlier_detect_IQR(data,col,threshold=3):
    '''
    Outlier detection by the Interquartile Range rule, also known as Tukey's test.

    Computes the 25th and 75th quantiles of the column and their difference
    (the IQR). Any value beyond:
        upper bound = 75th quantile + (IQR * threshold)
        lower bound = 25th quantile - (IQR * threshold)
    is regarded as an outlier. Default threshold is 3.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the variable to inspect.
    col : column label
        Column of `data` to test.
    threshold : number, default 3
        Multiplier applied to the IQR when building the bounds.

    Returns
    -------
    outlier_index : pandas.Series of bool
        Mask indexed like `data`; True marks an outlier.
    para : tuple
        `(Upper_fence, Lower_fence)`, in that order.

    The number and proportion of flagged rows are printed as a side effect.
    '''

    values = data[col]
    # Each quantile is computed once and reused for both the IQR and the fences.
    q25 = values.quantile(0.25)
    q75 = values.quantile(0.75)
    IQR = q75 - q25
    Lower_fence = q25 - (IQR * threshold)
    Upper_fence = q75 + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    outlier_index = (values > Upper_fence) | (values < Lower_fence)

    # Count with sum() instead of value_counts()[1]: the latter raises when
    # no row is flagged because there is then no True entry to look up.
    n_outliers = int(outlier_index.sum())
    n_rows = len(outlier_index)
    print('Num of outlier detected:',n_outliers)
    # Guard the ratio so an empty frame reports 0.0 instead of failing.
    print('Proportion of outlier detected',n_outliers/n_rows if n_rows else 0.0)
    return outlier_index, para


def outlier_detect_mean_std(data,col,threshold=3):
    '''
    Outlier detection by the Mean and Standard Deviation method.

    A value that lies more than `threshold` standard deviations away from the
    mean of the column is identified as an outlier. Default threshold is 3.

    This method can fail to detect outliers because the outliers themselves
    increase the standard deviation. The more extreme the outlier, the more
    the standard deviation is affected.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the variable to inspect.
    col : column label
        Column of `data` to test.
    threshold : number, default 3
        Number of standard deviations allowed on either side of the mean.

    Returns
    -------
    outlier_index : pandas.Series of bool
        Mask indexed like `data`; True marks an outlier.
    para : tuple
        `(Upper_fence, Lower_fence)`, in that order.

    The number and proportion of flagged rows are printed as a side effect.
    '''

    values = data[col]
    # Mean and standard deviation are computed once and shared by both fences.
    center = values.mean()
    spread = values.std()
    Upper_fence = center + threshold * spread
    Lower_fence = center - threshold * spread
    para = (Upper_fence, Lower_fence)
    outlier_index = (values > Upper_fence) | (values < Lower_fence)

    # Count with sum() instead of value_counts()[1]: the latter raises when
    # no row is flagged because there is then no True entry to look up.
    n_outliers = int(outlier_index.sum())
    n_rows = len(outlier_index)
    print('Num of outlier detected:',n_outliers)
    # Guard the ratio so an empty frame reports 0.0 instead of failing.
    print('Proportion of outlier detected',n_outliers/n_rows if n_rows else 0.0)
    return outlier_index, para


def outlier_detect_MAD(data,col,threshold=3.5):
    """
    Outlier detection by the Median and Median Absolute Deviation method (MAD).

    The median of the column is calculated, then the absolute difference
    between each value and that median. The median of those absolute
    differences is the MAD. Each value is converted to a modified z-score,
    0.6745 * (value - median) / MAD, and flagged as an outlier when the
    absolute score exceeds `threshold`. The default threshold is 3.5.

    This method is generally more effective than the mean and standard
    deviation method for detecting outliers, but it can be too aggressive in
    classifying values that are not really extremely different.
    Also, if more than 50% of the data points have the same value, MAD is
    computed to be 0, so any value different from the median is classified as
    an outlier (its score is infinite).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the variable to inspect.
    col : column label
        Column of `data` to test.
    threshold : number, default 3.5
        Cut-off applied to the absolute modified z-score.

    Returns
    -------
    outlier_index : pandas.Series of bool
        Mask indexed like `data`; True marks an outlier. Unlike the other
        detectors in this module, no boundary tuple is returned.

    The number and proportion of flagged rows are printed as a side effect.
    """

    values = data[col]
    median = values.median()
    deviations = values - median
    # pandas' median skips missing values, so a NaN in the column no longer
    # turns the MAD (and with it every score) into NaN.
    median_absolute_deviation = deviations.abs().median()
    # Working on the Series keeps the index of `data`, so the mask can be
    # used to select rows of `data` whatever its index is.
    modified_z_scores = 0.6745 * deviations / median_absolute_deviation
    outlier_index = modified_z_scores.abs() > threshold

    # Count with sum() instead of value_counts()[1]: the latter raises when
    # no row is flagged because there is then no True entry to look up.
    n_outliers = int(outlier_index.sum())
    n_rows = len(outlier_index)
    print('Num of outlier detected:',n_outliers)
    # Guard the ratio so an empty frame reports 0.0 instead of failing.
    print('Proportion of outlier detected',n_outliers/n_rows if n_rows else 0.0)
    return outlier_index


# 2018.11.10 outlier treatment
def impute_outlier_with_arbitrary(data,outlier_index,value,col=None):
    """
    Impute outliers with an arbitrary value.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    outlier_index : boolean mask
        Rows to overwrite (True marks an outlier).
    value : scalar
        Replacement written into the flagged rows.
    col : column label or list of column labels, optional
        Column(s) in which flagged rows are overwritten. A single label is
        treated as a one-element list. When omitted, nothing is replaced.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `data` with the flagged rows of `col` set to `value`.
    """

    # A bare string used to be iterated character by character, which wrote
    # `value` into (and created) one column per character.
    if col is None:
        columns = []
    elif pd.api.types.is_list_like(col):
        columns = list(col)
    else:
        columns = [col]

    data_copy = data.copy(deep=True)
    for name in columns:
        data_copy.loc[outlier_index,name] = value
    return data_copy
    
    
def windsorization(data,col,para,strategy='both'):
    """
    Top-coding and bottom-coding: cap values above an upper bound at that
    bound and/or raise values below a lower bound to that bound.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : column label
        Column to cap.
    para : sequence
        `(upper bound, lower bound)`, in that order, i.e. the tuple returned
        by the detection functions of this module.
    strategy : {'both', 'top', 'bottom'}, default 'both'
        'top' caps only values above the upper bound, 'bottom' only values
        below the lower bound, 'both' applies the two in that order.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `data` with `col` capped.

    Raises
    ------
    ValueError
        If `strategy` is not one of the supported names.
    """

    # An unknown strategy used to fall through every branch and return an
    # untouched copy, which hides typos; reject it explicitly instead.
    if strategy not in ('both', 'top', 'bottom'):
        raise ValueError(
            "strategy must be 'both', 'top' or 'bottom', got %r" % (strategy,))

    upper_bound, lower_bound = para[0], para[1]
    data_copy = data.copy(deep=True)
    if strategy in ('both', 'top'):
        data_copy.loc[data_copy[col]>upper_bound,col] = upper_bound
    if strategy in ('both', 'bottom'):
        data_copy.loc[data_copy[col]<lower_bound,col] = lower_bound
    return data_copy


def drop_outlier(data,outlier_index):
    """
    Discard the rows flagged as outliers.

    `outlier_index` is a boolean mask in which True marks an outlier; the
    rows of `data` where the mask is False are returned.
    """

    keep_rows = ~outlier_index
    return data[keep_rows]


def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
    """
    Impute outliers with the mean, median or most frequent value of that variable.

    The replacement statistic is computed over the whole column, flagged
    rows included, before any row is overwritten.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    col : column label
        Column whose flagged rows are overwritten.
    outlier_index : boolean mask
        Rows to overwrite (True marks an outlier).
    strategy : {'mean', 'median', 'mode'}, default 'mean'
        Statistic used as the replacement. For 'mode', the first value
        returned by `Series.mode()` is used.

    Returns
    -------
    pandas.DataFrame
        Deep copy of `data` with the flagged rows of `col` replaced.

    Raises
    ------
    ValueError
        If `strategy` is not one of the supported names.
    """

    # An unknown strategy used to fall through every branch and return an
    # untouched copy, which hides typos; reject it explicitly instead.
    if strategy not in ('mean', 'median', 'mode'):
        raise ValueError(
            "strategy must be 'mean', 'median' or 'mode', got %r" % (strategy,))

    data_copy = data.copy(deep=True)
    if strategy=='mean':
        replacement = data_copy[col].mean()
    elif strategy=='median':
        replacement = data_copy[col].median()
    else:
        replacement = data_copy[col].mode()[0]
    data_copy.loc[outlier_index,col] = replacement
    return data_copy

Writing outlier.py


## Imports

In [2]:
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
# plt.style.use('seaborn-colorblind')
# %matplotlib inline
import outlier as ot

## Load dataset

In [3]:
# Only these columns are read from the CSV.
use_cols = [
    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',
    'Survived'
]


# The Titanic dataset is read straight from GitHub, so this cell needs network access.
data = pd.read_csv('https://raw.githubusercontent.com/daniel-dc-cd/feature-engineering-and-feature-selection/master/data/titanic.csv', usecols=use_cols)
# NOTE: this is not the last expression of the cell, so the preview is not displayed;
# only the shape printed below appears in the output.
data.head(3)
print(data.shape)

(891, 6)


In [4]:
# List the distinct fares in ascending order to see the range of the variable
# (0.0 up to 512.3292) before choosing outlier boundaries.
pd.Series(data.Fare.unique()).sort_values()

104      0.0000
163      4.0125
245      5.0000
152      6.2375
240      6.4375
         ...   
164    227.5250
75     247.5208
148    262.3750
23     263.0000
127    512.3292
Length: 248, dtype: float64

## Detect by arbitrary boundary
identify outliers based on arbitrary boundaries

In [5]:
# Flag fares above 300 or below 5. `index` is the boolean outlier mask and
# `para` is the (upper bound, lower bound) tuple.
index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [6]:
# Inspect the 19 flagged fares in ascending order: the fares below 5
# (mostly 0.0) come first, followed by the three 512.3292 fares above 300.
data.loc[index,'Fare'].sort_values()

179      0.0000
806      0.0000
732      0.0000
674      0.0000
633      0.0000
597      0.0000
815      0.0000
466      0.0000
481      0.0000
302      0.0000
277      0.0000
271      0.0000
263      0.0000
413      0.0000
822      0.0000
378      4.0125
679    512.3292
737    512.3292
258    512.3292
Name: Fare, dtype: float64

## IQR method
outlier detection by Interquartile Ranges Rule

In [7]:
# IQR rule with threshold=5, i.e. wider fences than the function's default of 3.
index,para = ot.outlier_detect_IQR(data=data,col='Fare',threshold=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 31
Proportion of outlier detected 0.03479236812570146
Upper bound: 146.448 
Lower bound: -107.53760000000001


In [8]:
# Inspect the 31 flagged fares. The lower bound is negative, so every flagged
# fare lies above the upper bound.
data.loc[index,'Fare'].sort_values()

31     146.5208
195    146.5208
305    151.5500
708    151.5500
297    151.5500
498    151.5500
609    153.4625
332    153.4625
268    153.4625
318    164.8667
856    164.8667
730    211.3375
779    211.3375
689    211.3375
377    211.5000
527    221.7792
700    227.5250
716    227.5250
557    227.5250
380    227.5250
299    247.5208
118    247.5208
311    262.3750
742    262.3750
341    263.0000
88     263.0000
438    263.0000
27     263.0000
679    512.3292
258    512.3292
737    512.3292
Name: Fare, dtype: float64

## Mean and Standard Deviation Method
outlier detection by Mean and Standard Deviation Method.

In [9]:
# Flag fares more than 3 standard deviations away from the mean.
index,para = ot.outlier_detect_mean_std(data=data,col='Fare',threshold=3)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 20
Proportion of outlier detected 0.02244668911335578
Upper bound: 181.2844937601173 
Lower bound: -116.87607782296811


In [10]:
# Inspect the 20 flagged fares. The lower bound is negative, so every flagged
# fare lies above the upper bound.
data.loc[index,'Fare'].sort_values()

779    211.3375
730    211.3375
689    211.3375
377    211.5000
527    221.7792
716    227.5250
700    227.5250
380    227.5250
557    227.5250
118    247.5208
299    247.5208
311    262.3750
742    262.3750
27     263.0000
341    263.0000
88     263.0000
438    263.0000
258    512.3292
737    512.3292
679    512.3292
Name: Fare, dtype: float64

## MAD method
outlier detection by Median and Median Absolute Deviation Method (MAD)

In [11]:
# MAD rule with threshold 3.5. It is too aggressive for this dataset: about 18%
# of the cases are flagged as outliers.
# Unlike the other detectors, this one returns only the mask, without a bounds tuple.
index = ot.outlier_detect_MAD(data=data,col='Fare',threshold=3.5)

Num of outlier detected: 160
Proportion of outlier detected 0.17957351290684623


##  Imputation with arbitrary value
impute outliers with arbitrary value

In [12]:
# Any of the detection methods above can supply the outlier mask;
# the arbitrary-boundary detector is used here.
index,para = ot.outlier_detect_arbitrary(data=data,col='Fare',upper_fence=300,lower_fence=5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [13]:
# Slice of the original data that contains flagged rows (e.g. 258 and 263),
# shown here as the "before" view for the treatments that follow.
data[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,512.3292
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,0.0
264,0,3,female,,0,7.75


In [14]:
# Rows 258, 263 and 271 are among the flagged rows, so their Fare becomes -999.
# The function works on a copy; `data` itself is left unchanged.
data2 = ot.impute_outlier_with_arbitrary(data=data,outlier_index=index,
                                         value=-999,col=['Fare'])
data2[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,-999.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,-999.0
264,0,3,female,,0,7.75


## Winsorization
top-coding & bottom-coding: values above an upper bound are capped at that bound, and values below a lower bound are raised to that bound


In [15]:
# Any of the detection methods above can be used; `para` (upper bound, lower bound)
# supplies the caps for the next cell.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [16]:
# Rows 258, 263 and 271 are capped: fares above 300 become 300 (top-coding)
# and fares below 5 become 5 (bottom-coding).

data3 = ot.windsorization(data=data,col='Fare',para=para,strategy='both')
data3[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,300.0
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,5.0
264,0,3,female,,0,7.75


## Discard outliers
Drop the cases that are outliers

In [17]:
# Any of the detection methods above can supply the outlier mask;
# the arbitrary-boundary detector is used here.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [18]:
# Drop the flagged rows.
# The remaining fares range from 5.0 to 263.0: no observation is left with a
# value above 300 or below 5.
data4 = ot.drop_outlier(data=data,outlier_index=index)
print(data4.Fare.max())
print(data4.Fare.min())

263.0
5.0


## Mean/Median/Mode Imputation
replacing the outlier by mean/median/most frequent values of that variable

In [19]:
# Any of the detection methods above can supply the outlier mask;
# the arbitrary-boundary detector is used here.
index,para = ot.outlier_detect_arbitrary(data,'Fare',300,5)
print('Upper bound:',para[0],'\nLower bound:',para[1])
    

Num of outlier detected: 19
Proportion of outlier detected 0.02132435465768799
Upper bound: 300 
Lower bound: 5


In [20]:
# Rows 258, 263 and 271 are among the flagged rows; their Fare is replaced by
# the column mean (32.204208), which is computed with the outliers included.

data5 = ot.impute_outlier_with_avg(data=data,col='Fare',
                                   outlier_index=index,strategy='mean')
data5[255:275]

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Fare
255,1,3,female,29.0,0,15.2458
256,1,1,female,,0,79.2
257,1,1,female,30.0,0,86.5
258,1,1,female,35.0,0,32.204208
259,1,2,female,50.0,0,26.0
260,0,3,male,,0,7.75
261,1,3,male,3.0,4,31.3875
262,0,1,male,52.0,1,79.65
263,0,1,male,40.0,0,32.204208
264,0,3,female,,0,7.75
