In [33]:
#%%writefile outliers_removing.py

# identify outliers with standard deviation

import pandas as pd
import numpy as np
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std

In [34]:
# Read the space-separated sample file into a DataFrame, then preview
# its first rows.
data = pd.read_csv(
    'example.csv',
    sep=' ',
)
data.head()

Unnamed: 0,x,y
0,592.086026,705.922423
1,87.205976,595.025468
2,389.062677,402.234744
3,201.405819,788.103254
4,587.476402,692.690576


In [35]:
# Summary statistics: one entry per column of `data`, in column order.
# `std` is numpy's std, whose default is ddof=0 (population standard
# deviation).
# NOTE: mean and standard deviation are not robust statistics, so a few
# extreme values can dominate both of them.

data_mean = [mean(data[name]) for name in data.columns]
data_std = [std(data[name]) for name in data.columns]

In [36]:
# Show the list of per-column means.
data_mean

[14320570571277.896, 1130706399909.3296]

In [37]:
# Show the list of per-column standard deviations.
data_std

[127284015398036.23, 10049932062978.074]

In [38]:
# Per-column summary table, indexed by the column names of `data`.
# `two_sigma` and `three_sigma` are 2x and 3x the standard deviation.
std_values = np.array(data_std)
stats = pd.DataFrame(
    {
        'data_mean': data_mean,
        'data_std': data_std,
        'two_sigma': 2 * std_values,
        'three_sigma': 3 * std_values,
    },
    index=data.columns,
)

In [39]:
# Show the summary table (mean, std, 2-sigma and 3-sigma per column).
stats

Unnamed: 0,data_mean,data_std,two_sigma,three_sigma
x,14320570000000.0,127284000000000.0,254568000000000.0,381852000000000.0
y,1130706000000.0,10049930000000.0,20099860000000.0,30149800000000.0


In [40]:
# Compute the acceptance band for each column: mean -/+ two standard
# deviations. The bounds are stored as new `lower` / `upper` columns
# of `stats`.

cut_off = np.array(stats['two_sigma'])
centre = np.array(data_mean)
stats['lower'] = centre - cut_off
stats['upper'] = centre + cut_off

In [41]:
# Show the summary table again, now with the `lower` / `upper` bounds.
stats

Unnamed: 0,data_mean,data_std,two_sigma,three_sigma,lower,upper
x,14320570000000.0,127284000000000.0,254568000000000.0,381852000000000.0,-240247500000000.0,268888600000000.0
y,1130706000000.0,10049930000000.0,20099860000000.0,30149800000000.0,-18969160000000.0,21230570000000.0


In [42]:
# Flag every row that lies strictly below `lower` or strictly above
# `upper` for at least one feature, and collect the index labels of
# those rows.

outliers_set = set()
for feat in stats.index:
    low, up = stats.loc[feat, ['lower', 'upper']]
    column = data[feat]
    # Strict comparisons: NaN values compare False and are not flagged.
    out_of_band = column.lt(low) | column.gt(up)
    outliers_set = outliers_set.union(set(column[out_of_band].index))

print(outliers_set)
outliers = data.loc[data.index.isin(outliers_set)]
number_outs = len(outliers)
print('Identified outliers: %d' % number_outs)

{34, 35}
Identified outliers: 2


In [43]:
# Show the rows that were flagged as outliers.
outliers

Unnamed: 0,x,y
34,1145646000000000.0,1500.591
35,1183.748,90456460000000.0


In [44]:
# Keep only the rows whose index label is not in `outliers_set`.

keep_mask = ~data.index.isin(outliers_set)
outliers_removed = data.loc[keep_mask]

number_outs_rem = len(outliers_removed)
print('Non-outlier observations: %d' % number_outs_rem)

Non-outlier observations: 78


In [45]:
# Show the data set after the flagged rows have been dropped.
outliers_removed

Unnamed: 0,x,y
0,592.086026,7.059224e+02
1,87.205976,5.950255e+02
2,389.062677,4.022347e+02
3,201.405819,7.881033e+02
4,587.476402,6.926906e+02
5,1299.179475,1.293489e+03
6,602.074002,7.020022e+02
7,609.816235,6.953751e+02
8,1105.204310,1.510742e+03
9,412.868359,4.171389e+02
