# Test Dataset

In [1]:
import numpy as np

In [2]:
# Seed NumPy's global RNG so the random draw below is reproducible.
np.random.seed(1)

In [3]:
# Synthetic sample: 10,000 standard-normal draws, scaled by 5 and shifted by 50.
standard_normal_draws = np.random.randn(10000)
data = standard_normal_draws * 5 + 50

In [4]:
# Report the sample mean and standard deviation to three decimal places.
# The std specifier is `%.3f` (precision 3); `%3.f` would mean width 3 with
# zero decimals and print the std rounded to a whole number.
print('mean= %.3f  std= %.3f' % (np.mean(data), np.std(data)))

mean= 50.049  std=   5


# Method 1 - Standard Deviation Method

In [5]:
# Summary statistics that define the outlier band for this method.
data_mean = np.mean(data)
data_std = np.std(data)

In [6]:
# Half-width of the accepted band: three standard deviations.
cut_off = 3 * data_std

In [7]:
# Band edges: mean minus / plus the cut-off.
lower = data_mean - cut_off
upper = data_mean + cut_off

In [8]:
# Values strictly below `lower` or strictly above `upper` are outliers.
# `data` is a NumPy array, so the test is expressed as a boolean mask.
is_outlier = (data < lower) | (data > upper)
outliers = list(data[is_outlier])

In [9]:
# Keep every observation inside the three-standard-deviation band.
# The bounds are inclusive so this list is the exact complement of `outliers`
# (which uses strict < and >): a value equal to `lower` or `upper` is kept
# here instead of disappearing from both lists.
outliers_removed = [x for x in data if lower <= x <= upper]

In [10]:
# Summarise how many observations were flagged and how many were kept.
outlier_count = len(outliers)
kept_count = len(outliers_removed)
print('Identified outlier = %d' % outlier_count)
print('Non-Outliers observation = %d' % kept_count)

Identified outlier = 29
Non-Outliers observation = 9971


# Method-2 Interquartile Range Method

In [11]:
# Re-seed with the same value so this section draws the same random sample
# as the standard-deviation method above.
np.random.seed(1)

In [12]:
# 10,000 standard-normal draws; they are rescaled in the next cell.
data = np.random.randn(10000)

In [13]:
# Scale by 5 and shift by 50.
# NOTE(review): this rebinds `data` from its own previous value, so re-running
# this cell without first re-running the cell that draws `data` applies the
# transform a second time.
data = 5*data+50

In [14]:
# First and third quartiles, computed in a single percentile call.
q25, q75 = np.percentile(data, [25, 75])

In [16]:
# Interquartile range: distance between the 75th and 25th percentiles.
iqr = q75-q25

In [17]:
# Display the lower quartile, the IQR and the upper quartile for inspection.
q25 , iqr , q75

(46.685375790489445, 6.673668386862346, 53.35904417735179)

In [18]:
# The fences sit 1.5 IQRs beyond the quartiles.
cut_off = 1.5 * iqr

In [19]:
# Lower fence below the first quartile, upper fence above the third.
lower = q25 - cut_off
upper = q75 + cut_off

In [20]:
# Values strictly outside the IQR fences are outliers.
# `data` is a NumPy array, so the test is expressed as a boolean mask.
is_outlier = (data < lower) | (data > upper)
outliers = list(data[is_outlier])

In [21]:
# Keep every observation on or between the IQR fences.
# The bounds are inclusive so this list is the exact complement of `outliers`
# (which uses strict < and >): a value equal to a fence is kept here instead
# of disappearing from both lists.
outliers_removed = [x for x in data if lower <= x <= upper]

In [22]:
# Summarise how many observations were flagged and how many were kept.
outlier_count = len(outliers)
kept_count = len(outliers_removed)
print('Outliers count = %d' % outlier_count)
print('Non-outliers count = %d' % kept_count)

Outliers count = 81
Non-outliers count = 9919


# Method-3 Local Outlier Factor

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [25]:
# Load the Boston housing CSV; the path is relative to the working directory.
df = pd.read_csv('Datasets/BostonHousing.csv')

In [38]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [37]:
# Replace every missing value in the frame with 0.
# NOTE(review): 0 may not be a sensible stand-in for every column (e.g. a
# missing `rm` or `tax`); confirm that zero imputation is intended.
df = df.fillna(0)

In [39]:
# Count remaining missing values per column (all zeros after the fill).
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

In [40]:
# Convert the frame to a NumPy array. This rebinds `data`, which held the
# synthetic 1-D sample in the earlier sections.
data = df.values

In [41]:
# Features are every column but the last; the last column is the target.
x = data[:, :-1]
y = data[:, -1]

In [42]:
# Display the shapes of the feature matrix and the target vector.
tuple(arr.shape for arr in (x, y))

((506, 13), (506,))

In [43]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [44]:
# Display the shapes of the four split arrays.
tuple(arr.shape for arr in (xtrain, xtest, ytrain, ytest))

((354, 13), (152, 13), (354,), (152,))

In [45]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [46]:
# Baseline model: linear regression with scikit-learn's default settings.
model = LinearRegression()

In [47]:
# Fit on the full training split (no outlier filtering yet).
model.fit(xtrain,ytrain)

In [48]:
# Predict targets for the held-out test rows.
ypred = model.predict(xtest)

In [49]:
# Baseline score: mean absolute error on the held-out test split.
error = mean_absolute_error(y_true=ytest, y_pred=ypred)

In [50]:
# Display the baseline test MAE.
error

3.59156515414779

In [51]:
from sklearn.neighbors import LocalOutlierFactor

In [52]:
# Reload the same CSV so the LOF experiment starts from the raw file again.
df = pd.read_csv('Datasets/BostonHousing.csv')

In [53]:
# Replace every missing value in the frame with 0.
# NOTE(review): 0 may not be a sensible stand-in for every column; confirm
# that zero imputation is intended.
df = df.fillna(0)

In [54]:
# Convert the frame to a NumPy array for slicing into features and target.
data = df.values

In [55]:
# Features are every column but the last; the last column is the target.
x = data[:, :-1]
y = data[:, -1]

In [56]:
# Same 70/30 split and seed as the baseline, so both runs share a test set.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [60]:
# Display the shapes of the four split arrays before filtering.
tuple(arr.shape for arr in (xtrain, xtest, ytrain, ytest))

((354, 13), (152, 13), (354,), (152,))

In [58]:
# Local Outlier Factor detector with scikit-learn's default hyperparameters.
lof = LocalOutlierFactor()

In [61]:
# Fit LOF on the training features only and label each training row:
# 1 for an inlier, -1 for an outlier.
yhat = lof.fit_predict(xtrain)

In [66]:
# Display the per-row LOF labels.
yhat

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,
        1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [62]:
# Boolean mask: True for every training row whose LOF label is not -1.
mask = yhat != -1

In [64]:
# Keep only the training rows selected by `mask`.
# NOTE(review): this rebinds xtrain/ytrain, so re-running this cell without
# first re-creating the split leaves `mask` with the wrong length.
xtrain = xtrain[mask, :]
ytrain = ytrain[mask]

In [65]:
# Display the training-set shapes after filtering.
tuple(arr.shape for arr in (xtrain, ytrain))

((320, 13), (320,))

In [67]:
# Fresh linear regression so nothing carries over from the baseline fit.
model = LinearRegression()

In [68]:
# Fit on the filtered training rows only.
model.fit(xtrain,ytrain)

In [69]:
# Predict on the test split, which was not filtered, so the score is
# comparable with the baseline.
ypred = model.predict(xtest)

In [70]:
# Score after outlier removal: mean absolute error on the same test split.
error = mean_absolute_error(y_true=ytest, y_pred=ypred)

In [71]:
# Display the test MAE after outlier removal.
# NOTE(review): in the saved output this value (3.614) is slightly higher than
# the baseline MAE recorded earlier (3.592), i.e. dropping the LOF-flagged rows
# did not improve the fit on this split.
error

3.614204370905514