In [1]:
# Load libraries
import numpy as np
import pandas as pd
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
#Jupyter setup
init_notebook_mode(connected=True)
from plotnine import *

In [2]:
# Read the lidar export, keeping only its first four columns.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a backslash separator only resolves on Windows).
static_test = pd.read_csv('lidar_data/test_stat.csv', usecols=range(4))
# Keep only rows with a strictly positive distance.
static_test = static_test[static_test.Distance > 0]

In [3]:
# Group the filtered lidar rows by (Time, Segment).
# At this point grp_static is a GroupBy object, not a DataFrame.
grp_static=static_test.groupby(['Time','Segment'])

In [4]:
# For every (Time, Segment) group keep the single row with the largest Distance.
# idxmax returns the index label of the maximum, and .loc pulls that full row,
# so all original columns are carried over. grp_static is rebound from the
# GroupBy object to the resulting DataFrame.
grp_static=grp_static.apply(lambda x: x.loc[x.Distance.idxmax()])

In [5]:
# Drop the second-to-last index level (droplevel(-2)).
# NOTE(review): assuming the index here is the (Time, Segment) MultiIndex
# produced by the groupby, this removes Time and leaves Segment as the
# index - confirm.
grp_static.index = grp_static.index.droplevel(-2)

In [11]:
# Cast the Segment column to int and then to a categorical,
# and finally rename the column to lowercase "segment".
# NOTE(review): the pivot_table call further down still refers to 'Segment';
# presumably that resolves to the index level of the same name rather than
# this renamed column - confirm before reordering these cells.
grp_static.Segment=grp_static.Segment.astype('int')
grp_static.Segment=grp_static.Segment.astype('category')
grp_static=grp_static.rename(columns={"Segment": "segment"})

In [12]:
# Read the ground-truth observations produced by manual annotation.
# Forward slashes keep this relative path valid on Windows, Linux and macOS
# (a backslash separator only resolves on Windows).
observation = pd.read_csv('Groundtruth/obser_stat.csv')

In [13]:
# Reshape to one row per Time with one column per (value, Segment) pair:
# the result has two column levels (Amplitude/Distance on top, Segment below).
# pivot_table aggregates with its default function (mean) if a
# (Time, Segment) pair occurs more than once.
p_dist=pd.pivot_table(grp_static, values=['Amplitude','Distance'], index='Time', columns='Segment')

In [14]:
# Preview the first rows of the pivoted table (display only, no state change).
p_dist.head()

Unnamed: 0_level_0,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,Amplitude,...,Distance,Distance,Distance,Distance,Distance,Distance,Distance,Distance,Distance,Distance
Segment,0,1,2,3,4,5,6,7,8,9,...,6,7,8,9,10,11,12,13,14,15
Time,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
120633.0,116.25,90.75,246.48,83.0,269.28,164.0,53.75,3.75,26.34,14.41,...,5.55,8.35,9.36,12.33,12.31,12.29,12.28,9.07,5.49,4.04
120953.0,116.25,90.5,240.7,82.75,269.43,166.63,53.88,7.5,28.57,14.45,...,5.25,8.05,9.36,12.34,12.31,12.29,12.28,9.37,5.49,4.04
121273.0,116.13,90.63,245.49,82.88,269.25,167.88,52.5,3.75,27.06,14.4,...,5.55,8.35,9.35,12.34,12.32,12.29,12.27,9.37,5.49,4.04
121592.0,116.38,90.88,248.57,83.13,269.25,167.88,54.0,3.75,27.07,14.51,...,5.55,8.35,9.36,12.34,12.32,12.29,12.27,9.07,5.49,4.04
121912.0,116.25,90.75,247.86,83.13,269.32,165.25,53.25,3.75,26.02,14.35,...,5.55,8.35,9.36,12.34,12.31,12.29,12.27,9.07,5.49,4.04


In [16]:
# Merge the lidar table with the manually annotated observations on Time.
# The pivoted lidar table has two column levels while `observation` has one;
# merging frames with different numbers of column levels triggers a warning in
# older pandas and is rejected by newer releases. The lidar columns are
# therefore flattened to single-level labels (Ampl_<segment> / Dist_<segment>)
# and Time is moved from the index into a regular column before merging.
_short_names = {'Amplitude': 'Ampl', 'Distance': 'Dist'}
lidar_wide = p_dist.copy()
lidar_wide.columns = [
    '{}_{}'.format(_short_names.get(value, value), int(segment))
    for value, segment in lidar_wide.columns
]
# Inner join: only timestamps present in both sources are kept.
p_dist = lidar_wide.reset_index().merge(observation, on='Time', how='inner')


merging between different levels can give an unintended result (2 levels on the left, 1 on the right)



In [17]:
# Give the merged table flat, readable column labels:
# Time, Ampl_0..Ampl_15, Dist_0..Dist_15, then the three annotation columns.
segment_ids = range(16)
flat_labels = (
    ['Time']
    + ['Ampl_{}'.format(seg) for seg in segment_ids]
    + ['Dist_{}'.format(seg) for seg in segment_ids]
    + ['Heure', 'Observation', 'Observation_d']
)
# Labels are assigned on a copy instead of calling set_axis(..., inplace=False):
# the `inplace` keyword was removed from set_axis in pandas 2.0, whereas plain
# column assignment works on every pandas version. A length mismatch between
# the labels and the existing columns raises ValueError.
p_dist = p_dist.copy()
p_dist.columns = flat_labels

In this next section, we classify outliers using quartiles (the 1.5 × IQR rule).

In [18]:
# Flag Ampl_0 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a0 = p_dist.Ampl_0.quantile(0.25)
Q3_a0 = p_dist.Ampl_0.quantile(0.75)
IQR_a0 = Q3_a0 - Q1_a0
Min_a0 = Q1_a0 - (1.5 * IQR_a0)
Max_a0 = Q3_a0 + (1.5 * IQR_a0)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a0"] = ((p_dist.Ampl_0 > Max_a0) | (p_dist.Ampl_0 < Min_a0)).astype(float)

In [19]:
# Flag Ampl_1 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a1 = p_dist.Ampl_1.quantile(0.25)
Q3_a1 = p_dist.Ampl_1.quantile(0.75)
IQR_a1 = Q3_a1 - Q1_a1
Min_a1 = Q1_a1 - (1.5 * IQR_a1)
Max_a1 = Q3_a1 + (1.5 * IQR_a1)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a1"] = ((p_dist.Ampl_1 > Max_a1) | (p_dist.Ampl_1 < Min_a1)).astype(float)

In [20]:
# Flag Ampl_2 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a2 = p_dist.Ampl_2.quantile(0.25)
Q3_a2 = p_dist.Ampl_2.quantile(0.75)
IQR_a2 = Q3_a2 - Q1_a2
Min_a2 = Q1_a2 - (1.5 * IQR_a2)
Max_a2 = Q3_a2 + (1.5 * IQR_a2)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a2"] = ((p_dist.Ampl_2 > Max_a2) | (p_dist.Ampl_2 < Min_a2)).astype(float)


In [21]:
# Flag Ampl_3 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a3 = p_dist.Ampl_3.quantile(0.25)
Q3_a3 = p_dist.Ampl_3.quantile(0.75)
IQR_a3 = Q3_a3 - Q1_a3
Min_a3 = Q1_a3 - (1.5 * IQR_a3)
Max_a3 = Q3_a3 + (1.5 * IQR_a3)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a3"] = ((p_dist.Ampl_3 > Max_a3) | (p_dist.Ampl_3 < Min_a3)).astype(float)


In [22]:
# Flag Ampl_4 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a4 = p_dist.Ampl_4.quantile(0.25)
Q3_a4 = p_dist.Ampl_4.quantile(0.75)
IQR_a4 = Q3_a4 - Q1_a4
Min_a4 = Q1_a4 - (1.5 * IQR_a4)
Max_a4 = Q3_a4 + (1.5 * IQR_a4)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a4"] = ((p_dist.Ampl_4 > Max_a4) | (p_dist.Ampl_4 < Min_a4)).astype(float)


In [23]:
# Flag Ampl_5 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a5 = p_dist.Ampl_5.quantile(0.25)
Q3_a5 = p_dist.Ampl_5.quantile(0.75)
IQR_a5 = Q3_a5 - Q1_a5
Min_a5 = Q1_a5 - (1.5 * IQR_a5)
Max_a5 = Q3_a5 + (1.5 * IQR_a5)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a5"] = ((p_dist.Ampl_5 > Max_a5) | (p_dist.Ampl_5 < Min_a5)).astype(float)


In [24]:
# Flag Ampl_6 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a6 = p_dist.Ampl_6.quantile(0.25)
Q3_a6 = p_dist.Ampl_6.quantile(0.75)
IQR_a6 = Q3_a6 - Q1_a6
Min_a6 = Q1_a6 - (1.5 * IQR_a6)
Max_a6 = Q3_a6 + (1.5 * IQR_a6)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a6"] = ((p_dist.Ampl_6 > Max_a6) | (p_dist.Ampl_6 < Min_a6)).astype(float)


In [25]:
# Flag Ampl_7 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a7 = p_dist.Ampl_7.quantile(0.25)
Q3_a7 = p_dist.Ampl_7.quantile(0.75)
IQR_a7 = Q3_a7 - Q1_a7
Min_a7 = Q1_a7 - (1.5 * IQR_a7)
Max_a7 = Q3_a7 + (1.5 * IQR_a7)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a7"] = ((p_dist.Ampl_7 > Max_a7) | (p_dist.Ampl_7 < Min_a7)).astype(float)


In [26]:
# Flag Ampl_8 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a8 = p_dist.Ampl_8.quantile(0.25)
Q3_a8 = p_dist.Ampl_8.quantile(0.75)
IQR_a8 = Q3_a8 - Q1_a8
Min_a8 = Q1_a8 - (1.5 * IQR_a8)
Max_a8 = Q3_a8 + (1.5 * IQR_a8)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a8"] = ((p_dist.Ampl_8 > Max_a8) | (p_dist.Ampl_8 < Min_a8)).astype(float)


In [27]:
# Flag Ampl_9 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_a9 = p_dist.Ampl_9.quantile(0.25)
Q3_a9 = p_dist.Ampl_9.quantile(0.75)
IQR_a9 = Q3_a9 - Q1_a9
Min_a9 = Q1_a9 - (1.5 * IQR_a9)
Max_a9 = Q3_a9 + (1.5 * IQR_a9)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_a9"] = ((p_dist.Ampl_9 > Max_a9) | (p_dist.Ampl_9 < Min_a9)).astype(float)


In [28]:
# Flag Ampl_10 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_10 = p_dist.Ampl_10.quantile(0.25)
Q3_Ampl_10 = p_dist.Ampl_10.quantile(0.75)
IQR_Ampl_10 = Q3_Ampl_10 - Q1_Ampl_10
Min_Ampl_10 = Q1_Ampl_10 - (1.5 * IQR_Ampl_10)
Max_Ampl_10 = Q3_Ampl_10 + (1.5 * IQR_Ampl_10)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_10"] = ((p_dist.Ampl_10 > Max_Ampl_10) | (p_dist.Ampl_10 < Min_Ampl_10)).astype(float)


In [29]:
# Flag Ampl_11 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_11 = p_dist.Ampl_11.quantile(0.25)
Q3_Ampl_11 = p_dist.Ampl_11.quantile(0.75)
IQR_Ampl_11 = Q3_Ampl_11 - Q1_Ampl_11
Min_Ampl_11 = Q1_Ampl_11 - (1.5 * IQR_Ampl_11)
Max_Ampl_11 = Q3_Ampl_11 + (1.5 * IQR_Ampl_11)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_11"] = ((p_dist.Ampl_11 > Max_Ampl_11) | (p_dist.Ampl_11 < Min_Ampl_11)).astype(float)


In [30]:
# Flag Ampl_12 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_12 = p_dist.Ampl_12.quantile(0.25)
Q3_Ampl_12 = p_dist.Ampl_12.quantile(0.75)
IQR_Ampl_12 = Q3_Ampl_12 - Q1_Ampl_12
Min_Ampl_12 = Q1_Ampl_12 - (1.5 * IQR_Ampl_12)
Max_Ampl_12 = Q3_Ampl_12 + (1.5 * IQR_Ampl_12)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_12"] = ((p_dist.Ampl_12 > Max_Ampl_12) | (p_dist.Ampl_12 < Min_Ampl_12)).astype(float)


In [31]:
# Flag Ampl_13 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_13 = p_dist.Ampl_13.quantile(0.25)
Q3_Ampl_13 = p_dist.Ampl_13.quantile(0.75)
IQR_Ampl_13 = Q3_Ampl_13 - Q1_Ampl_13
Min_Ampl_13 = Q1_Ampl_13 - (1.5 * IQR_Ampl_13)
Max_Ampl_13 = Q3_Ampl_13 + (1.5 * IQR_Ampl_13)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_13"] = ((p_dist.Ampl_13 > Max_Ampl_13) | (p_dist.Ampl_13 < Min_Ampl_13)).astype(float)


In [32]:
# Flag Ampl_14 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_14 = p_dist.Ampl_14.quantile(0.25)
Q3_Ampl_14 = p_dist.Ampl_14.quantile(0.75)
IQR_Ampl_14 = Q3_Ampl_14 - Q1_Ampl_14
Min_Ampl_14 = Q1_Ampl_14 - (1.5 * IQR_Ampl_14)
Max_Ampl_14 = Q3_Ampl_14 + (1.5 * IQR_Ampl_14)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_14"] = ((p_dist.Ampl_14 > Max_Ampl_14) | (p_dist.Ampl_14 < Min_Ampl_14)).astype(float)


In [33]:
# Flag Ampl_15 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Ampl_15 = p_dist.Ampl_15.quantile(0.25)
Q3_Ampl_15 = p_dist.Ampl_15.quantile(0.75)
IQR_Ampl_15 = Q3_Ampl_15 - Q1_Ampl_15
Min_Ampl_15 = Q1_Ampl_15 - (1.5 * IQR_Ampl_15)
Max_Ampl_15 = Q3_Ampl_15 + (1.5 * IQR_Ampl_15)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Ampl_15"] = ((p_dist.Ampl_15 > Max_Ampl_15) | (p_dist.Ampl_15 < Min_Ampl_15)).astype(float)


In [34]:
# Flag Dist_0 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_0 = p_dist.Dist_0.quantile(0.25)
Q3_Dist_0 = p_dist.Dist_0.quantile(0.75)
IQR_Dist_0 = Q3_Dist_0 - Q1_Dist_0
Min_Dist_0 = Q1_Dist_0 - (1.5 * IQR_Dist_0)
Max_Dist_0 = Q3_Dist_0 + (1.5 * IQR_Dist_0)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_0"] = ((p_dist.Dist_0 > Max_Dist_0) | (p_dist.Dist_0 < Min_Dist_0)).astype(float)


In [35]:
# Flag Dist_1 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_1 = p_dist.Dist_1.quantile(0.25)
Q3_Dist_1 = p_dist.Dist_1.quantile(0.75)
IQR_Dist_1 = Q3_Dist_1 - Q1_Dist_1
Min_Dist_1 = Q1_Dist_1 - (1.5 * IQR_Dist_1)
Max_Dist_1 = Q3_Dist_1 + (1.5 * IQR_Dist_1)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_1"] = ((p_dist.Dist_1 > Max_Dist_1) | (p_dist.Dist_1 < Min_Dist_1)).astype(float)


In [36]:
# Flag Dist_2 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_2 = p_dist.Dist_2.quantile(0.25)
Q3_Dist_2 = p_dist.Dist_2.quantile(0.75)
IQR_Dist_2 = Q3_Dist_2 - Q1_Dist_2
Min_Dist_2 = Q1_Dist_2 - (1.5 * IQR_Dist_2)
Max_Dist_2 = Q3_Dist_2 + (1.5 * IQR_Dist_2)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_2"] = ((p_dist.Dist_2 > Max_Dist_2) | (p_dist.Dist_2 < Min_Dist_2)).astype(float)


In [37]:
# Flag Dist_3 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_3 = p_dist.Dist_3.quantile(0.25)
Q3_Dist_3 = p_dist.Dist_3.quantile(0.75)
IQR_Dist_3 = Q3_Dist_3 - Q1_Dist_3
Min_Dist_3 = Q1_Dist_3 - (1.5 * IQR_Dist_3)
Max_Dist_3 = Q3_Dist_3 + (1.5 * IQR_Dist_3)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_3"] = ((p_dist.Dist_3 > Max_Dist_3) | (p_dist.Dist_3 < Min_Dist_3)).astype(float)


In [38]:
# Flag Dist_4 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_4 = p_dist.Dist_4.quantile(0.25)
Q3_Dist_4 = p_dist.Dist_4.quantile(0.75)
IQR_Dist_4 = Q3_Dist_4 - Q1_Dist_4
Min_Dist_4 = Q1_Dist_4 - (1.5 * IQR_Dist_4)
Max_Dist_4 = Q3_Dist_4 + (1.5 * IQR_Dist_4)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_4"] = ((p_dist.Dist_4 > Max_Dist_4) | (p_dist.Dist_4 < Min_Dist_4)).astype(float)


In [39]:
# Flag Dist_5 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_5 = p_dist.Dist_5.quantile(0.25)
Q3_Dist_5 = p_dist.Dist_5.quantile(0.75)
IQR_Dist_5 = Q3_Dist_5 - Q1_Dist_5
Min_Dist_5 = Q1_Dist_5 - (1.5 * IQR_Dist_5)
Max_Dist_5 = Q3_Dist_5 + (1.5 * IQR_Dist_5)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_5"] = ((p_dist.Dist_5 > Max_Dist_5) | (p_dist.Dist_5 < Min_Dist_5)).astype(float)


In [40]:
# Flag Dist_6 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_6 = p_dist.Dist_6.quantile(0.25)
Q3_Dist_6 = p_dist.Dist_6.quantile(0.75)
IQR_Dist_6 = Q3_Dist_6 - Q1_Dist_6
Min_Dist_6 = Q1_Dist_6 - (1.5 * IQR_Dist_6)
Max_Dist_6 = Q3_Dist_6 + (1.5 * IQR_Dist_6)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_6"] = ((p_dist.Dist_6 > Max_Dist_6) | (p_dist.Dist_6 < Min_Dist_6)).astype(float)


In [41]:
# Flag Dist_7 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_7 = p_dist.Dist_7.quantile(0.25)
Q3_Dist_7 = p_dist.Dist_7.quantile(0.75)
IQR_Dist_7 = Q3_Dist_7 - Q1_Dist_7
Min_Dist_7 = Q1_Dist_7 - (1.5 * IQR_Dist_7)
Max_Dist_7 = Q3_Dist_7 + (1.5 * IQR_Dist_7)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_7"] = ((p_dist.Dist_7 > Max_Dist_7) | (p_dist.Dist_7 < Min_Dist_7)).astype(float)


In [42]:
# Flag Dist_8 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_8 = p_dist.Dist_8.quantile(0.25)
Q3_Dist_8 = p_dist.Dist_8.quantile(0.75)
IQR_Dist_8 = Q3_Dist_8 - Q1_Dist_8
Min_Dist_8 = Q1_Dist_8 - (1.5 * IQR_Dist_8)
Max_Dist_8 = Q3_Dist_8 + (1.5 * IQR_Dist_8)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_8"] = ((p_dist.Dist_8 > Max_Dist_8) | (p_dist.Dist_8 < Min_Dist_8)).astype(float)


In [43]:
# Flag Dist_9 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_9 = p_dist.Dist_9.quantile(0.25)
Q3_Dist_9 = p_dist.Dist_9.quantile(0.75)
IQR_Dist_9 = Q3_Dist_9 - Q1_Dist_9
Min_Dist_9 = Q1_Dist_9 - (1.5 * IQR_Dist_9)
Max_Dist_9 = Q3_Dist_9 + (1.5 * IQR_Dist_9)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_9"] = ((p_dist.Dist_9 > Max_Dist_9) | (p_dist.Dist_9 < Min_Dist_9)).astype(float)


In [44]:
# Flag Dist_10 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_10 = p_dist.Dist_10.quantile(0.25)
Q3_Dist_10 = p_dist.Dist_10.quantile(0.75)
IQR_Dist_10 = Q3_Dist_10 - Q1_Dist_10
Min_Dist_10 = Q1_Dist_10 - (1.5 * IQR_Dist_10)
Max_Dist_10 = Q3_Dist_10 + (1.5 * IQR_Dist_10)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_10"] = ((p_dist.Dist_10 > Max_Dist_10) | (p_dist.Dist_10 < Min_Dist_10)).astype(float)


In [45]:
# Flag Dist_11 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_11 = p_dist.Dist_11.quantile(0.25)
Q3_Dist_11 = p_dist.Dist_11.quantile(0.75)
IQR_Dist_11 = Q3_Dist_11 - Q1_Dist_11
Min_Dist_11 = Q1_Dist_11 - (1.5 * IQR_Dist_11)
Max_Dist_11 = Q3_Dist_11 + (1.5 * IQR_Dist_11)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_11"] = ((p_dist.Dist_11 > Max_Dist_11) | (p_dist.Dist_11 < Min_Dist_11)).astype(float)


In [46]:
# Flag Dist_12 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_12 = p_dist.Dist_12.quantile(0.25)
Q3_Dist_12 = p_dist.Dist_12.quantile(0.75)
IQR_Dist_12 = Q3_Dist_12 - Q1_Dist_12
Min_Dist_12 = Q1_Dist_12 - (1.5 * IQR_Dist_12)
Max_Dist_12 = Q3_Dist_12 + (1.5 * IQR_Dist_12)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_12"] = ((p_dist.Dist_12 > Max_Dist_12) | (p_dist.Dist_12 < Min_Dist_12)).astype(float)


In [47]:
# Flag Dist_13 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_13 = p_dist.Dist_13.quantile(0.25)
Q3_Dist_13 = p_dist.Dist_13.quantile(0.75)
IQR_Dist_13 = Q3_Dist_13 - Q1_Dist_13
Min_Dist_13 = Q1_Dist_13 - (1.5 * IQR_Dist_13)
Max_Dist_13 = Q3_Dist_13 + (1.5 * IQR_Dist_13)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_13"] = ((p_dist.Dist_13 > Max_Dist_13) | (p_dist.Dist_13 < Min_Dist_13)).astype(float)


In [48]:
# Flag Dist_14 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_14 = p_dist.Dist_14.quantile(0.25)
Q3_Dist_14 = p_dist.Dist_14.quantile(0.75)
IQR_Dist_14 = Q3_Dist_14 - Q1_Dist_14
Min_Dist_14 = Q1_Dist_14 - (1.5 * IQR_Dist_14)
Max_Dist_14 = Q3_Dist_14 + (1.5 * IQR_Dist_14)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_14"] = ((p_dist.Dist_14 > Max_Dist_14) | (p_dist.Dist_14 < Min_Dist_14)).astype(float)


In [49]:
# Flag Dist_15 outliers with the 1.5*IQR rule (1 = outlier, 0 = inlier).
Q1_Dist_15 = p_dist.Dist_15.quantile(0.25)
Q3_Dist_15 = p_dist.Dist_15.quantile(0.75)
IQR_Dist_15 = Q3_Dist_15 - Q1_Dist_15
Min_Dist_15 = Q1_Dist_15 - (1.5 * IQR_Dist_15)
Max_Dist_15 = Q3_Dist_15 + (1.5 * IQR_Dist_15)
# Vectorized fence test over the whole column. NaN values compare False on
# both sides, so missing measurements are flagged 0. Stored as float 0.0/1.0.
p_dist["Out_Dist_15"] = ((p_dist.Dist_15 > Max_Dist_15) | (p_dist.Dist_15 < Min_Dist_15)).astype(float)


In [50]:
# A row is an overall outlier (1) when at least one of the 16 per-segment
# distance flags equals 1, otherwise 0. Only the Out_Dist_* flags are combined
# here; the amplitude flags are not part of this decision.
dist_flag_cols = ['Out_Dist_{}'.format(seg) for seg in range(16)]
# Row-wise "any" over the flag columns; stored as float 0.0/1.0.
p_dist["Outliers"] = p_dist[dist_flag_cols].eq(1).any(axis=1).astype(float)


In [51]:
# Add an empty column that will hold the confusion-matrix label of each row.
# It is created with object dtype because the labels written into it are
# strings (TP/FN/FP/TN); starting from a float NaN column would force an
# implicit dtype upcast on the first string assignment.
p_dist["Mat_conf"] = np.full(len(p_dist), np.nan, dtype=object)

In [52]:
# Class distribution of the ground truth: number of rows per Observation value.
print(p_dist.groupby('Observation').size())

Observation
0     831
1    3643
dtype: int64


In [53]:
# Class distribution of the prediction: number of rows per Outliers flag value.
print(p_dist.groupby('Outliers').size())

Outliers
0.0     826
1.0    3648
dtype: int64


In [54]:
# Label every row with its confusion-matrix cell by comparing the manual
# annotation (Observation) with the predicted flag (Outliers):
#   Observation=1, Outliers=1 -> TP      Observation=1, Outliers=0 -> FN
#   Observation=0, Outliers=1 -> FP      Observation=0, Outliers=0 -> TN
observed_pos = p_dist['Observation'] == 1
observed_neg = p_dist['Observation'] == 0
flagged_pos = p_dist['Outliers'] == 1
flagged_neg = p_dist['Outliers'] == 0
# Work on an object-dtype copy so the string labels never have to be upcast
# into a float column. Rows matching none of the four cases keep their
# existing Mat_conf value.
mat_conf = p_dist['Mat_conf'].astype(object)
mat_conf.loc[observed_pos & flagged_pos] = 'TP'
mat_conf.loc[observed_pos & flagged_neg] = 'FN'
mat_conf.loc[observed_neg & flagged_pos] = 'FP'
mat_conf.loc[observed_neg & flagged_neg] = 'TN'
p_dist['Mat_conf'] = mat_conf

In [55]:
# Summarize columns, non-null counts and dtypes (display only, no state change).
p_dist.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4474 entries, 0 to 4473
Data columns (total 70 columns):
Time             4474 non-null float64
Ampl_0           4472 non-null float64
Ampl_1           4473 non-null float64
Ampl_2           4473 non-null float64
Ampl_3           4473 non-null float64
Ampl_4           4474 non-null float64
Ampl_5           4474 non-null float64
Ampl_6           4473 non-null float64
Ampl_7           4474 non-null float64
Ampl_8           4474 non-null float64
Ampl_9           4474 non-null float64
Ampl_10          4474 non-null float64
Ampl_11          4474 non-null float64
Ampl_12          4474 non-null float64
Ampl_13          4471 non-null float64
Ampl_14          4472 non-null float64
Ampl_15          4474 non-null float64
Dist_0           4472 non-null float64
Dist_1           4473 non-null float64
Dist_2           4473 non-null float64
Dist_3           4473 non-null float64
Dist_4           4474 non-null float64
Dist_5           4474 non-null floa

In [56]:
# Confusion-matrix cell counts: number of rows per Mat_conf label (FN/FP/TN/TP).
print(p_dist.groupby('Mat_conf').size())

Mat_conf
FN     550
FP     555
TN     276
TP    3093
dtype: int64


In [57]:
def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Confusion-matrix counts are read from the Mat_conf labels instead of being
# typed in by hand, so the metrics always match the data currently loaded.
# A label that never occurs counts as 0.
conf_counts = p_dist['Mat_conf'].value_counts()
FN = int(conf_counts.get('FN', 0))
FP = int(conf_counts.get('FP', 0))
TN = int(conf_counts.get('TN', 0))
TP = int(conf_counts.get('TP', 0))
# Sensitivity, hit rate, recall, or true positive rate
TPR = _ratio(TP, TP + FN)
print('TPR=', TPR)
# Specificity or true negative rate
TNR = _ratio(TN, TN + FP)
print('TNR=', TNR)
# Precision or positive predictive value
PPV = _ratio(TP, TP + FP)
print('PPV=', PPV)
# Negative predictive value
NPV = _ratio(TN, TN + FN)
print('NPV=', NPV)
# Fall out or false positive rate
FPR = _ratio(FP, FP + TN)
print('FPR=', FPR)
# False negative rate
FNR = _ratio(FN, TP + FN)
print('FNR=', FNR)
# False discovery rate
FDR = _ratio(FP, TP + FP)
print('FDR=', FDR)
# Overall accuracy
ACC = _ratio(TP + TN, TP + FP + FN + TN)
print('ACC=', ACC)

TPR= 0.8490255284106506
TNR= 0.33212996389891697
PPV= 0.8478618421052632
NPV= 0.3341404358353511
FPR= 0.6678700361010831
FNR= 0.15097447158934943
FDR= 0.15213815789473684
ACC= 0.7530174340634779
