In [1]:
import pandas as pd
from mlsecu.anomaly_detection_use_case import *
from mlsecu.data_exploration_utils import *
from mlsecu.data_preparation_utils import *
import re
import matplotlib.pyplot as plt

In [2]:
def load_df(name):
    """Load one Excel workbook and keep only its sensor process-value columns.

    A column is kept when its label matches ``<A|F|L|P>IT<3 digits>.Pv``
    (for example ``LIT101.Pv``); every other column is dropped.

    Parameters
    ----------
    name : str or path-like
        Location of the workbook, forwarded unchanged to ``pandas.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The matching columns, in the order they appear in the workbook.
    """
    sensor_regex = re.compile(r'^(A|F|L|P)IT\d{3}\.Pv$')
    raw = pd.read_excel(name)
    # Column labels are not guaranteed to be strings (e.g. a numeric header
    # cell); matching a regex against such a label would raise TypeError, so
    # those labels are skipped explicitly.
    sensor_cols = [
        col for col in raw.columns
        if isinstance(col, str) and sensor_regex.match(col)
    ]
    return raw[sensor_cols]

def load_dfs(names=None):
    """Load several workbooks with ``load_df`` and stack them row-wise.

    Parameters
    ----------
    names : iterable of str, optional
        Workbook paths to load, in order. When omitted, the four
        22 June / 29 June 2020 workbooks under ``data/`` are used.

    Returns
    -------
    pandas.DataFrame
        All rows of the loaded frames, concatenated in the given order.

    Notes
    -----
    ``pd.concat`` is called with its defaults: each frame keeps its own row
    labels (so labels repeat across workbooks), and a column missing from
    one workbook is filled with NaN for that workbook's rows. An empty
    ``names`` makes ``pd.concat`` raise ``ValueError``.
    """
    if names is None:
        names = ['data/22June2020_1.xlsx', 'data/22June2020_2.xlsx',
                 'data/29June2020_1.xlsx', 'data/29June2020_2.xlsx']
    frames = [load_df(name) for name in names]
    return pd.concat(frames)

In [3]:
# Load the sensor columns of the first 22 June workbook and preview its first rows.
df = load_df('data/22June2020_1.xlsx')
df.head()

Unnamed: 0,LIT101.Pv,FIT101.Pv,FIT201.Pv,AIT201.Pv,AIT202.Pv,AIT203.Pv,AIT301.Pv,AIT302.Pv,AIT303.Pv,LIT301.Pv,...,FIT503.Pv,FIT504.Pv,AIT501.Pv,AIT502.Pv,AIT503.Pv,AIT504.Pv,PIT501.Pv,PIT502.Pv,PIT503.Pv,FIT601.Pv
0,695.2841,0.0,0.000256,18.072288,7.519418,135.708786,7.482105,110.4296,40.335487,1006.75311,...,0.116122,0,7.615548,149.6283,46.91105,1.845681,232.376312,2.01839,206.9982,0.000256
1,695.1271,0.0,0.000256,18.104332,7.514612,135.708786,7.482105,110.4296,40.335487,1006.75311,...,0.116122,0,7.612343,149.6283,46.91105,1.845681,232.376312,2.01839,206.9982,0.000256
2,694.930847,0.0,0.000256,18.104332,7.508844,135.708786,7.482105,110.4296,40.335487,1006.75311,...,0.116122,0,7.612343,149.6283,46.91105,1.845681,232.376312,2.01839,206.9982,0.000256
3,694.930847,0.0,0.000256,18.104332,7.504678,135.708786,7.489596,110.4296,40.335487,1006.75311,...,0.116122,0,7.612343,149.525757,46.91105,1.845681,232.376312,2.01839,206.9982,0.000256
4,694.8523,0.0,0.000256,18.104332,7.498911,135.708786,7.489596,110.4296,40.143414,1006.75311,...,0.116122,0,7.611382,149.525757,46.91105,1.845681,232.376312,2.01839,206.9982,0.000256


In [4]:
# Summary statistics per column; include='all' also covers non-numeric columns, if any.
df.describe(include='all')

Unnamed: 0,LIT101.Pv,FIT101.Pv,FIT201.Pv,AIT201.Pv,AIT202.Pv,AIT203.Pv,AIT301.Pv,AIT302.Pv,AIT303.Pv,LIT301.Pv,...,FIT503.Pv,FIT504.Pv,AIT501.Pv,AIT502.Pv,AIT503.Pv,AIT504.Pv,PIT501.Pv,PIT502.Pv,PIT503.Pv,FIT601.Pv
count,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,...,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0,14400.0
mean,745.134493,0.24547,0.921443,45.858387,8.303283,127.926579,7.737137,144.543083,48.071526,937.840665,...,0.071309,0.0,7.607555,160.594976,48.520726,2.107472,145.7737,1.647481,128.579515,0.012382
std,85.034098,1.0022,1.143895,35.980065,0.909122,25.029523,0.338565,30.309694,19.050132,70.627441,...,0.05521,0.0,0.08036,8.354277,3.911336,2.985455,106.528359,0.630686,96.34619,0.139629
min,494.074463,0.0,0.0,14.387337,7.133299,13.919508,7.052628,102.285675,29.003136,790.3422,...,0.001408,0.0,7.437388,146.936676,41.303513,1.115099,12.176365,0.16019,7.786465,0.0
25%,694.656067,0.0,0.000256,18.072288,7.47616,126.19841,7.394295,114.091812,40.015366,876.4179,...,0.002176,0.0,7.533517,152.832611,46.879005,1.691874,12.43271,1.425688,7.994745,0.000256
50%,781.071019,0.0,0.000384,27.653166,8.059984,130.940781,7.734298,153.761444,44.785194,977.31355,...,0.113946,0.0,7.621956,161.343246,47.00718,1.768777,230.549866,2.01839,205.219818,0.000256
75%,815.162048,0.0,2.34287,73.28249,9.370867,139.041275,8.072012,173.78833,54.77303,1006.673,...,0.116122,0.0,7.67707,164.752625,52.4545,2.230197,231.28685,2.066447,205.892715,0.00032
max,816.967651,4.384588,2.413097,110.356316,9.695784,158.728531,8.345636,197.246948,327.1336,1009.35663,...,0.136095,0.0,7.747885,179.748779,55.14612,63.752884,236.1574,4.741614,210.603058,1.805209


In [5]:
# Share of missing values per column, computed in one vectorised pass
# (the mean of a boolean mask is the fraction of True entries).
na_rates = df.isna().mean()
print('Rate of undefined values for each column:')
for col, rate in na_rates.items():
    print(f'{col}: {rate}')

Rate of undefined values for each column:
LIT101.Pv: 0.0
FIT101.Pv: 0.0
FIT201.Pv: 0.0
AIT201.Pv: 0.0
AIT202.Pv: 0.0
AIT203.Pv: 0.0
AIT301.Pv: 0.0
AIT302.Pv: 0.0
AIT303.Pv: 0.0
LIT301.Pv: 0.0
FIT301.Pv: 0.0
LIT401.Pv: 0.0
FIT401.Pv: 0.0
AIT401.Pv: 0.0
AIT402.Pv: 0.0
FIT501.Pv: 0.0
FIT502.Pv: 0.0
FIT503.Pv: 0.0
FIT504.Pv: 0.0
AIT501.Pv: 0.0
AIT502.Pv: 0.0
AIT503.Pv: 0.0
AIT504.Pv: 0.0
PIT501.Pv: 0.0
PIT502.Pv: 0.0
PIT503.Pv: 0.0
FIT601.Pv: 0.0


In [6]:
# NOTE(review): disabled cell. The code is wrapped in a string literal, so
# nothing runs; the notebook merely echoes the string as this cell's output.
'''
all_df = load_dfs()
all_df.head()
'''

'\nall_df = load_dfs()\nall_df.head()\n'

In [7]:
# all_df.describe()

In [8]:
# NOTE(review): disabled cell. The code is wrapped in a string literal, so
# nothing runs; the notebook merely echoes the string as this cell's output.
# If re-enabled, dropna(axis=1) would drop every column containing a NaN.
'''
all_df.dropna(axis=1, inplace=True)
all_df.describe(include='all')
'''

"\nall_df.dropna(axis=1, inplace=True)\nall_df.describe(include='all')\n"

In [9]:
# all_df.head()

In [10]:
# NOTE(review): disabled cell. The code is wrapped in a string literal, so
# nothing runs; the notebook merely echoes the string as this cell's output.
'''
if_outliers = get_list_of_if_outliers(all_df, 0.03)
print(f'Number of outliers: {len(if_outliers)}')
'''

"\nif_outliers = get_list_of_if_outliers(all_df, 0.03)\nprint(f'Number of outliers: {len(if_outliers)}')\n"

In [11]:
# Flag outliers on the single-workbook frame. The 0.03 argument is presumably the
# expected outlier fraction (432 flagged rows = 3% of the 14400 rows) — TODO confirm
# against the definition of get_list_of_if_outliers.
if_outliers = get_list_of_if_outliers(df, 0.03)
print(f'Number of outliers: {len(if_outliers)}')

Number of outliers: 432
