<div class='alert alert-info'>
<h3><center>ANOMALY DETECTION</center></h3>
</div>

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

In [2]:
data = pd.read_csv('SAMPLE-04061235.csv', delimiter = ';')

<div class='alert alert-info'>
<h3><center>Basic Data Characteristics</center></h3>
</div>

In [3]:
data.head()

Unnamed: 0,Date,Time,Temp,Moisture,Baro,Current,X,Y,Z,Mircophone
0,06/04/2023,12:34:56,22.7,42.22,1016.13,740,1.0,2.0,-11.0,
1,06/04/2023,12:34:56,22.7,42.22,1016.13,723,0.0,12.0,-11.0,854.0
2,06/04/2023,12:34:56,22.7,42.22,1016.13,724,0.0,0.0,0.0,
3,06/04/2023,12:34:56,22.7,42.22,1016.13,747,0.0,3.0,-13.0,947.0
4,06/04/2023,12:34:56,22.7,42.22,1016.13,727,1.0,-2.0,-10.0,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3444 entries, 0 to 3443
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Date        3433 non-null   object 
 1   Time        3433 non-null   object 
 2   Temp        3444 non-null   float64
 3   Moisture    3444 non-null   float64
 4   Baro        3444 non-null   float64
 5   Current     3444 non-null   int64  
 6   X           3444 non-null   float64
 7   Y           3444 non-null   float64
 8   Z           3444 non-null   float64
 9   Mircophone  2524 non-null   float64
dtypes: float64(7), int64(1), object(2)
memory usage: 269.2+ KB


In [5]:
data.describe()

Unnamed: 0,Temp,Moisture,Baro,Current,X,Y,Z,Mircophone
count,3444.0,3444.0,3444.0,3444.0,3444.0,3444.0,3444.0,2524.0
mean,22.689114,42.25426,1016.122297,728.684669,0.319686,3.496806,-10.513066,833.578447
std,0.016998,0.039247,0.016296,13.560758,0.890624,4.300267,2.884575,137.992569
min,22.65,42.2,1016.09,691.0,-3.0,-6.0,-16.0,349.0
25%,22.68,42.22,1016.12,718.0,0.0,0.0,-12.0,745.0
50%,22.7,42.25,1016.12,728.0,0.0,3.0,-11.0,837.0
75%,22.7,42.28,1016.13,739.0,1.0,7.0,-9.0,927.0
max,22.71,42.36,1016.15,766.0,4.0,13.0,0.0,1282.0


In [6]:
#Percentage of missing values 
round((pd.isnull(data).sum(axis = 0)/data.shape[0])*100,2)

Date           0.32
Time           0.32
Temp           0.00
Moisture       0.00
Baro           0.00
Current        0.00
X              0.00
Y              0.00
Z              0.00
Mircophone    26.71
dtype: float64

In [7]:
data.drop('Date', axis = 1, inplace = True)

In [8]:
#Utilizing ffill due to time series data 
data['Time'] = data['Time'].fillna(method = 'ffill')

In [9]:
print('Number of Sensor readings per second:',data[data['Time']==data['Time'].iloc[0]].shape[0])

Number of Sensor readings per second: 26


In [10]:
#Percentage of missing values 
round((pd.isnull(data).sum(axis = 0)/data.shape[0])*100,2)

Time           0.00
Temp           0.00
Moisture       0.00
Baro           0.00
Current        0.00
X              0.00
Y              0.00
Z              0.00
Mircophone    26.71
dtype: float64

In [11]:
## As microphone data is not always available, it may not be an important input feed to be considered
## we only use microphone data for correlation analysis with movement

In [12]:
data_environmentfeatures = data[['Temp', 'Moisture', 'Baro', 'Current']]
data_XYZ = data[['X', 'Y', 'Z', 'Mircophone']]

In [13]:
pd.isnull(data_environmentfeatures).sum(axis = 0)/data_environmentfeatures.shape[0]

Temp        0.0
Moisture    0.0
Baro        0.0
Current     0.0
dtype: float64

In [14]:
data_environmentfeatures.head(5)

Unnamed: 0,Temp,Moisture,Baro,Current
0,22.7,42.22,1016.13,740
1,22.7,42.22,1016.13,723
2,22.7,42.22,1016.13,724
3,22.7,42.22,1016.13,747
4,22.7,42.22,1016.13,727


<div class='alert alert-info'>
<h3><center>Anomaly Detection using Environment Variables</center></h3>
</div>

In [15]:
data.shape

(3444, 9)

In [16]:
data['Time'].min(),data['Time'].max()

('12:34:56', '12:35:58')

In [17]:
data.head()

Unnamed: 0,Time,Temp,Moisture,Baro,Current,X,Y,Z,Mircophone
0,12:34:56,22.7,42.22,1016.13,740,1.0,2.0,-11.0,
1,12:34:56,22.7,42.22,1016.13,723,0.0,12.0,-11.0,854.0
2,12:34:56,22.7,42.22,1016.13,724,0.0,0.0,0.0,
3,12:34:56,22.7,42.22,1016.13,747,0.0,3.0,-13.0,947.0
4,12:34:56,22.7,42.22,1016.13,727,1.0,-2.0,-10.0,


In [18]:
data.tail()

Unnamed: 0,Time,Temp,Moisture,Baro,Current,X,Y,Z,Mircophone
3439,12:35:58,22.68,42.28,1016.09,741,-1.0,10.0,-12.0,763.0
3440,12:35:58,22.68,42.28,1016.09,714,1.0,2.0,-14.0,602.0
3441,12:35:58,22.68,42.28,1016.09,722,0.0,7.0,-9.0,1110.0
3442,12:35:58,22.68,42.28,1016.09,743,0.0,1.0,-13.0,597.0
3443,12:35:58,22.68,42.28,1016.09,712,0.0,11.0,-9.0,


# Due to the unavailability of the data from 12:36:00 to 12:59:00, generating synthetic environmental data from 12:36:00 to 12:46:00

<div class='alert alert-info'>
<h3><center>Generating synthetic data for environmental features such as temperature, moisture, barometric pressure (Baro), and current.</center></h3>
</div>

<div class='alert alert-warning'>
<h4><center>Generating random values based on a normal distribution with mean and standard deviation values taken from the 'Temp', 'Moisture', 'Baro', and 'Current' columns</center> </h4>
</div>

In [19]:
np.random.seed(123)
synthetic_temperature = np.random.normal(data_environmentfeatures['Temp'].mean(), 
                                         data_environmentfeatures['Temp'].std(), 
                                         3444*9)

synthetic_moisture = np.random.normal(data_environmentfeatures['Moisture'].mean(), 
                                      data_environmentfeatures['Moisture'].std(), 
                                      3444*9)

synthetic_baro = np.random.normal(data_environmentfeatures['Baro'].mean(), 
                                  data_environmentfeatures['Baro'].std(), 
                                  3444*9)

synthetic_current = np.random.normal(data_environmentfeatures['Current'].mean(), 
                                      data_environmentfeatures['Current'].std(), 
                                      3444*9)

synthetic_times = []

synthetic_times = ['12:35:59']*57

synthetic_times += list(np.array([['12:' + str(i) + ':' + str(j)]*57 if j > 9 else ['12:' + str(i) + ':0' + str(j)]*57 for i in range(36, 46) for j in range(0, 59)]).flatten())

synthetic_times = synthetic_times[:3444*9]

data_synthetic = pd.DataFrame({'Time': synthetic_times,
                               'Temp': synthetic_temperature,
                               'Moisture': synthetic_moisture,
                               'Baro': synthetic_baro,
                               'Current': synthetic_current})

data_synthetic

Unnamed: 0,Time,Temp,Moisture,Baro,Current
0,12:35:59,22.670661,42.227998,1016.124542,712.489602
1,12:35:59,22.706067,42.210816,1016.108416,714.176877
2,12:35:59,22.693924,42.242304,1016.120693,725.806949
3,12:35:59,22.663510,42.262819,1016.106400,724.786459
4,12:35:59,22.679279,42.282455,1016.140752,723.669541
...,...,...,...,...,...
30991,12:45:11,22.708919,42.280005,1016.119619,732.383493
30992,12:45:11,22.698744,42.240887,1016.101636,743.865432
30993,12:45:11,22.692889,42.295926,1016.107279,721.931238
30994,12:45:11,22.675522,42.239768,1016.100668,746.043210


In [20]:
data_timegraph = data[['Time', 'Temp', 'Moisture', 'Baro', 'Current']]
synthetic_data_timegraph = data_synthetic.copy()
synthesized_data = pd.concat([data_timegraph, synthetic_data_timegraph], ignore_index=True, axis = 0)
synthesized_data

Unnamed: 0,Time,Temp,Moisture,Baro,Current
0,12:34:56,22.700000,42.220000,1016.130000,740.000000
1,12:34:56,22.700000,42.220000,1016.130000,723.000000
2,12:34:56,22.700000,42.220000,1016.130000,724.000000
3,12:34:56,22.700000,42.220000,1016.130000,747.000000
4,12:34:56,22.700000,42.220000,1016.130000,727.000000
...,...,...,...,...,...
34435,12:45:11,22.708919,42.280005,1016.119619,732.383493
34436,12:45:11,22.698744,42.240887,1016.101636,743.865432
34437,12:45:11,22.692889,42.295926,1016.107279,721.931238
34438,12:45:11,22.675522,42.239768,1016.100668,746.043210


In [21]:
len(synthesized_data[synthesized_data['Time']=='12:34:56'])

26

<div class='alert alert-info'>
<h3><center>Anomaly detection using Isolation forest</center></h3>
</div>

In [22]:
X = synthesized_data.iloc[:, 1:].values

from sklearn.preprocessing import MinMaxScaler
minmaxscaler = MinMaxScaler().fit(X)
X_scaled = minmaxscaler.transform(X)

In [23]:
model = IsolationForest().fit(X_scaled)
anomaly_scores = -model.score_samples(X_scaled)

In [24]:
synthesized_data['Anomaly Score'] = anomaly_scores
synthesized_data

Unnamed: 0,Time,Temp,Moisture,Baro,Current,Anomaly Score
0,12:34:56,22.700000,42.220000,1016.130000,740.000000,0.403383
1,12:34:56,22.700000,42.220000,1016.130000,723.000000,0.400536
2,12:34:56,22.700000,42.220000,1016.130000,724.000000,0.401406
3,12:34:56,22.700000,42.220000,1016.130000,747.000000,0.421868
4,12:34:56,22.700000,42.220000,1016.130000,727.000000,0.396702
...,...,...,...,...,...,...
34435,12:45:11,22.708919,42.280005,1016.119619,732.383493,0.413861
34436,12:45:11,22.698744,42.240887,1016.101636,743.865432,0.431024
34437,12:45:11,22.692889,42.295926,1016.107279,721.931238,0.426236
34438,12:45:11,22.675522,42.239768,1016.100668,746.043210,0.444217


In [25]:
#Determining the health of the system by 1- anomaly score
synthesized_data['Health Value'] = round((1 - synthesized_data['Anomaly Score'])*100, 2)
synthesized_data['Anomaly Score'] = round(synthesized_data['Anomaly Score']*100, 2)

In [26]:
synthesized_data

Unnamed: 0,Time,Temp,Moisture,Baro,Current,Anomaly Score,Health Value
0,12:34:56,22.700000,42.220000,1016.130000,740.000000,40.34,59.66
1,12:34:56,22.700000,42.220000,1016.130000,723.000000,40.05,59.95
2,12:34:56,22.700000,42.220000,1016.130000,724.000000,40.14,59.86
3,12:34:56,22.700000,42.220000,1016.130000,747.000000,42.19,57.81
4,12:34:56,22.700000,42.220000,1016.130000,727.000000,39.67,60.33
...,...,...,...,...,...,...,...
34435,12:45:11,22.708919,42.280005,1016.119619,732.383493,41.39,58.61
34436,12:45:11,22.698744,42.240887,1016.101636,743.865432,43.10,56.90
34437,12:45:11,22.692889,42.295926,1016.107279,721.931238,42.62,57.38
34438,12:45:11,22.675522,42.239768,1016.100668,746.043210,44.42,55.58


In [27]:
pd.cut(synthesized_data['Health Value'], 5).value_counts()

(55.7, 61.84]      17578
(49.56, 55.7]      11482
(43.42, 49.56]      4492
(37.28, 43.42]       835
(31.109, 37.28]       53
Name: Health Value, dtype: int64

<div class='alert alert-info'>
<h3><center>Storing the Anomaly detection results</center></h3>
</div>

In [28]:
synthesized_data.to_excel('Synthesized_Data_withlabels.xlsx', index = False)

In [29]:
per_second = synthesized_data.groupby('Time')['Health Value'].median().reset_index()

In [30]:
#Binning the Health value into three categories -> Medium, Good, and Poor
per_second['System Health'] = pd.cut(per_second['Health Value'], 3, ordered = True, labels = ['Poor', 'Medium', 'Good'])

In [31]:
pd.cut(per_second['Health Value'], 3, ordered = True, labels = ['Poor', 'Medium', 'Good']).value_counts()

Medium    434
Good      150
Poor        9
Name: Health Value, dtype: int64

In [32]:
per_second.to_excel('per_second.xlsx', index = False)