In [None]:
# Mount Google Drive at /content/drive (Colab-only helper); the dataset CSV
# is later read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the GECCO 2019 water-quality CSV from the mounted Drive.
# The first CSV column is used as the row index.
# (Author's note kept from the original cell: "Colab Notebooks/".)
df = pd.read_csv(
    '/content/drive/My Drive/1_gecco2019_water_quality.csv',
    index_col=0,
)

# Echo the loaded frame as the cell output.
df

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
1,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,False
2,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,False
3,2017-07-01 00:02:00,6.94,8.60220,0.020968,0.126482,3.58318,43.5994,False
4,2017-07-01 00:03:00,6.94,8.60220,0.020972,0.126184,3.58769,43.3704,False
5,2017-07-01 00:04:00,6.94,8.60405,0.020974,0.127908,3.58287,43.1656,False
...,...,...,...,...,...,...,...,...
132476,2017-09-30 23:55:00,10.30,8.56593,0.020724,0.126518,4.53577,56.4686,False
132477,2017-09-30 23:56:00,10.30,8.56593,0.020727,0.126575,4.53008,56.3567,False
132478,2017-09-30 23:57:00,10.30,8.56593,0.020723,0.126512,4.53512,55.0477,False
132479,2017-09-30 23:58:00,10.30,8.56228,0.020720,0.126477,4.54084,55.4052,False


In [None]:
# Encode the boolean Event flag as an integer class label:
#   0 -> anomaly  (Event == True)
#   1 -> normal   (Event == False)
encode_map = {
    False: 1,
    True: 0,
}

# The mapped column is assigned back to the frame rather than calling
# replace(..., inplace=True) on df['Event']: an in-place call on a column
# selection can act on a temporary object and leave df unchanged (this is the
# behaviour under pandas Copy-on-Write).
#
# The mapping is skipped when the column is already integer-typed so that
# re-running this cell cannot flip labels that were encoded by a previous run
# (1 == True and 0 == False as dict keys, so a second pass would invert them).
if not pd.api.types.is_integer_dtype(df['Event']):
    df['Event'] = df['Event'].map(encode_map).astype('int64')

In [None]:
# Class balance check: how many rows carry each encoded Event label (0 / 1).
event_counts = df['Event'].value_counts()
event_counts

1    132268
0       212
Name: Event, dtype: int64

In [None]:
# Impute missing sensor readings with the mean of their own column.
#
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on df[col] operates on a column selection, which
# newer pandas versions warn about and which does not modify df at all under
# Copy-on-Write.
for sensor_col in ['pH', 'Tp', 'Cond', 'Turb', 'SAC', 'PFM']:
    df[sensor_col] = df[sensor_col].fillna(df[sensor_col].mean())

In [None]:
# Re-display the frame after the label-encoding and imputation cells.
df

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
1,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,1
2,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,1
3,2017-07-01 00:02:00,6.94,8.60220,0.020968,0.126482,3.58318,43.5994,1
4,2017-07-01 00:03:00,6.94,8.60220,0.020972,0.126184,3.58769,43.3704,1
5,2017-07-01 00:04:00,6.94,8.60405,0.020974,0.127908,3.58287,43.1656,1
...,...,...,...,...,...,...,...,...
132476,2017-09-30 23:55:00,10.30,8.56593,0.020724,0.126518,4.53577,56.4686,1
132477,2017-09-30 23:56:00,10.30,8.56593,0.020727,0.126575,4.53008,56.3567,1
132478,2017-09-30 23:57:00,10.30,8.56593,0.020723,0.126512,4.53512,55.0477,1
132479,2017-09-30 23:58:00,10.30,8.56228,0.020720,0.126477,4.54084,55.4052,1


In [None]:
# Summarise column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132480 entries, 1 to 132480
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    132480 non-null  object 
 1   Tp      132480 non-null  float64
 2   pH      132480 non-null  float64
 3   Cond    132480 non-null  float64
 4   Turb    132480 non-null  float64
 5   SAC     132480 non-null  float64
 6   PFM     132480 non-null  float64
 7   Event   132480 non-null  int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 9.1+ MB


In [None]:
# Dtype conversion.

# Time: parse the timestamp strings, then store them as their int64
# representation.
parsed_time = pd.to_datetime(df['Time'])
df['Time'] = parsed_time.astype(np.int64)

# Sensor columns: cast each one to float32.
for sensor_col in ('Tp', 'pH', 'Cond', 'Turb', 'SAC', 'PFM'):
    df[sensor_col] = df[sensor_col].astype('float32')

# Confirm the resulting dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132480 entries, 1 to 132480
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    132480 non-null  int64  
 1   Tp      132480 non-null  float32
 2   pH      132480 non-null  float32
 3   Cond    132480 non-null  float32
 4   Turb    132480 non-null  float32
 5   SAC     132480 non-null  float32
 6   PFM     132480 non-null  float32
 7   Event   132480 non-null  int64  
dtypes: float32(6), int64(2)
memory usage: 6.1 MB


In [None]:
# Inputs (X): the columns at positions 1..6 of df (position 0 is excluded).
# Output (y): the last column of df.
feature_columns = df.columns[1:7]
target_column = df.columns[-1]

X = df[feature_columns]
y = df[target_column]
X

Unnamed: 0,Tp,pH,Cond,Turb,SAC,PFM
1,6.94,8.60774,0.020954,0.125931,3.58683,43.755901
2,6.93,8.60589,0.020965,0.127219,3.59025,43.436600
3,6.94,8.60220,0.020968,0.126482,3.58318,43.599400
4,6.94,8.60220,0.020972,0.126184,3.58769,43.370399
5,6.94,8.60405,0.020973,0.127908,3.58287,43.165600
...,...,...,...,...,...,...
132476,10.30,8.56593,0.020724,0.126518,4.53577,56.468601
132477,10.30,8.56593,0.020727,0.126575,4.53008,56.356701
132478,10.30,8.56593,0.020723,0.126512,4.53512,55.047699
132479,10.30,8.56228,0.020720,0.126477,4.54084,55.405201


In [None]:
# Rescale every feature column with MinMaxScaler (default settings).
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set min/max values influence the scaling - consider fitting on the
# training split only.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)
X

array([[0.6737864 , 0.99319005, 0.9026238 , 0.07063466, 0.5878054 ,
        0.03734416],
       [0.6728155 , 0.99297655, 0.9031321 , 0.0713571 , 0.58836585,
        0.02985793],
       [0.6737864 , 0.99255073, 0.9032484 , 0.07094371, 0.5872072 ,
        0.03367478],
       ...,
       [1.        , 0.9883659 , 0.8926773 , 0.07096054, 0.74321   ,
        0.30208665],
       [1.        , 0.98794466, 0.89255667, 0.07094091, 0.7441474 ,
        0.3104685 ],
       [1.        , 0.9881547 , 0.89269453, 0.07099082, 0.7431002 ,
        0.30070108]], dtype=float32)

In [None]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is
# repeatable.
# NOTE(review): the split is not stratified; with a heavily imbalanced Event
# label, consider passing stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Oversample the training split with SMOTE (fixed seed).
# Only x_train / y_train are resampled; the test split is not touched here.
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=42)
resampled = sm.fit_resample(x_train, y_train)
x_train, y_train = resampled



In [None]:
# One-hot encode the integer class labels of both splits.
y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

# Add a trailing axis of length 1 so each sample is (n_columns, 1), the
# layout the Conv1D input layer is declared with.
n_train_rows, n_train_cols = x_train.shape[0], x_train.shape[1]
x_train = x_train.reshape(n_train_rows, n_train_cols, 1)

n_test_rows, n_test_cols = x_test.shape[0], x_test.shape[1]
x_test = np.asarray(x_test).reshape(n_test_rows, n_test_cols, 1)

# Report the final array shapes: train inputs, train labels, test inputs,
# test labels.
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

(211610, 6, 1)
(211610, 2)
(26496, 6, 1)
(26496, 2)


In [None]:
# Training hyper-parameters. `verbose` is now actually passed to fit(); it is
# set to 1 because that is the verbosity the original fit() call used.
verbose, epochs, batch_size = 1, 100, 64

# x_train is (n_samples, n_timesteps, n_features) after the reshape cell, so
# the sequence length is axis 1 and the channel count is axis 2. (The previous
# code stored the sample count in n_timesteps.)
n_timesteps, n_features = x_train.shape[1], x_train.shape[2]
# Number of one-hot output classes.
n_outputs = y_train.shape[1]

#Define CNN Model
# Two stacked 1-D convolutions -> dropout -> max-pooling -> dense classifier
# with a softmax over the output classes.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=2, activation='relu',
                                 input_shape=(n_timesteps, n_features)))
model.add(tf.keras.layers.Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(n_outputs, activation='softmax'))

# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# and rejected by newer Keras releases.
opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)

# categorical_crossentropy is the loss that matches a softmax output trained
# against one-hot targets.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None").
model.summary()

# fit network
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not independent of the final evaluation.
hist = model.fit(
    x_train,
    y_train,
    epochs=epochs,
    validation_data=(x_test, y_test),
    batch_size=batch_size,
    verbose=verbose,
)
            

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 5, 64)             192       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 4, 64)             8256      
_________________________________________________________________
dropout (Dropout)            (None, 4, 64)             0         
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 2, 64)             0         
_________________________________________________________________
flatten (Flatten)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 100)               12900     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 2

In [None]:
# Predict on the test set, then reduce both the one-hot ground truth and the
# predicted scores to integer class labels via argmax.
preds = model.predict(x_test)
con_mat = metrics.confusion_matrix(
    y_test.argmax(axis=1),
    preds.argmax(axis=1),
    labels=[0, 1],  # keep the matrix 2x2 even if one class never appears
)

# scikit-learn lays the matrix out as con_mat[true label][predicted label].
# With class 0 (anomaly) as the positive class this means:
#   TP = con_mat[0][0]    FN = con_mat[0][1]
#   FP = con_mat[1][0]    TN = con_mat[1][1]
# The table below keeps its "TP | FP" / "FN | TN" layout, so the two
# off-diagonal cells are read transposed to appear under the correct label.
# (Previously FN was printed in the FP slot and vice versa.)
print()
print()
print()
print("\t\t\t     CONFUSION MATRIX")
print("\t\t\t+------------------------+")
print("\t\t\t|\tTP  |\tFP\t |")
print("\t\t\t+------------------------+")
print("\t\t\t|\t",con_mat[0][0],"|\t",con_mat[1][0],"\t |")
print("\t\t\t+------------------------+")
print("\t\t\t|\t",con_mat[0][1],"|\t",con_mat[1][1],"\t |")
print("\t\t\t+------------------------+")
print("\t\t\t|\tFN  |\tTN\t |")
print("\t\t\t+------------------------+")
print()
print()
print()




			     CONFUSION MATRIX
			+------------------------+
			|	TP  |	FP	 |
			+------------------------+
			|	 31 |	 2 	 |
			+------------------------+
			|	 525 |	 25938 	 |
			+------------------------+
			|	FN  |	TN	 |
			+------------------------+





In [None]:
# Evaluation metrics from scikit-learn, computed on integer class labels
# recovered from the one-hot ground truth and the predicted scores.
true_labels = y_test.argmax(axis=1)
predicted_labels = preds.argmax(axis=1)
report_text = metrics.classification_report(true_labels, predicted_labels)
print(report_text)

              precision    recall  f1-score   support

           0       0.06      0.94      0.11        33
           1       1.00      0.98      0.99     26463

    accuracy                           0.98     26496
   macro avg       0.53      0.96      0.55     26496
weighted avg       1.00      0.98      0.99     26496



In [None]:
# The manual metric formulas in this notebook index cm as
#   cm[0][0] = TP, cm[0][1] = FP, cm[1][0] = FN, cm[1][1] = TN
# (Precision = cm[0][0] / (cm[0][0] + cm[0][1]),
#  Recall    = cm[0][0] / (cm[0][0] + cm[1][0])).
# scikit-learn's confusion_matrix is laid out as [true label][predicted label],
# so for positive class 0 its [0][1] entry is FN and its [1][0] entry is FP.
# Transposing gives the [predicted][true] layout the formulas expect; without
# it the reported "Precision" and "Recall" values are swapped.
cm = con_mat.T

CALCULATING METRICS MANUALLY 

In [None]:
#Accuracy = (TP+TN)/(TP+TN+FP+FN)

In [None]:
# Accuracy: the two main-diagonal entries of cm over the sum of all entries.
diagonal_total = cm[0][0] + cm[1][1]
print("Accuracy", diagonal_total / cm.sum())

Accuracy 0.9801102053140096


In [None]:
#Precision = TP / (TP+FP)

In [None]:
# cm[0][0] divided by the total of cm's first row.
# NOTE(review): this equals TP / (TP + FP) only if cm[0][1] holds the false
# positives, i.e. cm is laid out [predicted][true] - confirm cm's orientation.
first_row_total = cm[0][0] + cm[0][1]
prec = cm[0][0] / first_row_total
print("Precision", prec)

Precision 0.9393939393939394


In [None]:
#Recall = TP / (TP+FN)

In [None]:
# cm[0][0] divided by the total of cm's first column.
# NOTE(review): this equals TP / (TP + FN) only if cm[1][0] holds the false
# negatives, i.e. cm is laid out [predicted][true] - confirm cm's orientation.
first_col_total = cm[0][0] + cm[1][0]
rec = cm[0][0] / first_col_total
print("Recall", rec)

Recall 0.05575539568345324


In [None]:
#F1-score = 2*[(Precision*Recall)/(Precision+Recall)]

In [None]:
# F1: twice the product of prec and rec divided by their sum (harmonic mean).
f1_value = 2 * (prec * rec / (prec + rec))
print("F1 score", f1_value)

F1 score 0.10526315789473684
