In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('1_gecco2019_water_quality.csv', index_col = 0)
df

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
1,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,False
2,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,False
3,2017-07-01 00:02:00,6.94,8.60220,0.020968,0.126482,3.58318,43.5994,False
4,2017-07-01 00:03:00,6.94,8.60220,0.020972,0.126184,3.58769,43.3704,False
5,2017-07-01 00:04:00,6.94,8.60405,0.020974,0.127908,3.58287,43.1656,False
...,...,...,...,...,...,...,...,...
132476,2017-09-30 23:55:00,10.30,8.56593,0.020724,0.126518,4.53577,56.4686,False
132477,2017-09-30 23:56:00,10.30,8.56593,0.020727,0.126575,4.53008,56.3567,False
132478,2017-09-30 23:57:00,10.30,8.56593,0.020723,0.126512,4.53512,55.0477,False
132479,2017-09-30 23:58:00,10.30,8.56228,0.020720,0.126477,4.54084,55.4052,False


In [3]:
#Map False to 1 and True to 0
df['Event'] = df['Event'].astype('category')
encode_map ={
    False : 1,
    True : 0 }

df['Event'].replace(encode_map, inplace=True)  

In [4]:
df['Event'].value_counts()
#Prints the count of 0 and 1

1    132268
0       212
Name: Event, dtype: int64

In [5]:
#Fill the Null values in the dataset with mean values
df['pH'].fillna((df['pH'].mean()), inplace=True)
df['Tp'].fillna((df['Tp'].mean()), inplace=True)
df['Cond'].fillna((df['Cond'].mean()), inplace=True)
df['Turb'].fillna((df['Turb'].mean()), inplace=True)
df['SAC'].fillna((df['SAC'].mean()), inplace=True)
df['PFM'].fillna((df['PFM'].mean()), inplace=True)

In [6]:
df

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
1,2017-07-01 00:00:00,6.94,8.60774,0.020954,0.125931,3.58683,43.7559,1
2,2017-07-01 00:01:00,6.93,8.60589,0.020965,0.127219,3.59025,43.4366,1
3,2017-07-01 00:02:00,6.94,8.60220,0.020968,0.126482,3.58318,43.5994,1
4,2017-07-01 00:03:00,6.94,8.60220,0.020972,0.126184,3.58769,43.3704,1
5,2017-07-01 00:04:00,6.94,8.60405,0.020974,0.127908,3.58287,43.1656,1
...,...,...,...,...,...,...,...,...
132476,2017-09-30 23:55:00,10.30,8.56593,0.020724,0.126518,4.53577,56.4686,1
132477,2017-09-30 23:56:00,10.30,8.56593,0.020727,0.126575,4.53008,56.3567,1
132478,2017-09-30 23:57:00,10.30,8.56593,0.020723,0.126512,4.53512,55.0477,1
132479,2017-09-30 23:58:00,10.30,8.56228,0.020720,0.126477,4.54084,55.4052,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132480 entries, 1 to 132480
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    132480 non-null  object 
 1   Tp      132480 non-null  float64
 2   pH      132480 non-null  float64
 3   Cond    132480 non-null  float64
 4   Turb    132480 non-null  float64
 5   SAC     132480 non-null  float64
 6   PFM     132480 non-null  float64
 7   Event   132480 non-null  int64  
dtypes: float64(6), int64(1), object(1)
memory usage: 9.1+ MB


In [8]:
df['Time'] = pd.to_datetime(df['Time']).astype(np.int64)
df['Tp'] = df['Tp'].astype('float32')
df['pH'] = df['pH'].astype('float32')
df['Cond'] = df['Cond'].astype('float32')
df['Turb'] = df['Turb'].astype('float32')
df['SAC'] = df['SAC'].astype('float32')
df['PFM'] = df['PFM'].astype('float32')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132480 entries, 1 to 132480
Data columns (total 8 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    132480 non-null  int64  
 1   Tp      132480 non-null  float32
 2   pH      132480 non-null  float32
 3   Cond    132480 non-null  float32
 4   Turb    132480 non-null  float32
 5   SAC     132480 non-null  float32
 6   PFM     132480 non-null  float32
 7   Event   132480 non-null  int64  
dtypes: float32(6), int64(2)
memory usage: 6.1 MB


In [9]:
x = df.iloc[:, 1:7] #Input Columns
y = df.iloc[:, -1]  #Output Columns

In [10]:
#Convert all the values into the range of 1(Scale the values in such a way that the bigger values are made into smaller range)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() 
x = scaler.fit_transform(x) 

In [11]:
df

Unnamed: 0,Time,Tp,pH,Cond,Turb,SAC,PFM,Event
1,1498867200000000000,6.94,8.60774,0.020954,0.125931,3.58683,43.755901,1
2,1498867260000000000,6.93,8.60589,0.020965,0.127219,3.59025,43.436600,1
3,1498867320000000000,6.94,8.60220,0.020968,0.126482,3.58318,43.599400,1
4,1498867380000000000,6.94,8.60220,0.020972,0.126184,3.58769,43.370399,1
5,1498867440000000000,6.94,8.60405,0.020973,0.127908,3.58287,43.165600,1
...,...,...,...,...,...,...,...,...
132476,1506815700000000000,10.30,8.56593,0.020724,0.126518,4.53577,56.468601,1
132477,1506815760000000000,10.30,8.56593,0.020727,0.126575,4.53008,56.356701,1
132478,1506815820000000000,10.30,8.56593,0.020723,0.126512,4.53512,55.047699,1
132479,1506815880000000000,10.30,8.56228,0.020720,0.126477,4.54084,55.405201,1


In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [13]:
#Oversample the dataset to counter imbalances
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [14]:
#Converting Y output into 2 columns

y_train=tf.keras.utils.to_categorical(y_train)
y_test=tf.keras.utils.to_categorical(y_test)


print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(211610, 6)
(211610, 2)
(26496, 6)
(26496, 2)


## ELM

In [15]:
input_size = x_train.shape[1]
hidden_size = 1000

In [16]:
input_weights = np.random.normal(size=[input_size,hidden_size])
biases = np.random.normal(size=[hidden_size])

In [17]:
def relu(x):
    return np.maximum(x, 0, x)

In [18]:
def hidden_nodes(X):
    G = np.dot(X, input_weights)
    G = G + biases
    H = relu(G)
    return H

In [19]:
import scipy
output_weights = np.dot(scipy.linalg.pinv2(hidden_nodes(x_train)), y_train)
output_biases=np.zeros_like(output_weights)
output_weights.shape

(1000, 2)

In [20]:
def predict(X):
    out = hidden_nodes(X)
    out = np.dot(out, output_weights)
    return out

In [21]:
prediction = predict(x_test)
correct = 0
total = x_test.shape[0]
for i in range(total):
    predicted = np.argmax(prediction[i])
    actual = np.argmax(y_test[i])
    correct += 1 if predicted == actual else 0
accuracy = correct/total
print('Accuracy for ', hidden_size, ' hidden nodes: ', accuracy)

Accuracy for  1000  hidden nodes:  0.9892436594202898
