In [16]:
import pandas as pd
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### Reading [Water Quality](https://www.kaggle.com/datasets/mssmartypants/water-quality) dataset

#### Get the data

In [17]:
data_df = pd.read_csv('waterQuality1.csv') #read csv
data_df.head() #show first 5 rows of dataset

Unnamed: 0,aluminium,ammonia,arsenic,barium,cadmium,chloramine,chromium,copper,flouride,bacteria,...,lead,nitrates,nitrites,mercury,perchlorate,radium,selenium,silver,uranium,is_safe
0,1.65,9.08,0.04,2.85,0.007,0.35,0.83,0.17,0.05,0.2,...,0.054,16.08,1.13,0.007,37.75,6.78,0.08,0.34,0.02,1
1,2.32,21.16,0.01,3.31,0.002,5.28,0.68,0.66,0.9,0.65,...,0.1,2.01,1.93,0.003,32.26,3.21,0.08,0.27,0.05,1
2,1.01,14.02,0.04,0.58,0.008,4.24,0.53,0.02,0.99,0.05,...,0.078,14.16,1.11,0.006,50.28,7.07,0.07,0.44,0.01,0
3,1.36,11.33,0.04,2.96,0.001,7.23,0.03,1.66,1.08,0.71,...,0.016,1.41,1.29,0.004,9.12,1.72,0.02,0.45,0.05,1
4,0.92,24.33,0.03,0.2,0.006,2.67,0.69,0.57,0.61,0.13,...,0.117,6.74,1.11,0.003,16.9,2.41,0.02,0.06,0.02,1


In [18]:
print(data_df.keys()) #output names of columns

Index(['aluminium', 'ammonia', 'arsenic', 'barium', 'cadmium', 'chloramine',
       'chromium', 'copper', 'flouride', 'bacteria', 'viruses', 'lead',
       'nitrates', 'nitrites', 'mercury', 'perchlorate', 'radium', 'selenium',
       'silver', 'uranium', 'is_safe'],
      dtype='object')


#### Preprocessing Data of Water Quality

In [19]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7999 entries, 0 to 7998
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   aluminium    7999 non-null   float64
 1   ammonia      7999 non-null   object 
 2   arsenic      7999 non-null   float64
 3   barium       7999 non-null   float64
 4   cadmium      7999 non-null   float64
 5   chloramine   7999 non-null   float64
 6   chromium     7999 non-null   float64
 7   copper       7999 non-null   float64
 8   flouride     7999 non-null   float64
 9   bacteria     7999 non-null   float64
 10  viruses      7999 non-null   float64
 11  lead         7999 non-null   float64
 12  nitrates     7999 non-null   float64
 13  nitrites     7999 non-null   float64
 14  mercury      7999 non-null   float64
 15  perchlorate  7999 non-null   float64
 16  radium       7999 non-null   float64
 17  selenium     7999 non-null   float64
 18  silver       7999 non-null   float64
 19  uraniu

In [20]:
data_df.isnull().sum()

aluminium      0
ammonia        0
arsenic        0
barium         0
cadmium        0
chloramine     0
chromium       0
copper         0
flouride       0
bacteria       0
viruses        0
lead           0
nitrates       0
nitrites       0
mercury        0
perchlorate    0
radium         0
selenium       0
silver         0
uranium        0
is_safe        0
dtype: int64

In [21]:
# Знайдемо рядки зі значенням "#NUM!" і видалимо їх з набору даних
data_df = data_df[data_df != '#NUM!'].dropna()
data_df.shape

(7996, 21)

In [22]:
# Змінюємо тип даних стовпців 'ammonia' та 'is_safe' на стандартний для всіх float64
data_df['ammonia'] = data_df['ammonia'].astype(float)
data_df['is_safe'] = data_df['is_safe'].astype(float)

In [23]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7996 entries, 0 to 7998
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   aluminium    7996 non-null   float64
 1   ammonia      7996 non-null   float64
 2   arsenic      7996 non-null   float64
 3   barium       7996 non-null   float64
 4   cadmium      7996 non-null   float64
 5   chloramine   7996 non-null   float64
 6   chromium     7996 non-null   float64
 7   copper       7996 non-null   float64
 8   flouride     7996 non-null   float64
 9   bacteria     7996 non-null   float64
 10  viruses      7996 non-null   float64
 11  lead         7996 non-null   float64
 12  nitrates     7996 non-null   float64
 13  nitrites     7996 non-null   float64
 14  mercury      7996 non-null   float64
 15  perchlorate  7996 non-null   float64
 16  radium       7996 non-null   float64
 17  selenium     7996 non-null   float64
 18  silver       7996 non-null   float64
 19  uraniu

### Fully connected multi-layer neural network - Multilayer Perceptron (MLP)

In [24]:
num_classes = 10

X = data_df.drop(['is_safe'], axis = 1)
y = data_df['is_safe']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

print(X_train.shape[0], 'train samples')
print(X_test.shape[0], 'test samples')

6396 train samples
1600 test samples


In [25]:
scaler = MinMaxScaler()
scaler.fit(X_train)

In [26]:
scaled_X_train = scaler.transform(X_train) 
scaled_X_test = scaler.transform(X_test) 

In [27]:
model = keras.Sequential([
    keras.layers.Dense(256, activation="relu", input_shape=(X_train.shape[-1], )),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(128, activation="sigmoid"),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(1, activation="sigmoid"),
    ])

In [28]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               5376      
                                                                 
 batch_normalization_1 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 1)                 129       
                                                                 
Total params: 39,425
Trainable params: 38,913
Non-trainable params: 512
________________________________________________

In [29]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.0001), 
    loss="binary_crossentropy", 
    metrics=["accuracy"]
)

In [30]:
model_1_fit = model.fit(
    X_train, 
    y_train,
    validation_data=(X_test, y_test),
    verbose=2,
    epochs=20, 
    batch_size=128
)

Epoch 1/20


50/50 - 2s - loss: 1.1393 - accuracy: 0.1756 - val_loss: 1.0861 - val_accuracy: 0.1013 - 2s/epoch - 49ms/step
Epoch 2/20
50/50 - 0s - loss: 0.8863 - accuracy: 0.3324 - val_loss: 0.8112 - val_accuracy: 0.2862 - 220ms/epoch - 4ms/step
Epoch 3/20
50/50 - 0s - loss: 0.7105 - accuracy: 0.5519 - val_loss: 0.6209 - val_accuracy: 0.7262 - 219ms/epoch - 4ms/step
Epoch 4/20
50/50 - 0s - loss: 0.5766 - accuracy: 0.7214 - val_loss: 0.5059 - val_accuracy: 0.8494 - 218ms/epoch - 4ms/step
Epoch 5/20
50/50 - 0s - loss: 0.4977 - accuracy: 0.7971 - val_loss: 0.4325 - val_accuracy: 0.8838 - 216ms/epoch - 4ms/step
Epoch 6/20
50/50 - 0s - loss: 0.4360 - accuracy: 0.8421 - val_loss: 0.3803 - val_accuracy: 0.8925 - 220ms/epoch - 4ms/step
Epoch 7/20
50/50 - 0s - loss: 0.3998 - accuracy: 0.8612 - val_loss: 0.3455 - val_accuracy: 0.8988 - 220ms/epoch - 4ms/step
Epoch 8/20
50/50 - 0s - loss: 0.3710 - accuracy: 0.8690 - val_loss: 0.3212 - val_accuracy: 0.8988 - 314ms/epoch - 6ms/step
Epoch 9/20
50/50 - 0s - loss: