# Classificação da qualidade da água usando MLP

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Leitura do dataframe (df)

In [2]:
df = pd.read_csv('water_dataX.csv',encoding="ISO-8859-1")
df.fillna(0, inplace=True)
df.head()

Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (µmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year
0,1393,"DAMANGANGA AT D/S OF MADHUBAN, DAMAN",DAMAN & DIU,30.6,6.7,7.5,203,NAN,0.1,11,27,2014
1,1399,ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOI...,GOA,29.8,5.7,7.2,189,2,0.2,4953,8391,2014
2,1475,ZUARI AT PANCHAWADI,GOA,29.5,6.3,6.9,179,1.7,0.1,3243,5330,2014
3,3181,RIVER ZUARI AT BORIM BRIDGE,GOA,29.7,5.8,6.9,64,3.8,0.5,5382,8443,2014
4,3182,RIVER ZUARI AT MARCAIM JETTY,GOA,29.5,5.8,7.3,83,1.9,0.4,3428,5500,2014


## Tratamento dos dados

In [3]:
df.dtypes

STATION CODE                        object
LOCATIONS                           object
STATE                               object
Temp                                object
D.O. (mg/l)                         object
PH                                  object
CONDUCTIVITY (µmhos/cm)             object
B.O.D. (mg/l)                       object
NITRATENAN N+ NITRITENANN (mg/l)    object
FECAL COLIFORM (MPN/100ml)          object
TOTAL COLIFORM (MPN/100ml)Mean      object
year                                 int64
dtype: object

Podemos notar que os dados não são lidos como numéricos

In [4]:
df['Temp']=pd.to_numeric(df['Temp'],errors='coerce')
df['D.O. (mg/l)']=pd.to_numeric(df['D.O. (mg/l)'],errors='coerce')
df['PH']=pd.to_numeric(df['PH'],errors='coerce')
df['B.O.D. (mg/l)']=pd.to_numeric(df['B.O.D. (mg/l)'],errors='coerce')
df['CONDUCTIVITY (µmhos/cm)']=pd.to_numeric(df['CONDUCTIVITY (µmhos/cm)'],errors='coerce')
df['NITRATENAN N+ NITRITENANN (mg/l)']=pd.to_numeric(df['NITRATENAN N+ NITRITENANN (mg/l)'],errors='coerce')
df['FECAL COLIFORM (MPN/100ml)']=pd.to_numeric(df['FECAL COLIFORM (MPN/100ml)'],errors='coerce')
df['TOTAL COLIFORM (MPN/100ml)Mean']=pd.to_numeric(df['TOTAL COLIFORM (MPN/100ml)Mean'],errors='coerce')
df.dtypes

STATION CODE                         object
LOCATIONS                            object
STATE                                object
Temp                                float64
D.O. (mg/l)                         float64
PH                                  float64
CONDUCTIVITY (µmhos/cm)             float64
B.O.D. (mg/l)                       float64
NITRATENAN N+ NITRITENANN (mg/l)    float64
FECAL COLIFORM (MPN/100ml)          float64
TOTAL COLIFORM (MPN/100ml)Mean      float64
year                                  int64
dtype: object

In [5]:
start=2
end=1779

do  = df.iloc[start:end,  4].astype(np.float64)
ph  = df.iloc[start:end,  5].astype(np.float64)
co  = df.iloc[start:end,  6].astype(np.float64)   
bod = df.iloc[start:end,  7].astype(np.float64)
na  = df.iloc[start:end,  8].astype(np.float64)
fc  = df.iloc[start:end,  9].astype(np.float64)
tc  = df.iloc[start:end, 10].astype(np.float64)
yr  = df.iloc[start:end, 11].astype( np.int64 )

df = pd.concat([do, ph, co, bod, na, fc, tc],axis=1)
df.columns = ['do', 'ph', 'co', 'bod', 'na', 'fc', 'tc']

In [6]:
df.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc
2,6.3,6.9,179.0,1.7,0.1,3243.0,5330.0
3,5.8,6.9,64.0,3.8,0.5,5382.0,8443.0
4,5.8,7.3,83.0,1.9,0.4,3428.0,5500.0
5,5.5,7.4,81.0,1.5,0.1,2853.0,4049.0
6,6.1,6.7,308.0,1.4,0.3,3355.0,5672.0


In [7]:
df.dtypes

do     float64
ph     float64
co     float64
bod    float64
na     float64
fc     float64
tc     float64
dtype: object

In [8]:
df = df.dropna()

# Cálculo do WQI

O cálculo do índice de Qualidade da Água foi feito de acordo com:

$$\begin{matrix}
WQI = \frac{\sum^{N}_{i=1}q_i\times w_i}{\sum^{N}_{i=1}w_i}\\
q_i = 100 \times \left( \frac{V_i - V_{ideal}}{S_i - V_{ideal}} \right)\\
w_i = k \div S_i\\
k = \frac{1}{\sum^{N}_{i=1}S_i}
\end{matrix}\quad
\begin{matrix}
Onde:\\
\text{WQI: Índice de Qualdade da Água}\\
\text{N: Quantidade de parâmetros}\\
q_i\text{: Escala estimada de qualidade do parâmetro}\\
w_i\text{: Peso unitário do parâmetro}\\
V_i\text{: Valor da amostra}\\
V_{ideal}\text{: Valor ideal (para água pura)}\\
S_i\text{: Valor experado/permissível}\\
\text{k: Constante de proporcionalidade}\\
\end{matrix}$$

Proposto pelos autores do artigo.

In [9]:
# Lista de parâmetros
lp = ['do', 'ph', 'co', 'na', 'bod', 'fc', 'tc']

# Limiar Permissível
si = {
    'do' : 10,
    'ph' : 8.5,
    'co' : 1000,
    'na' : 45,
    'bod': 5,
    'fc' : 100,
    'tc' : 1000
}

# K - Constante
k = 1 / sum(si.values())

# w_i - peso do parâmetro (i)
w = {
    'do' : k / si['do'],
    'ph' : k / si['ph'],
    'co' : k / si['co'],
    'na' : k / si['na'],
    'bod': k / si['bod'],
    'fc' : k / si['fc'],
    'tc' : k / si['do']
}

# WQI - Water Quality Index
wqi = []
to_remove = []
for idx, row in df.iterrows():
    q = {}
    q['do'] = 100 * ((row.do - 14.6) / (si['do'] - 14.6))
    q['ph'] = 100 * (  (row.ph - 7)  / (si['ph'] - 7))
    q['co'] = 100 * (    (row.co)    / (si['co']))
    q['na'] = 100 * (    (row.na)    / (si['na']))
    q['bod']= 100 * (    (row.bod)   / (si['bod']))
    q['fc'] = 100 * (    (row.fc)    / (si['fc']))
    q['tc'] = 100 * (    (row.tc)    / (si['tc']))

    num = 0
    div = 0
    for parametro in lp:
        num += q[parametro] * w[parametro]
        div += w[parametro]

    wq = (num/div)
    if wq <= 160: wqi.append(wq)
    else: to_remove.append(idx)


df = df.drop(to_remove)

pd.DataFrame(wqi).describe()

Unnamed: 0,0
count,1196.0
mean,68.43645
std,27.372596
min,-8.288165
25%,48.676395
50%,62.250308
75%,80.188242
max,159.888082


In [26]:
qualidade = lambda x: ('Clean' if x <= 25
                       else('Unclean' if x <= 50
                            else('Polluted' if x <= 75
                                else('Highly polluted'))))

Y = [qualidade(x) for x in wqi]

### Normalização

O método de normalização adotado foi feito da seguinte maneira:
- Aplicamos uma técnica conhecida como normalização *min-max*, onde: $$x_{norm} = (x - min(x)) / (max(x) - min(x))$$
- Em seguida, comparamos a base normalizada com o resultado do tratamento dos dados proposto por Anbarivan N L e Anjali Vasudevan (colaboradores do dataset no *Kaggle*).
    - Contudo, acreditamos que o tratamento proposto pelos colaboradores apresenta muita perda em termos de precisão, portanto não é relevante para o nosso estudo;
 
OBS: Também é válido mencionar que os autores do artigo fizeram a normalização e o tratamento dos dados manualmente, mas não disponibilizaram tais dados.

In [11]:
# Normalização "convencional"
df_norm = df.copy() 

# Aplicando normalização min-max
for column in df_norm.columns: 
	df_norm[column] = (df_norm[column] - df_norm[column].min()) / (df_norm[column].max() - df_norm[column].min())

df_norm.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc
11,0.648936,0.592824,0.001739,0.083871,0.001721,0.813874,0.584157
15,0.712766,0.686427,0.005006,0.090323,0.003442,0.415087,0.417206
16,0.712766,0.686427,0.003754,0.058065,0.001721,0.487491,0.518569
18,0.712766,0.639626,0.002821,0.090323,0.001721,0.735406,0.51925
26,0.712766,0.639626,0.000933,0.083871,0.001721,1.0,0.681261


In [12]:
df2 = df.copy()

# Cálculo do Oxigênio Dissolvido
df2['ndo'] = df2.do.apply(lambda x:(100 if (x>=6)  
                                 else(80 if  (6>=x>=5.1) 
                                      else(60 if (5>=x>=4.1)
                                          else(40 if (4>=x>=3) 
                                              else 0)))))

# Cálculo do PH
df2['nph'] = df2.ph.apply(lambda x: (100 if (8.5>=x>=7)  
                                 else(80 if  (8.6>=x>=8.5) or (6.9>=x>=6.8) 
                                      else(60 if (8.8>=x>=8.6) or (6.8>=x>=6.7) 
                                          else(40 if (9>=x>=8.8) or (6.7>=x>=6.5)
                                              else 0)))))

# Calculo da Condutividade Elétrica
df2['nco'] = df2.co.apply(lambda x:(100 if (75>=x>=0)  
                                 else(80 if  (150>=x>=75) 
                                      else(60 if (225>=x>=150)
                                          else(40 if (300>=x>=225) 
                                              else 0)))))

# Cálculo da Demanda Biológica de Oxigênio (B.O.D)
df2['nbod'] = df2.bod.apply(lambda x:(100 if (3>=x>=0)  
                                 else(80 if  (6>=x>=3) 
                                      else(60 if (80>=x>=6)
                                          else(40 if (125>=x>=80) 
                                              else 0)))))

# Cálculo da concentração de Nitrato
df2['nna'] = df2.na.apply(lambda x:(100 if (20>=x>=0)  
                                 else(80 if  (50>=x>=20) 
                                      else(60 if (100>=x>=50)
                                          else(40 if (200>=x>=100) 
                                              else 0)))))

# Cálculo do total de coliformes
df2['ntc'] = df2.tc.apply(lambda x:(100 if (5>=x>=0)  
                                 else(80 if  (50>=x>=5) 
                                      else(60 if (500>=x>=50)
                                          else(40 if (10000>=x>=500) 
                                              else 0)))))

df2.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc,ndo,nph,nco,nbod,nna,ntc
11,6.7,6.4,93.0,1.4,0.1,2147.0,3433.0,100,0,80,100,100,40
15,7.3,7.0,247.0,1.5,0.2,1095.0,2453.0,100,100,40,100,100,40
16,7.3,7.0,188.0,1.0,0.1,1286.0,3048.0,100,100,60,100,100,40
18,7.3,6.7,144.0,1.5,0.1,1940.0,3052.0,100,60,80,100,100,40
26,7.3,6.7,55.0,1.4,0.1,2638.0,4003.0,100,60,100,100,100,40


# Aprendizado

## Cross-validation

[documentação do MLP](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

[documentação do KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)

In [37]:
clf = MLPClassifier(
    hidden_layer_sizes=(6,5,4,), # 7(6,5,4,)4 | 7(6,5,5,4,4,)4 | (100,)
    activation = 'relu', 
    solver = 'adam',
    alpha = 0.0001,
    batch_size = 'auto',
    learning_rate = 'constant',
    learning_rate_init = 0.001,
    power_t = 0.5,
    max_iter = 2000, # 200, 500, 2000
    shuffle = True,
    random_state = None,
    tol = 0.0001,
    verbose = False,
    warm_start = False,
    momentum = 0.9,
    nesterovs_momentum = True,
    early_stopping = False,
    validation_fraction = 0.1,
    beta_1 = 0.9,
    beta_2 = 0.99,
    epsilon = 1e-8,
    n_iter_no_change = 10,
    max_fun = 15000
)

#kfolds = KFold(n_splits = 10)
kfolds = RepeatedKFold(n_repeats=3, n_splits=10)

# Coversão do dataframe pra nparray
X = np.array(df_norm.iloc[:].values)
Y = np.array(Y)

In [38]:
scores = cross_val_score(clf, X, Y, cv = kfolds)

print("Cross Validation Scores: \n", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))

Cross Validation Scores: 
 [0.98333333 0.98333333 0.975      0.975      1.         0.99166667
 0.99159664 0.98319328 0.97478992 1.         0.45       0.98333333
 0.975      0.98333333 1.         0.99166667 0.97478992 0.99159664
 0.96638655 0.35294118 0.99166667 0.98333333 0.98333333 1.
 0.35       0.39166667 0.99159664 0.96638655 0.97478992 0.99159664]
Average CV Score:  0.9050443510737628
Number of CV Scores used in Average:  30


In [39]:
for train_indices, test_indices in kfolds.split(X):
    clf.fit(X[train_indices], Y[train_indices])
    print(clf.score(X[test_indices], Y[test_indices]))

0.425
0.9833333333333333
0.9916666666666667
1.0
0.43333333333333335
0.4166666666666667
0.9663865546218487
0.46218487394957986
0.9915966386554622
0.9831932773109243
0.9916666666666667
0.975
0.4166666666666667
0.4583333333333333
0.9833333333333333
0.8916666666666667
0.9915966386554622
0.3025210084033613
0.9663865546218487
0.9915966386554622
0.9833333333333333
0.35833333333333334
0.9916666666666667
0.975
1.0
0.9666666666666667
0.9915966386554622
0.9915966386554622
0.44537815126050423
0.4957983193277311
