# Classificação da qualidade da água usando MLP

## Imports

In [1]:
import pandas as pd
import numpy as np

## Leitura do dataframe (df)

In [2]:
# Load the raw water-quality table (the file is ISO-8859-1 encoded) and
# replace every cell pandas parsed as missing with 0 in a single chained
# expression.
# NOTE(review): the preview below still shows the literal text "NAN" after
# this step, so such cells are apparently not treated as missing here --
# confirm whether filling with 0 is the intended handling.
df = pd.read_csv('water_dataX.csv', encoding="ISO-8859-1").fillna(0)
df.head()

Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (µmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year
0,1393,"DAMANGANGA AT D/S OF MADHUBAN, DAMAN",DAMAN & DIU,30.6,6.7,7.5,203,NAN,0.1,11,27,2014
1,1399,ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOI...,GOA,29.8,5.7,7.2,189,2,0.2,4953,8391,2014
2,1475,ZUARI AT PANCHAWADI,GOA,29.5,6.3,6.9,179,1.7,0.1,3243,5330,2014
3,3181,RIVER ZUARI AT BORIM BRIDGE,GOA,29.7,5.8,6.9,64,3.8,0.5,5382,8443,2014
4,3182,RIVER ZUARI AT MARCAIM JETTY,GOA,29.5,5.8,7.3,83,1.9,0.4,3428,5500,2014


## Tratamento dos dados

In [3]:
# Display the dtype pandas assigned to each column of the loaded frame.
df.dtypes

STATION CODE                        object
LOCATIONS                           object
STATE                               object
Temp                                object
D.O. (mg/l)                         object
PH                                  object
CONDUCTIVITY (µmhos/cm)             object
B.O.D. (mg/l)                       object
NITRATENAN N+ NITRITENANN (mg/l)    object
FECAL COLIFORM (MPN/100ml)          object
TOTAL COLIFORM (MPN/100ml)Mean      object
year                                 int64
dtype: object

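Podemos notar que as colunas de medição foram lidas como `object` (texto), e não como tipos numéricos — provavelmente porque valores ausentes aparecem no arquivo como o texto "NAN". Por isso, convertemos essas colunas explicitamente para valores numéricos a seguir.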

In [4]:
# Measurement columns that were read as text and must become numeric.
numeric_columns = [
    'Temp',
    'D.O. (mg/l)',
    'PH',
    'B.O.D. (mg/l)',
    'CONDUCTIVITY (µmhos/cm)',
    'NITRATENAN N+ NITRITENANN (mg/l)',
    'FECAL COLIFORM (MPN/100ml)',
    'TOTAL COLIFORM (MPN/100ml)Mean',
]

# errors='coerce' turns any entry that cannot be parsed as a number into NaN
# instead of raising.
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

df.dtypes

STATION CODE                         object
LOCATIONS                            object
STATE                                object
Temp                                float64
D.O. (mg/l)                         float64
PH                                  float64
CONDUCTIVITY (µmhos/cm)             float64
B.O.D. (mg/l)                       float64
NITRATENAN N+ NITRITENANN (mg/l)    float64
FECAL COLIFORM (MPN/100ml)          float64
TOTAL COLIFORM (MPN/100ml)Mean      float64
year                                  int64
dtype: object

In [5]:
# Row window (positional, end-exclusive) kept from the loaded table.
# NOTE(review): the reason for skipping the first two rows and stopping at
# 1992 is not stated in the notebook -- confirm.
start, end = 2, 1992

# Positional column index in the loaded frame -> short feature name.
# Per the dtype listing above: 4 = D.O., 5 = PH, 6 = CONDUCTIVITY, 7 = B.O.D.,
# 8 = NITRATE/NITRITE, 9 = FECAL COLIFORM, 10 = TOTAL COLIFORM.
feature_positions = {
    'do': 4,
    'ph': 5,
    'co': 6,
    'bod': 7,
    'na': 8,
    'fc': 9,
    'tc': 10,
}

selected = {
    name: df.iloc[start:end, position].astype(np.float64)
    for name, position in feature_positions.items()
}

# Keep the individual series available under their short names.
do, ph, co, bod, na, fc, tc = selected.values()
# Column 11 is `year`; it is extracted but not part of the feature frame.
yr = df.iloc[start:end, 11].astype(np.int64)

# From here on `df` is the reduced, renamed feature frame.
df = pd.concat(list(selected.values()), axis=1)
df.columns = list(selected)

In [6]:
# Preview the first rows of the reduced feature frame.
df.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc
2,6.3,6.9,179.0,1.7,0.1,3243.0,5330.0
3,5.8,6.9,64.0,3.8,0.5,5382.0,8443.0
4,5.8,7.3,83.0,1.9,0.4,3428.0,5500.0
5,5.5,7.4,81.0,1.5,0.1,2853.0,4049.0
6,6.1,6.7,308.0,1.4,0.3,3355.0,5672.0


In [7]:
# Check the dtypes of the selected feature columns.
df.dtypes

do     float64
ph     float64
co     float64
bod    float64
na     float64
fc     float64
tc     float64
dtype: object

### Normalização

O método de normalização adotado foi feito da seguinte maneira:
- Aplicamos uma técnica conhecida como normalização *min-max*, onde: $$x_{norm} = \frac{x - \min(x)}{\max(x) - \min(x)}$$
- Em seguida, comparamos a base normalizada com o resultado do tratamento dos dados proposto por Anbarivan N L e Anjali Vasudevan (colaboradores do dataset no *Kaggle*).
    - Contudo, acreditamos que o tratamento proposto pelos colaboradores perde muita precisão e não é relevante para o nosso estudo
 
OBS: Também é válido mencionar que os autores do artigo fizeram a normalização e o tratamento dos dados manualmente, mas não disponibilizaram tais dados.

In [8]:
# "Conventional" normalisation: min-max scaling of every feature column.
# Each column is shifted by its own minimum and divided by its own range;
# the source frame `df` is left untouched.
col_min = df.min()
col_max = df.max()

df_norm = (df - col_min) / (col_max - col_min)

df_norm.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc
2,0.552632,0.000103,0.002718,0.002994,0.00092,1.2e-05,1e-05
3,0.508772,0.000103,0.000968,0.006924,0.0046,2e-05,1.7e-05
4,0.508772,0.000109,0.001257,0.003368,0.00368,1.3e-05,1.1e-05
5,0.482456,0.00011,0.001227,0.00262,0.00092,1e-05,8e-06
6,0.535088,0.0001,0.004682,0.002433,0.00276,1.2e-05,1.1e-05


In [10]:
# NOTE(review): this dict literal is a bare expression with no runtime effect;
# it looks like cell metadata (a "hide-input" tag) pasted into the cell
# source -- confirm and move it to the cell metadata if so.
{
    "tags": [
        "hide-input",
    ]
}


def _score_do(x):
    """Rate dissolved oxygen (higher is better) as 100/80/60/40/0.

    Bands are contiguous: >=6 -> 100, (5, 6) -> 80, (4, 5] -> 60,
    [3, 4] -> 40, anything else (including NaN) -> 0.  The previous
    chained comparisons left the intervals (5, 5.1) and (4, 4.1) uncovered,
    so values there were rated 0.
    """
    if x >= 6:
        return 100
    if x > 5:
        return 80
    if x > 4:
        return 60
    if x >= 3:
        return 40
    return 0


def _score_ph(x):
    """Rate pH as 100/80/60/40/0 by distance from the 7-8.5 band.

    Bands are contiguous on both sides of the ideal range.  The previous
    chained comparisons left (6.9, 7) uncovered, so values there were rated 0.
    NaN fails every comparison and is rated 0.
    """
    if 7 <= x <= 8.5:
        return 100
    if 6.8 <= x < 7 or 8.5 < x <= 8.6:
        return 80
    if 6.7 <= x < 6.8 or 8.6 < x <= 8.8:
        return 60
    if 6.5 <= x < 6.7 or 8.8 < x <= 9:
        return 40
    return 0


def _score_lower_is_better(x, upper_bounds):
    """Rate a non-negative measurement where smaller values are better.

    `upper_bounds` holds four ascending inclusive limits for the ratings
    100, 80, 60 and 40.  Negative values, NaN and values above the last
    limit are rated 0.
    """
    if not x >= 0:  # also True for NaN, which fails every comparison
        return 0
    for bound, score in zip(upper_bounds, (100, 80, 60, 40)):
        if x <= bound:
            return score
    return 0


# Inclusive upper limits (for ratings 100, 80, 60, 40) per feature column.
upper_bounds_by_column = {
    'co': (75, 150, 225, 300),     # electrical conductivity
    'bod': (3, 6, 80, 125),        # biological oxygen demand (B.O.D.)
    'na': (20, 50, 100, 200),      # nitrate concentration
    'tc': (5, 50, 500, 10000),     # total coliform
}

df2 = df.copy()

# Dissolved oxygen and pH have their own band shapes.
df2['ndo'] = df2['do'].apply(_score_do)
df2['nph'] = df2['ph'].apply(_score_ph)

# The remaining features share the "lower is better" ladder; the rating
# column is named 'n' + source column (nco, nbod, nna, ntc).
for source_column, bounds in upper_bounds_by_column.items():
    df2['n' + source_column] = df2[source_column].apply(
        _score_lower_is_better, upper_bounds=bounds
    )

df2.head()

Unnamed: 0,do,ph,co,bod,na,fc,tc,ndo,nph,nco,nbod,nna,ntc
2,6.3,6.9,179.0,1.7,0.1,3243.0,5330.0,100,80,60,100,100,40
3,5.8,6.9,64.0,3.8,0.5,5382.0,8443.0,80,80,100,80,100,40
4,5.8,7.3,83.0,1.9,0.4,3428.0,5500.0,80,100,80,100,100,40
5,5.5,7.4,81.0,1.5,0.1,2853.0,4049.0,80,100,80,100,100,40
6,6.1,6.7,308.0,1.4,0.3,3355.0,5672.0,100,60,0,100,100,40
