# Classificação da qualidade da água usando MLP

## Imports

In [1]:
import pandas as pd
import numpy as np

## Leitura do dataframe (df)

In [2]:
# Load the raw CSV (ISO-8859-1 encoded) and replace missing cells with 0.
# NOTE(review): cells holding the literal text "NAN" are not filled here
# (they stay as strings) -- confirm that filling with 0 before the numeric
# coercion is really intended.
df = pd.read_csv('water_dataX.csv', encoding="ISO-8859-1").fillna(0)
df.head()

Unnamed: 0,STATION CODE,LOCATIONS,STATE,Temp,D.O. (mg/l),PH,CONDUCTIVITY (µmhos/cm),B.O.D. (mg/l),NITRATENAN N+ NITRITENANN (mg/l),FECAL COLIFORM (MPN/100ml),TOTAL COLIFORM (MPN/100ml)Mean,year
0,1393,"DAMANGANGA AT D/S OF MADHUBAN, DAMAN",DAMAN & DIU,30.6,6.7,7.5,203,NAN,0.1,11,27,2014
1,1399,ZUARI AT D/S OF PT. WHERE KUMBARJRIA CANAL JOI...,GOA,29.8,5.7,7.2,189,2,0.2,4953,8391,2014
2,1475,ZUARI AT PANCHAWADI,GOA,29.5,6.3,6.9,179,1.7,0.1,3243,5330,2014
3,3181,RIVER ZUARI AT BORIM BRIDGE,GOA,29.7,5.8,6.9,64,3.8,0.5,5382,8443,2014
4,3182,RIVER ZUARI AT MARCAIM JETTY,GOA,29.5,5.8,7.3,83,1.9,0.4,3428,5500,2014


## Tratamento dos dados

In [3]:
# Inspect the dtype pandas inferred for each column of the raw frame.
df.dtypes

STATION CODE                        object
LOCATIONS                           object
STATE                               object
Temp                                object
D.O. (mg/l)                         object
PH                                  object
CONDUCTIVITY (µmhos/cm)             object
B.O.D. (mg/l)                       object
NITRATENAN N+ NITRITENANN (mg/l)    object
FECAL COLIFORM (MPN/100ml)          object
TOTAL COLIFORM (MPN/100ml)Mean      object
year                                 int64
dtype: object

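Podemos notar que as colunas de medição foram lidas como `object` (texto), e não como tipos numéricos; por isso, elas são convertidas para valores numéricos a seguir.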

In [4]:
# Measurement columns that were parsed as text. Each one is converted to a
# numeric dtype; entries that cannot be parsed become NaN (errors='coerce').
numeric_columns = [
    'Temp',
    'D.O. (mg/l)',
    'PH',
    'B.O.D. (mg/l)',
    'CONDUCTIVITY (µmhos/cm)',
    'NITRATENAN N+ NITRITENANN (mg/l)',
    'FECAL COLIFORM (MPN/100ml)',
    'TOTAL COLIFORM (MPN/100ml)Mean',
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Show the dtypes again to confirm the conversion.
df.dtypes

STATION CODE                         object
LOCATIONS                            object
STATE                                object
Temp                                float64
D.O. (mg/l)                         float64
PH                                  float64
CONDUCTIVITY (µmhos/cm)             float64
B.O.D. (mg/l)                       float64
NITRATENAN N+ NITRITENANN (mg/l)    float64
FECAL COLIFORM (MPN/100ml)          float64
TOTAL COLIFORM (MPN/100ml)Mean      float64
year                                  int64
dtype: object

In [5]:
# Positional row window [start, end) of the raw frame that is kept.
start = 2
end = 1992

# Pick the measurement columns by position in the raw frame and cast to float64.
# Every slice uses the same start/end so the series stay row-aligned when
# they are concatenated below.
do = df.iloc[start:end, 4].astype(np.float64)    # D.O. (mg/l)
ph = df.iloc[start:end, 5].astype(np.float64)    # PH
co = df.iloc[start:end, 6].astype(np.float64)    # CONDUCTIVITY (µmhos/cm)
bod = df.iloc[start:end, 7].astype(np.float64)   # B.O.D. (mg/l)
na = df.iloc[start:end, 8].astype(np.float64)    # NITRATENAN N+ NITRITENANN (mg/l)
fc = df.iloc[start:end, 9].astype(np.float64)    # FECAL COLIFORM (MPN/100ml); not part of the frame built below
tc = df.iloc[start:end, 10].astype(np.float64)   # TOTAL COLIFORM (MPN/100ml)Mean

# Replace df with the reduced feature frame.
# NOTE: this overwrites the raw frame, so this cell cannot be re-run without
# re-running the loading and conversion cells first.
df = pd.concat([do, ph, co, bod, na, tc], axis=1)
df.columns = ['do', 'ph', 'co', 'bod', 'na', 'tc']

In [6]:
# Preview the first rows of the reduced feature frame.
df.head()

Unnamed: 0,do,ph,co,bod,na,tc
2,6.3,6.9,179.0,1.7,0.1,5330.0
3,5.8,6.9,64.0,3.8,0.5,8443.0
4,5.8,7.3,83.0,1.9,0.4,5500.0
5,5.5,7.4,81.0,1.5,0.1,4049.0
6,6.1,6.7,308.0,1.4,0.3,5672.0


In [7]:
# Check the dtypes of the reduced feature frame.
df.dtypes

do     float64
ph     float64
co     float64
bod    float64
na     float64
tc     float64
dtype: object

### Normalização

A normalização dos dados foi feita de acordo com o sugerido por Anbarivan N L e Anjali Vasudevan (colaboradores do dataset no *Kaggle*). Na célula abaixo, cada coluna é dividida pelo seu valor na primeira linha de `df`.

In [10]:
# Scale every column by its value in the first row of df (positional row 0),
# so that row becomes all ones and the other rows are expressed relative to it.
first_row = df.iloc[0]
df_norm = df.div(first_row, axis='columns')

display(df_norm)

Unnamed: 0,do,ph,co,bod,na,tc
2,1.000000,1.000000,1.000000,1.000000,1.00,1.000000
3,0.920635,1.000000,0.357542,2.235294,5.00,1.584053
4,0.920635,1.057971,0.463687,1.117647,4.00,1.031895
5,0.873016,1.072464,0.452514,0.882353,1.00,0.759662
6,0.968254,0.971014,1.720670,0.823529,3.00,1.064165
...,...,...,...,...,...,...
1986,1.253968,106.956522,0.040223,1.588235,5.18,0.037899
1987,1.190476,84.782609,0.035196,1.529412,1.55,0.059099
1988,1.206349,14.202899,0.034637,0.705882,,0.106942
1989,1.222222,13.188406,0.036313,0.764706,,0.105441


In [None]:
def _ph_rating(x):
    """Map a pH value to a quality score of 100, 80, 60, 40 or 0.

    Bands are checked from best to worst, so a value sitting exactly on a
    shared boundary gets the higher score. Values below 6.5 or above 9, and
    NaN (every comparison is False), score 0.

    The 80 band covers 6.8 <= x < 7, so values strictly between 6.9 and 7
    score 80 instead of falling through every band and scoring 0.
    """
    if 7 <= x <= 8.5:
        return 100
    if 8.5 <= x <= 8.6 or 6.8 <= x < 7:
        return 80
    if 8.6 <= x <= 8.8 or 6.7 <= x <= 6.8:
        return 60
    if 8.8 <= x <= 9 or 6.5 <= x <= 6.7:
        return 40
    return 0


# Add the pH quality score as a new column of df.
df['nph'] = df.ph.apply(_ph_rating)