## Import modules

In [2]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as pp

# pd.option show rows and cols data of df
pd.set_option("display.max_rows", 5)
pd.set_option("display.max_columns", 20)

## Create compelete dataset

In [None]:
# File's year
data_years = ["2017","2018","2019","2020","2021","2022"]

# Create empty dataframe
df = pd.DataFrame()

# Concatenate dataframes
def create_dataset(df):
    for year in data_years:
        # save the csv file in ./dataset folder
        path = f'./dataset/{year}_lake_data.csv'

        df_temp = pd.read_csv(path)
        df_temp["year"] = year
        df = pd.concat([df,df_temp])
    return df

df = create_dataset(df)

df.info()
df.head()

In [4]:
# if value = "BDL" (Blow Detection Limit) convert it to 0
df.replace({"BDL": 0},inplace=True)

# if value = "-", just set it to NaN
df.replace({"-": np.nan}, inplace=True)

In [None]:
# Get labels
df_location = df.iloc[:,0:4]
df_year  = df.iloc[:,-1:]
df_label = pd.concat([df_location,df_year],axis=1)

# Get datas
df_data = df.iloc[:,4:-1]


# Ensure all datas are numeric
df_data = df_data.apply(pd.to_numeric, errors='coerce')
df_data


## Normalization column

In [42]:
# Fill NaN data with mean of the col
df_data.fillna(df_data.mean(), inplace=True)

def create_normalized_DataFrame(data,label=df_label):
    dataframe = pd.DataFrame(data=data, columns=df_data.columns)
    dataframe = pd.concat([label.reset_index(),dataframe], axis=1)
    dataframe.drop(['index'], axis=1, inplace=True)
    return dataframe

### Scaled

In [45]:
## Scaled
scaled_normalized = pp.scale(df_data)

df_scaled = create_normalized_DataFrame(scaled_normalized)
df_scaled

Unnamed: 0,STN Code,Name of Monitoring Location,Type Water Body,State Name,year,Min Temperature,Max Temperature,Min Dissolved Oxygen,Max Dissolved Oxygen,Min pH,Max pH,Min Conductivity,Max Conductivity,Min BOD,Max BOD,Min Nitrate N + Nitrite N,Max Nitrate N + Nitrite N,Min Fecal Coliform,Max Fecal Coliform,Min Total Coliform,Max Total Coliform
0,1790.0,"PULICATE LAKE , NELLORE \nDIST.",LAKE,ANDHRA \nPRADESH,2017,1.108272,-0.273000,0.494970,-0.050480,0.154477,0.752842,1.346902,20.203709,-0.392689,-0.423832,-0.209433,-0.000390,-0.216630,-0.035851,-0.038989,-0.057777
1,2353.0,"KONDACHARLA-AAVA LAKE, \nPARAWADA PHARMA CITY,...",LAKE,ANDHRA \nPRADESH,2017,0.532075,-0.273000,0.836471,-0.079681,-0.181250,0.568074,-0.045175,-0.149705,-0.348276,-0.423832,-0.023496,-0.216167,-0.214700,-0.035851,-0.040777,-0.057774
2,2205.0,"MER BEEL AT MADHABPUR, \nASSAM",LAKE,ASSAM,2017,-0.236189,-0.535945,-0.742972,0.037122,-2.195617,-2.018674,-0.330049,-0.268241,-0.392689,0.002184,-0.409953,-0.317350,-0.152708,-0.035839,-0.040737,-0.057759
3,2206.0,"DALONI BEEL NEAR \nJOGIGHOPA, ASSAM",LAKE,ASSAM,2017,0.147943,1.830561,0.494970,-0.313287,-0.684842,-0.540532,-0.327966,-0.264970,-0.407493,-0.408508,-0.409953,-0.335636,-0.152708,-0.035844,-0.040976,-0.057763
4,1263.0,ELANGABEEL SYSTEM POND \n(CONNECTED TO R. KOLA...,POND,ASSAM,2017,0.147943,1.304671,-1.383286,-0.722098,-0.349114,0.568074,-0.219120,-0.157816,0.125459,-0.043789,-0.154745,-0.073535,0.018896,-0.035829,-0.039267,-0.057702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3162,2544.0,SAHEBBANDH AT \nPURULIA,LAKE,WEST \nBENGAL,2022,-0.236189,1.830561,2.159788,0.913146,-0.013387,-0.540532,-0.231098,-0.242335,-0.126213,-0.362535,-0.337036,-0.356361,0.018896,-0.035822,-0.033823,-0.057741
3163,2522.0,SINCHAL LAKE FOR \nDARJEELING,LAKE,WEST \nBENGAL,2022,-3.693373,-2.902452,1.391410,0.504335,-0.181250,-0.355764,-0.349318,-0.278446,-0.392689,-0.439156,-0.337036,-0.352094,-0.216630,-0.035850,-0.042009,-0.057777
3164,5163.0,"SUBHAS SAROBAR, \nBELAGHATA, KOLKATA",LAKE,WEST \nBENGAL,2022,0.147943,0.778781,1.177972,0.270729,0.322341,0.383307,-0.174332,-0.229252,-0.096604,-0.356405,-0.337036,-0.338075,-0.066906,-0.035837,-0.037002,-0.057770
3165,2521.0,"WATER RESERVIOR AT \nDELO LAKE, DARJELLING",LAKE,WEST \nBENGAL,2022,-2.540978,-0.798891,1.306035,0.416733,-0.013387,-0.725300,-0.347756,-0.280016,-0.392689,-0.457546,-0.337036,-0.349656,-0.202044,-0.035830,-0.041611,-0.057741


### Min-Max (0-1)

In [46]:
minmax_normalized = pp.normalize(df_data)
df_minmax = create_normalized_DataFrame(minmax_normalized)
df_minmax

Unnamed: 0,STN Code,Name of Monitoring Location,Type Water Body,State Name,year,Min Temperature,Max Temperature,Min Dissolved Oxygen,Max Dissolved Oxygen,Min pH,Max pH,Min Conductivity,Max Conductivity,Min BOD,Max BOD,Min Nitrate N + Nitrite N,Max Nitrate N + Nitrite N,Min Fecal Coliform,Max Fecal Coliform,Min Total Coliform,Max Total Coliform
0,1790.0,"PULICATE LAKE , NELLORE \nDIST.",LAKE,ANDHRA \nPRADESH,2017,0.000172,0.000179,0.000033,0.000044,0.000045,0.000054,0.020875,0.999717,0.000006,0.000015,0.000004,0.000044,0.000013,0.000013,0.005107,0.010214
1,2353.0,"KONDACHARLA-AAVA LAKE, \nPARAWADA PHARMA CITY,...",LAKE,ANDHRA \nPRADESH,2017,0.008876,0.010356,0.002182,0.002515,0.002552,0.003107,0.220799,0.382423,0.000481,0.000851,0.000429,0.001243,0.004068,0.010726,0.129447,0.887635
2,2205.0,"MER BEEL AT MADHABPUR, \nASSAM",LAKE,ASSAM,2017,0.002975,0.004016,0.000327,0.001071,0.000848,0.001041,0.007437,0.019039,0.000149,0.002410,0.000015,0.000253,0.044622,0.297482,0.053547,0.951942
3,2206.0,"DALONI BEEL NEAR \nJOGIGHOPA, ASSAM",LAKE,ASSAM,2017,0.004034,0.006602,0.000935,0.001100,0.001210,0.001430,0.009902,0.028057,0.000165,0.000513,0.000018,0.000257,0.055013,0.220051,0.055013,0.971892
4,1263.0,ELANGABEEL SYSTEM POND \n(CONNECTED TO R. KOLA...,POND,ASSAM,2017,0.001030,0.001592,0.000033,0.000215,0.000318,0.000393,0.012316,0.045518,0.000211,0.000688,0.000037,0.000267,0.051512,0.163901,0.034185,0.983406
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3162,2544.0,SAHEBBANDH AT \nPURULIA,LAKE,WEST \nBENGAL,2022,0.001639,0.002950,0.000738,0.000836,0.000574,0.000639,0.019670,0.026718,0.000229,0.000352,0.000025,0.000087,0.090153,0.385199,0.172110,0.901531
3163,2522.0,SINCHAL LAKE FOR \nDARJEELING,LAKE,WEST \nBENGAL,2022,0.001236,0.011125,0.004450,0.005439,0.004265,0.004883,0.008035,0.030903,0.000618,0.001113,0.000185,0.000698,0.001236,0.142156,0.024723,0.988909
3164,5163.0,"SUBHAS SAROBAR, \nBELAGHATA, KOLKATA",LAKE,WEST \nBENGAL,2022,0.004916,0.007151,0.001497,0.001788,0.001609,0.001855,0.077990,0.095197,0.000670,0.001006,0.000067,0.000304,0.156427,0.513974,0.290507,0.782134
3165,2521.0,"WATER RESERVIOR AT \nDELO LAKE, DARJELLING",LAKE,WEST \nBENGAL,2022,0.000697,0.002264,0.000609,0.000740,0.000609,0.000670,0.001393,0.003309,0.000087,0.000104,0.000026,0.000102,0.006095,0.287318,0.012189,0.957728


### Binarized

In [47]:
binary_normalized = pp.binarize(scaled_normalized)

df_binarized = create_normalized_DataFrame(binary_normalized)
df_binarized

Unnamed: 0,STN Code,Name of Monitoring Location,Type Water Body,State Name,year,Min Temperature,Max Temperature,Min Dissolved Oxygen,Max Dissolved Oxygen,Min pH,Max pH,Min Conductivity,Max Conductivity,Min BOD,Max BOD,Min Nitrate N + Nitrite N,Max Nitrate N + Nitrite N,Min Fecal Coliform,Max Fecal Coliform,Min Total Coliform,Max Total Coliform
0,1790.0,"PULICATE LAKE , NELLORE \nDIST.",LAKE,ANDHRA \nPRADESH,2017,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2353.0,"KONDACHARLA-AAVA LAKE, \nPARAWADA PHARMA CITY,...",LAKE,ANDHRA \nPRADESH,2017,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2205.0,"MER BEEL AT MADHABPUR, \nASSAM",LAKE,ASSAM,2017,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2206.0,"DALONI BEEL NEAR \nJOGIGHOPA, ASSAM",LAKE,ASSAM,2017,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1263.0,ELANGABEEL SYSTEM POND \n(CONNECTED TO R. KOLA...,POND,ASSAM,2017,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3162,2544.0,SAHEBBANDH AT \nPURULIA,LAKE,WEST \nBENGAL,2022,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3163,2522.0,SINCHAL LAKE FOR \nDARJEELING,LAKE,WEST \nBENGAL,2022,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3164,5163.0,"SUBHAS SAROBAR, \nBELAGHATA, KOLKATA",LAKE,WEST \nBENGAL,2022,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3165,2521.0,"WATER RESERVIOR AT \nDELO LAKE, DARJELLING",LAKE,WEST \nBENGAL,2022,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
