## Import modules

In [48]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as pp

# Limit how much of a DataFrame pandas prints: at most 5 rows and 20 columns
display_options = {
    "display.max_rows": 5,
    "display.max_columns": 20,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

## Create complete dataset

In [None]:
# Years covered by the yearly CSV files (2017 through 2022), stored as strings
data_years = [str(year) for year in range(2017, 2023)]

# Empty frame that the yearly files are appended to
df = pd.DataFrame()

# Concatenate dataframes
def create_dataset(df, years=None, data_dir='./dataset'):
    """Append every yearly lake CSV to ``df`` and return the combined frame.

    For each year the file ``{data_dir}/{year}_lake_data.csv`` is read, tagged
    with a ``year`` column holding that year value, and stacked below the rows
    of ``df`` in the order the years are listed. Row indices are kept as read
    (no re-indexing), so index values repeat across years.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the yearly data is appended to (may be empty).
    years : iterable of str, optional
        Years to load. Defaults to the module-level ``data_years``.
    data_dir : str, optional
        Folder that contains the yearly CSV files.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by the rows of every yearly file. ``df`` itself is
        returned unchanged when there are no years to load.
    """
    if years is None:
        years = data_years

    loaded = []
    for year in years:
        yearly = pd.read_csv(f'{data_dir}/{year}_lake_data.csv')
        yearly["year"] = year
        loaded.append(yearly)

    if not loaded:
        return df

    # A 0x0 seed frame contributes neither rows nor columns; leaving it out
    # avoids the empty-entry handling that newer pandas versions warn about.
    seed = [] if df.shape == (0, 0) else [df]

    # Concatenate once instead of re-copying the growing result per year.
    return pd.concat(seed + loaded)

# Load every yearly file into the (initially empty) frame
df = create_dataset(df)

# Print a summary of the combined frame (columns, dtypes, non-null counts)
df.info()
# Preview the first rows; kept as the last expression so the cell displays it
df.head()

In [50]:
# Placeholder strings found in the raw data and the value each one becomes:
#   "BDL" (Below Detection Limit) -> 0
#   "-"                           -> NaN
placeholder_values = {"BDL": 0, "-": np.nan}

for placeholder, substitute in placeholder_values.items():
    df.replace({placeholder: substitute}, inplace=True)

In [None]:
# Label columns: the first four columns plus the last column
# (the "year" tag added while loading)
n_location_cols = 4
df_location = df.iloc[:, :n_location_cols]
df_year = df.iloc[:, -1:]
df_label = pd.concat([df_location, df_year], axis=1)

# Data columns: everything between the location columns and the last column,
# with any value that cannot be parsed as a number coerced to NaN
df_data = df.iloc[:, n_location_cols:-1].apply(
    lambda column: pd.to_numeric(column, errors='coerce')
)
df_data


## Column normalization

In [52]:
# Impute missing values column by column: each NaN becomes its column's mean
column_means = df_data.mean()
df_data.fillna(value=column_means, inplace=True)

def create_normalized_DataFrame(data, label=None, columns=None):
    """Combine transformed values with their label columns in one DataFrame.

    Parameters
    ----------
    data : array-like
        Transformed values, one column per entry of ``columns``.
    label : pandas.DataFrame, optional
        Label columns placed to the left of the values. Defaults to the
        module-level ``df_label``, looked up at call time so a rebuilt
        ``df_label`` is picked up without redefining this function.
    columns : sequence, optional
        Column names for ``data``. Defaults to ``df_data.columns``.

    Returns
    -------
    pandas.DataFrame
        ``label`` (re-indexed 0..n-1) followed by the value columns.
    """
    if label is None:
        label = df_label
    if columns is None:
        columns = df_data.columns

    values = pd.DataFrame(data=data, columns=columns)

    # Discard the old index directly. Resetting it into a column and then
    # dropping "index" fails when the index is named and would remove a
    # genuine label column that happens to be called "index".
    positional_label = label.reset_index(drop=True)
    return pd.concat([positional_label, values], axis=1)

### Scaled

In [None]:
# Standardize df_data with sklearn's `scale`; the library defaults are spelled
# out: work per column (axis=0), centre on the mean, scale to unit variance
scaled_normalized = pp.scale(df_data, axis=0, with_mean=True, with_std=True)

df_scaled = create_normalized_DataFrame(data=scaled_normalized)
df_scaled

### Min-Max (0-1)

In [None]:
# Rescale every column to the [0, 1] range (min-max scaling).
# `pp.normalize` is not used here: it rescales each *row* to unit norm,
# which is not the min-max scaling this section is about.
minmax_normalized = pp.minmax_scale(df_data, feature_range=(0, 1))
df_minmax = create_normalized_DataFrame(minmax_normalized)
df_minmax

### Binarized

In [None]:
# Binarize the standardized values: entries above the threshold (0.0, the
# library default) become 1, all others become 0
binary_normalized = pp.binarize(scaled_normalized, threshold=0.0)

df_binarized = create_normalized_DataFrame(data=binary_normalized)
df_binarized