Import Libraries

In [1]:
import pandas as pd
import numpy as np
import src.util as utils
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer

Configuration Files

In [2]:
config = utils.load_config()

Load Dataset

In [3]:
dataset = pd.read_csv(config["dataset_path"])

In [4]:
dataset

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


Data Validation

In [5]:
dataset.isnull().sum()

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64

In [6]:
dataset.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object

In [7]:
print('terdapat data kosong untuk ph, sulfate dan trihalomethanes')

terdapat data kosong untuk ph, sulfate dan trihalomethanes


Impute Data

In [8]:
imputer = KNNImputer(n_neighbors=10, weights="uniform")

In [9]:
l=imputer.fit_transform(dataset)

In [10]:
waterpot=pd.DataFrame(l,columns=dataset.columns)

In [11]:
waterpot.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [12]:
round(waterpot.describe(),2)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0,3276.0
mean,7.08,196.37,22014.09,7.12,333.62,426.21,14.28,66.43,3.97,0.39
std,1.48,32.88,8768.57,1.58,36.88,80.82,3.31,15.8,0.78,0.49
min,0.0,47.43,320.94,0.35,129.0,181.48,2.2,0.74,1.45,0.0
25%,6.25,176.85,15666.69,6.13,313.81,365.73,12.07,56.52,3.44,0.0
50%,7.03,196.97,20927.83,7.13,333.25,421.88,14.22,66.67,3.96,0.0
75%,7.91,216.67,27332.76,8.11,352.96,481.79,16.56,76.75,4.5,1.0
max,14.0,323.12,61227.2,13.13,481.03,753.34,28.3,124.0,6.74,1.0


Data Defense

In [13]:
def check_data(input_data, config):
    # Measure the range of input data
    len_input_data = len(input_data)

    # Check data types
    assert input_data.select_dtypes("int").columns.to_list() == config["int_columns"], "an error occurs in int column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == config["float_columns"], "an error occurs in float column(s)."

    # Check range of data
    assert input_data[config["float_columns"][0]].between(config["ph"][0], config["ph"][1]).sum() == len_input_data, "an error occurs in ph range."
    assert input_data[config["float_columns"][1]].between(config["Hardness"][0], config["Hardness"][1]).sum() == len_input_data, "an error occurs in Hardness range."
    assert input_data[config["float_columns"][2]].between(config["Solids"][0], config["Solids"][1]).sum() == len_input_data, "an error occurs in Solids range."
    assert input_data[config["float_columns"][3]].between(config["Chloramines"][0], config["Chloramines"][1]).sum() == len_input_data, "an error occurs in Chloramines range."
    assert input_data[config["float_columns"][4]].between(config["Sulfate"][0], config["Sulfate"][1]).sum() == len_input_data, "an error occurs in Sulfate range."
    assert input_data[config["float_columns"][5]].between(config["Conductivity"][0], config["Conductivity"][1]).sum() == len_input_data, "an error occurs in Conductivity range."
    assert input_data[config["float_columns"][6]].between(config["Organic_carbon"][0], config["Organic_carbon"][1]).sum() == len_input_data, "an error occurs in Organic_carbon range."
    assert input_data[config["float_columns"][7]].between(config["Trihalomethanes"][0], config["Trihalomethanes"][1]).sum() == len_input_data, "an error occurs in Trihalomethanes range."
    assert input_data[config["float_columns"][8]].between(config["Turbidity  "][0], config["Turbidity  "][1]).sum() == len_input_data, "an error occurs in Turbidity   range."
    assert input_data[config["int_columns"][0]].between(config["Potability"][0], config["Potabililty"][1]).sum() == len_input_data, "an error occurs in Potability range."


Data Splitting

In [14]:
x = waterpot[config["predictors"]].copy()
y = waterpot[config["label"]].copy()

In [15]:
x

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,7.417308,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135
1,3.716080,129.422921,18630.057858,6.635246,341.794864,592.885359,15.180013,56.329076,4.500656
2,8.099124,224.236259,19909.541732,9.275884,329.237272,418.606213,16.868637,66.420093,3.055934
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075
...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821
3272,7.808856,193.553212,17329.802160,8.061362,341.343026,392.449580,19.903225,62.554876,2.798243
3273,9.419510,175.762646,33155.578218,7.350233,335.228216,432.044783,11.039070,69.845400,3.298875
3274,5.126763,230.603758,11983.869376,6.303357,336.711411,402.883113,11.168946,77.488213,4.708658


In [16]:
y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
3271    1.0
3272    1.0
3273    1.0
3274    1.0
3275    1.0
Name: Potability, Length: 3276, dtype: float64

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

In [18]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

In [19]:
utils.pickle_dump(dataset, config["dataset_cleaned_path"])

utils.pickle_dump(x_train, config["train_set_path"][0])
utils.pickle_dump(y_train, config["train_set_path"][1])

utils.pickle_dump(x_valid, config["valid_set_path"][0])
utils.pickle_dump(y_valid, config["valid_set_path"][1])

utils.pickle_dump(x_test, config["test_set_path"][0])
utils.pickle_dump(y_test, config["test_set_path"][1])