Load Libraries

In [1]:
import src.util as utils
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler

Load Configuration

In [2]:
# Project configuration; later cells read the dataset path lists plus the
# "label" and "predictors" entries from it.
config = utils.load_config()

Load Dataset

In [3]:
def load_dataset(config_data: dict):
    """Load the train, validation and test splits and attach each split's label.

    Parameters
    ----------
    config_data : dict
        Must contain the keys ``train_set_path``, ``valid_set_path`` and
        ``test_set_path``. Each value is indexable: element 0 is the path of
        the pickled x (features) object and element 1 the path of the pickled
        y (label) object.

    Returns
    -------
    tuple
        ``(train_set, valid_set, test_set)``, where each item is the
        column-wise (``axis=1``) concatenation of that split's x and y.
    """
    path_keys = ("train_set_path", "valid_set_path", "test_set_path")

    # Unpickle everything up front: x then y, split by split
    loaded_pairs = [
        (
            utils.pickle_load(config_data[path_key][0]),
            utils.pickle_load(config_data[path_key][1]),
        )
        for path_key in path_keys
    ]

    # Glue x and y of every split together side by side
    train_set, valid_set, test_set = (
        pd.concat([x_part, y_part], axis = 1) for x_part, y_part in loaded_pairs
    )

    return train_set, valid_set, test_set

In [4]:
# Each returned frame holds the split's features with its label concatenated column-wise
train_set, valid_set, test_set = load_dataset(config)

In [5]:
# Shape of each split, one per line, in the order train / valid / test
print(train_set.shape, valid_set.shape, test_set.shape, sep = "\n")

(2620, 10)
(328, 10)
(328, 10)


Oversample the Training Data (SMOTE)

In [6]:
# Resample the training split only, using SMOTE with a fixed random_state.
# Features are every column except the configured label; the label column is the target.
x_ros, y_ros = SMOTE(random_state = 42).fit_resample(
    train_set.drop(columns = config["label"]),
    train_set[config["label"]]
)
# Re-attach the resampled label to the resampled features (column-wise)
train_set_bal = pd.concat([x_ros, y_ros], axis = 1)

Scaling Data (Normalization)

In [7]:

# Fit a MinMaxScaler on the resampled training features and transform them
scaler = MinMaxScaler()
names = x_ros.columns
d = scaler.fit_transform(x_ros)

# Wrap the transformed values back into a DataFrame with the original column names,
# then re-attach the (unscaled) label column
x_std = pd.DataFrame(d, columns=names)
train_set_bal_std = pd.concat([x_std, y_ros], axis = 1)
# NOTE(review): in the cells visible here, train_set_bal_std is only passed to
# describe(); outlier removal and the pickle dumps use the unscaled train_set_bal,
# and `scaler` is neither saved nor applied to valid/test -- confirm this is intended.

In [8]:
# Summary statistics of the scaled, balanced training set
train_set_bal_std.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,3196.0,3196.0,3196.0,3196.0,3196.0,3196.0,3196.0,3196.0,3196.0,3196.0
mean,0.515206,0.541752,0.355427,0.532943,0.58259,0.432318,0.485841,0.530251,0.472655,0.5
std,0.107004,0.116039,0.146039,0.122676,0.104331,0.138255,0.128453,0.124523,0.144235,0.500078
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.454914,0.47303,0.247856,0.456007,0.523831,0.329772,0.400322,0.453113,0.37403,0.0
50%,0.511706,0.542966,0.337933,0.532344,0.582483,0.427289,0.483022,0.531991,0.47395,0.5
75%,0.574257,0.61272,0.441379,0.610504,0.639772,0.529065,0.571809,0.607904,0.568976,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Outlier Removal

In [9]:
def remove_outliers(set_data, iqr_multiplier = 1.5):
    """Return the rows of ``set_data`` that are not IQR outliers in any feature.

    Every column except the last one is treated as a feature (the last column
    is assumed to be the label and is never tested). For each feature the
    fences ``q1 - iqr_multiplier * iqr`` and ``q3 + iqr_multiplier * iqr`` are
    computed from that column's 25th/75th percentiles over the full input; a
    row is kept only if it lies inside the fences of all features. Missing
    values compare as "not outside", so they never cause a row to be dropped.

    Compared with stacking one filtered copy per column and counting index
    occurrences, a single boolean mask:
      * preserves the original row order and index,
      * does not collapse rows that merely have identical values,
      * works when index labels are not unique,
      * returns the data unchanged when there are no feature columns.

    Parameters
    ----------
    set_data : pandas.DataFrame
        Feature columns followed by the label as the last column.
    iqr_multiplier : float, default 1.5
        Width of the fences in units of the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        A filtered copy; ``set_data`` itself is not modified.
    """
    # Positional mask (not index-aligned) so duplicate index labels are harmless
    keep_row = np.ones(len(set_data), dtype = bool)

    for col_name in set_data.columns[:-1]:
        column = set_data[col_name]
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - iqr_multiplier * iqr
        upper_fence = q3 + iqr_multiplier * iqr

        is_outlier = (column < lower_fence) | (column > upper_fence)
        keep_row &= ~is_outlier.to_numpy()

    return set_data.loc[keep_row].copy()

In [10]:
# Drop IQR outliers from the balanced training set.
# NOTE(review): the input is the unscaled train_set_bal, not train_set_bal_std -- confirm intended.
train_set_bal_cleaned = remove_outliers(train_set_bal)

In [11]:
# Summary statistics after outlier removal (values are unscaled: the input was train_set_bal)
train_set_bal_cleaned.describe()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
count,2689.0,2689.0,2689.0,2689.0,2689.0,2689.0,2689.0,2689.0,2689.0,2689.0
mean,7.056138,197.125394,21479.484152,7.088768,334.292479,428.150772,14.233504,65.977483,3.971829,0.499442
std,1.199744,27.458526,7952.663563,1.403651,30.58039,78.357271,3.066849,14.277681,0.7367,0.500093
min,3.902476,120.101908,320.942611,3.23958,252.23133,210.319182,5.914617,28.400877,1.922377,0.0
25%,6.328553,179.548074,15417.932834,6.17557,315.180405,370.085153,12.128377,56.932455,3.453861,0.0
50%,7.036774,197.300014,20596.391231,7.076222,334.161127,425.790228,14.150363,66.179547,3.978495,0.0
75%,7.784067,215.194145,26744.301742,8.021803,352.952803,482.673327,16.333126,75.41671,4.481552,1.0
max,10.252816,273.813807,44868.458368,10.999995,413.914001,652.537592,22.686837,104.230949,5.989543,1.0


Dump the Processed Datasets

In [12]:
# Persist each split as two pickles: predictors to path [0], label to path [1].
# Training data is the balanced, outlier-filtered frame.
utils.pickle_dump(train_set_bal_cleaned[config["predictors"]], config["train_feng_set_path"][0])
utils.pickle_dump(train_set_bal_cleaned[config["label"]], config["train_feng_set_path"][1])

# Validation and test splits are written as loaded; no resampling, scaling or
# outlier removal is applied to them in this notebook.
utils.pickle_dump(valid_set[config["predictors"]], config["valid_feng_set_path"][0])
utils.pickle_dump(valid_set[config["label"]], config["valid_feng_set_path"][1])

utils.pickle_dump(test_set[config["predictors"]], config["test_feng_set_path"][0])
utils.pickle_dump(test_set[config["label"]], config["test_feng_set_path"][1])