In [1]:
# Import necessary basic libraries
# Libraries will be added as needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample

In [2]:
# Read Data from CSV file and Split into Train and Test
sni_df = pd.read_csv('data/sni.csv', dtype={'Target Class': str})

In [3]:
# Perform Exploratory Data Analysis
# Check data types, missing values, etc.
sni_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   ID                                                 100000 non-null  int64  
 1   Tangki Septik_Penggunaan                           79971 non-null   object 
 2   Tangki Septik_Terawat                              80001 non-null   object 
 3   Tangki Septik_Bau Tidak Sedap                      80069 non-null   object 
 4   Tangki Septik_Ketercemaran Lingkungan              80011 non-null   object 
 5   Tangki Septik_Jarak Dari Sumber Air (m)            100000 non-null  float64
 6   Sarana Pembuangan BAB_Penggunaan                   80201 non-null   object 
 7   Sarana Pembuangan BAB_Bangunan Terawat             79921 non-null   object 
 8   Sarana Pembuangan BAB_Keberadaan Dinding Bangunan  80041 non-null   object 

In [4]:
print(sni_df.head())

   ID Tangki Septik_Penggunaan Tangki Septik_Terawat   
0   0          Tidak Berfungsi                 Tidak  \
1   1          Tidak Berfungsi                    Ya   
2   2          Tidak Berfungsi                   NaN   
3   3          Tidak Berfungsi                 Tidak   
4   4                Berfungsi                   NaN   

  Tangki Septik_Bau Tidak Sedap Tangki Septik_Ketercemaran Lingkungan   
0                            Ya                                   NaN  \
1                            Ya                                   NaN   
2                         Tidak                                    Ya   
3                            Ya                                 Tidak   
4                            Ya                                   NaN   

   Tangki Septik_Jarak Dari Sumber Air (m) Sarana Pembuangan BAB_Penggunaan   
0                                     6.06                              NaN  \
1                                    12.00                  Tidak 

In [5]:
# From domain knowledge, we know that target class is a categorical variable type of string
# The value follows a format of "ABCD" with A, B, C, D are either 0 or 1
# If A is 1, then it means that "Tangki Septik" is "Layak". Else, it is "Tidak Layak"
# If B is 1, then it means that "Sarana Pembuangan BAB" is "Layak". Else, it is "Tidak Layak"
# If C is 1, then it means that "IPLT/IPAL" is "Layak". Else, it is "Tidak Layak"
# If D is 1, then it means that "Saluran Drainase" is "Layak". Else, it is "Tidak Layak"
# Since the target consists of multiple sub-targets, we will split the target into 4 columns. and then the data itself

# Create the new columns
sni_df['Tangki Septik_Class'] = ''
sni_df['Sarana Pembuangan BAB_Class'] = ''
sni_df['IPLT/IPAL_Class'] = ''
sni_df['Saluran Drainase_Class'] = ''

# Initialize the new columns with the contents
sni_df['Tangki Septik_Class'] = sni_df['Target Class'].str[0]
sni_df['Sarana Pembuangan BAB_Class'] = sni_df['Target Class'].str[1]
sni_df['IPLT/IPAL_Class'] = sni_df['Target Class'].str[2]
sni_df['Saluran Drainase_Class'] = sni_df['Target Class'].str[3]

# Check results
sni_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 31 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   ID                                                 100000 non-null  int64  
 1   Tangki Septik_Penggunaan                           79971 non-null   object 
 2   Tangki Septik_Terawat                              80001 non-null   object 
 3   Tangki Septik_Bau Tidak Sedap                      80069 non-null   object 
 4   Tangki Septik_Ketercemaran Lingkungan              80011 non-null   object 
 5   Tangki Septik_Jarak Dari Sumber Air (m)            100000 non-null  float64
 6   Sarana Pembuangan BAB_Penggunaan                   80201 non-null   object 
 7   Sarana Pembuangan BAB_Bangunan Terawat             79921 non-null   object 
 8   Sarana Pembuangan BAB_Keberadaan Dinding Bangunan  80041 non-null   object 

In [6]:
# Creates 4 different dataframes for each target
sni_tangki_septik_range = [1, 2, 3, 4, 5]
sni_sp_bab_range = [6, 7, 8, 9, 10, 11, 12]
sni_iplt_range = [13, 14, 15, 16, 17, 18]
sni_drainase_range = [19, 20, 21, 22, 23, 24, 25]

sni_tangki_septik = sni_df.iloc[:, sni_tangki_septik_range + [-4]]
sni_sp_bab = sni_df.iloc[:, sni_sp_bab_range + [-3]]
sni_iplt = sni_df.iloc[:, sni_iplt_range + [-2]]
sni_drainase = sni_df.iloc[:, sni_drainase_range + [-1]]

In [7]:
# Check results
print(sni_tangki_septik.info())
print("------------------------------------------------------------------")
print(sni_sp_bab.info())
print("------------------------------------------------------------------")
print(sni_iplt.info())
print("------------------------------------------------------------------")
print(sni_drainase.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 6 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   Tangki Septik_Penggunaan                 79971 non-null   object 
 1   Tangki Septik_Terawat                    80001 non-null   object 
 2   Tangki Septik_Bau Tidak Sedap            80069 non-null   object 
 3   Tangki Septik_Ketercemaran Lingkungan    80011 non-null   object 
 4   Tangki Septik_Jarak Dari Sumber Air (m)  100000 non-null  float64
 5   Tangki Septik_Class                      100000 non-null  object 
dtypes: float64(1), object(5)
memory usage: 4.6+ MB
None
------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column                                             Non-Null Count   Dtype 
---  -----

In [8]:
# Then, split the data into train and test
sni_tangki_septik_train, sni_tangki_septik_test = train_test_split(sni_tangki_septik, test_size=0.2, random_state=42)
sni_sp_bab_train, sni_sp_bab_test = train_test_split(sni_sp_bab, test_size=0.2, random_state=42)
sni_iplt_train, sni_iplt_test = train_test_split(sni_iplt, test_size=0.2, random_state=42)
sni_drainase_train, sni_drainase_test = train_test_split(sni_drainase, test_size=0.2, random_state=42)

In [9]:
# Now, preprocess the training data for machine learning modelling
# We need to find duplicate rows in each database and count them to make sure they're of random instances instead of results of the natural variations since the data is computer-generated
# Also delete them, as it will reduce overfitting and reduce model's training time and bias
database_title = ["Tangki Septik", "Sarana Pembuangan BAB", "IPLT/IPAL", "Saluran Drainase"]
database = [sni_tangki_septik_train, sni_sp_bab_train, sni_iplt_train, sni_drainase_train]

for data_idx in range(len(database)):
    print("------------------------------------------------------------------")
    print("Duplicate rows in " + database_title[data_idx] + " database:" + str(database[data_idx].duplicated().sum()))
    print("Percentage of duplicate rows in " + database_title[data_idx] + " database:" + str(round(database[data_idx].duplicated().sum() / len(database[data_idx]) * 100, 2)) + "%")
    database[data_idx] = database[data_idx].drop_duplicates()

------------------------------------------------------------------
Duplicate rows in Tangki Septik database:9599
Percentage of duplicate rows in Tangki Septik database:12.0%
------------------------------------------------------------------
Duplicate rows in Sarana Pembuangan BAB database:77820
Percentage of duplicate rows in Sarana Pembuangan BAB database:97.28%
------------------------------------------------------------------
Duplicate rows in IPLT/IPAL database:8659
Percentage of duplicate rows in IPLT/IPAL database:10.82%
------------------------------------------------------------------
Duplicate rows in Saluran Drainase database:65154
Percentage of duplicate rows in Saluran Drainase database:81.44%


In [10]:
# Find missing values and then decides what to do with it
for data_idx in range(len(database)):
    print("------------------------------------------------------------------")
    print("Currently processing: " + database_title[data_idx])
    print("------------------------------------------------------------------")
    data = database[data_idx]
    length_data = len(data)
    for col in data.columns:
        print("Column {} has {} missing values. Percentage from total length: {}".format(col, data[col].isnull().sum(), data[col].isnull().sum()/length_data*100))
        print("Column {} has {} unique values.".format(col, len(data[col].unique())))
        print()
    print()

------------------------------------------------------------------
Currently processing: Tangki Septik
------------------------------------------------------------------
Column Tangki Septik_Penggunaan has 14959 missing values. Percentage from total length: 21.248277723327792
Column Tangki Septik_Penggunaan has 3 unique values.

Column Tangki Septik_Terawat has 14887 missing values. Percentage from total length: 21.146006448772035
Column Tangki Septik_Terawat has 3 unique values.

Column Tangki Septik_Bau Tidak Sedap has 14851 missing values. Percentage from total length: 21.094870811494154
Column Tangki Septik_Bau Tidak Sedap has 3 unique values.

Column Tangki Septik_Ketercemaran Lingkungan has 14858 missing values. Percentage from total length: 21.104813852075964
Column Tangki Septik_Ketercemaran Lingkungan has 3 unique values.

Column Tangki Septik_Jarak Dari Sumber Air (m) has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Jarak Dari Sumber Air (m) has 50

In [11]:
# Since there's a large percentage of missing values, it is impossible to fill them with the mean or median as it can ruin the distribution of the data
# We will drop the rows with too many missing values. Definition of too many is 50% of the total length of the data excluding the target class
for data_idx in range(len(database)):
    print("------------------------------------------------------------------")
    print("Currently processing: " + database_title[data_idx])
    print("Old length of the data is: " + str(len(database[data_idx])))
    data = database[data_idx]
    width_data = len(data.columns) - 1 # Exclude the target class
    threshold = round(width_data * 0.5)
    data = data.dropna(thresh=width_data - threshold + 1) 
    print("New length of the data is: " + str(len(data)))
    print("------------------------------------------------------------------")
    print()

------------------------------------------------------------------
Currently processing: Tangki Septik
Old length of the data is: 70401
New length of the data is: 68236
------------------------------------------------------------------

------------------------------------------------------------------
Currently processing: Sarana Pembuangan BAB
Old length of the data is: 2180
New length of the data is: 2087
------------------------------------------------------------------

------------------------------------------------------------------
Currently processing: IPLT/IPAL
Old length of the data is: 71341
New length of the data is: 70814
------------------------------------------------------------------

------------------------------------------------------------------
Currently processing: Saluran Drainase
Old length of the data is: 14846
New length of the data is: 14771
------------------------------------------------------------------



In [12]:
# Now, to process the remaining missing values, we will fill them with the mode of the column
for data_idx in range(len(database)):
    data = database[data_idx]
    length_data = len(data)
    for col in data.columns:
        data.loc[:, col] = data.loc[:, col].fillna(data.loc[:, col].mode()[0])

# Recheck the missing values
for data_idx in range(len(database)):
    print("------------------------------------------------------------------")
    print("Currently processing: " + database_title[data_idx])
    print("------------------------------------------------------------------")
    data = database[data_idx]
    length_data = len(data)
    for col in data.columns:
        print("Column {} has {} missing values. Percentage from total length: {}".format(col, data[col].isnull().sum(), data[col].isnull().sum()/length_data*100))
        print("Column {} has {} unique values.".format(col, len(data[col].unique())))
        print()
    print()

------------------------------------------------------------------
Currently processing: Tangki Septik
------------------------------------------------------------------
Column Tangki Septik_Penggunaan has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Penggunaan has 2 unique values.

Column Tangki Septik_Terawat has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Terawat has 2 unique values.

Column Tangki Septik_Bau Tidak Sedap has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Bau Tidak Sedap has 2 unique values.

Column Tangki Septik_Ketercemaran Lingkungan has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Ketercemaran Lingkungan has 2 unique values.

Column Tangki Septik_Jarak Dari Sumber Air (m) has 0 missing values. Percentage from total length: 0.0
Column Tangki Septik_Jarak Dari Sumber Air (m) has 5001 unique values.

Column Tangki Septik_Class has 0 missing values. Percenta

In [13]:
# Next, we need to check for outliers.
# There's only two columns that can be checked for outliers, which are "Tangki Septik_Jarak Dari Sumber Air (m)" and "IPLT_Jarak Dari Permukiman (km)"
# We will also use this opportunity to use StandardScaler to scale the data
# For the outlier check, we'll use the IQR method
database_title = ["Tangki Septik", "IPLT/IPAL"]
database = [sni_tangki_septik_train, sni_iplt_train]
columns = ["Tangki Septik_Jarak Dari Sumber Air (m)", "IPLT_Jarak Dari Permukiman (km)"]

for idx in range(len(database_title)):
    print("------------------------------------------------------------------")
    print("Currently processing: " + database_title[idx] + "for column: " + columns[idx])
    data = database[idx]
    column_name = columns[idx]
    q1 = data[column_name].quantile(0.25)
    q3 = data[column_name].quantile(0.75)
    iqr = q3 - q1
    upper_limit = q3 + 1.5 * iqr
    lower_limit = q1 - 1.5 * iqr
    outliers = data[(data[column_name] < lower_limit) | (data[column_name] > upper_limit)]
    print("Number of outliers: " + str(len(outliers)))
    print("Percentage of outliers: " + str(round(len(outliers) / len(data) * 100, 2)) + "%")
    if len(outliers) == 0:
        scaler = RobustScaler()
        data[column_name] = scaler.fit_transform(data[column_name].values.reshape(-1, 1))
    print(data.describe())
    print("------------------------------------------------------------------")
        
        

------------------------------------------------------------------
Currently processing: Tangki Septikfor column: Tangki Septik_Jarak Dari Sumber Air (m)
Number of outliers: 0
Percentage of outliers: 0.0%
       Tangki Septik_Jarak Dari Sumber Air (m)
count                             8.000000e+04
mean                             -2.285563e-03
std                               5.802509e-01
min                              -1.010571e+00
25%                              -5.031712e-01
50%                               7.153024e-17
75%                               4.968288e-01
max                               1.002920e+00
------------------------------------------------------------------
------------------------------------------------------------------
Currently processing: IPLT/IPALfor column: IPLT_Jarak Dari Permukiman (km)
Number of outliers: 0
Percentage of outliers: 0.0%
       IPLT_Jarak Dari Permukiman (km)
count                     80000.000000
mean                          0.00

In [14]:
# Since there are no outliers, we can proceed to the next step, which is to split the data into features and target class
sni_tangki_septik_train_x = sni_tangki_septik_train.drop("Tangki Septik_Class", axis=1)
sni_tangki_septik_train_y = sni_tangki_septik_train["Tangki Septik_Class"]

sni_tangki_septik_test_x = sni_tangki_septik_test.drop("Tangki Septik_Class", axis=1)
sni_tangki_septik_test_y = sni_tangki_septik_test["Tangki Septik_Class"]

sni_sp_bab_train_x = sni_sp_bab_train.drop("Sarana Pembuangan BAB_Class", axis=1)
sni_sp_bab_train_y = sni_sp_bab_train["Sarana Pembuangan BAB_Class"]

sni_sp_bab_test_x = sni_sp_bab_test.drop("Sarana Pembuangan BAB_Class", axis=1)
sni_sp_bab_test_y = sni_sp_bab_test["Sarana Pembuangan BAB_Class"]

sni_iplt_train_x = sni_iplt_train.drop("IPLT/IPAL_Class", axis=1)
sni_iplt_train_y = sni_iplt_train["IPLT/IPAL_Class"]

sni_iplt_test_x = sni_iplt_test.drop("IPLT/IPAL_Class", axis=1)
sni_iplt_test_y = sni_iplt_test["IPLT/IPAL_Class"]

sni_drainase_train_x = sni_drainase_train.drop("Saluran Drainase_Class", axis=1)
sni_drainase_train_y = sni_drainase_train["Saluran Drainase_Class"]

sni_drainase_test_x = sni_drainase_test.drop("Saluran Drainase_Class", axis=1)
sni_drainase_test_y = sni_drainase_test["Saluran Drainase_Class"]


In [15]:
# Lastly, we need to encode the categorical data
# We do this after splitting the data because we don't want to accidentally leak the feature data to the target class

# First, retrieve the categorical columns and set them as categorical
feature_database = [sni_tangki_septik_train_x, sni_sp_bab_train_x, sni_iplt_train_x, sni_drainase_train_x, sni_tangki_septik_test_x, sni_sp_bab_test_x, sni_iplt_test_x, sni_drainase_test_x]

for data_idx in range(len(feature_database)):
    categorical_columns = []
    data = feature_database[data_idx]
    
    for col in data.columns:
        if data[col].dtype == 'object':
            categorical_columns.append(col)
            data.loc[:, col] = data[col].astype('category')
    
    temp = pd.get_dummies(data, columns=categorical_columns, dtype='int64')
    data = temp.reindex(columns=categorical_columns + temp.columns.tolist())
    data = data.drop(categorical_columns, axis=1)
    
    feature_database[data_idx] = data
    
    print(data.info())
    print("------------------------------------------------------------------")
    
# Re-set the database values
sni_tangki_septik_train_x = feature_database[0]
sni_sp_bab_train_x = feature_database[1]
sni_iplt_train_x = feature_database[2]
sni_drainase_train_x = feature_database[3]
sni_tangki_septik_test_x = feature_database[4]
sni_sp_bab_test_x = feature_database[5]
sni_iplt_test_x = feature_database[6]
sni_drainase_test_x = feature_database[7]

# We need to make sure the order of columns are the same for both training and testing data
if sni_tangki_septik_train_x.columns.tolist() != sni_tangki_septik_test_x.columns.tolist():
    print("Columns are not the same for both training and testing data for Tangki Septik")
if sni_sp_bab_train_x.columns.tolist() != sni_sp_bab_test_x.columns.tolist():
    print("Columns are not the same for both training and testing data for Sarana Pembuangan BAB")
if sni_iplt_train_x.columns.tolist() != sni_iplt_test_x.columns.tolist():
    print("Columns are not the same for both training and testing data for IPLT/IPAL")
if sni_drainase_train_x.columns.tolist() != sni_drainase_test_x.columns.tolist():
    print("Columns are not the same for both training and testing data for Saluran Drainase")

<class 'pandas.core.frame.DataFrame'>
Index: 80000 entries, 75220 to 15795
Data columns (total 9 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Tangki Septik_Jarak Dari Sumber Air (m)      80000 non-null  float64
 1   Tangki Septik_Penggunaan_Berfungsi           80000 non-null  int64  
 2   Tangki Septik_Penggunaan_Tidak Berfungsi     80000 non-null  int64  
 3   Tangki Septik_Terawat_Tidak                  80000 non-null  int64  
 4   Tangki Septik_Terawat_Ya                     80000 non-null  int64  
 5   Tangki Septik_Bau Tidak Sedap_Tidak          80000 non-null  int64  
 6   Tangki Septik_Bau Tidak Sedap_Ya             80000 non-null  int64  
 7   Tangki Septik_Ketercemaran Lingkungan_Tidak  80000 non-null  int64  
 8   Tangki Septik_Ketercemaran Lingkungan_Ya     80000 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 6.1 MB
None
------------------------

In [16]:
# Lastly, we need to check on the class imbalance.
# We can do this by checking the percentage of each class in the target class
# If the data is imbalanced, we need to recombine the x and y data, and then perform undersampling/oversampling based on the minority/majority class
class_columns = ["Tangki Septik_Class", "Sarana Pembuangan BAB_Class", "IPLT/IPAL_Class", "Saluran Drainase_Class"]
class_database = [sni_tangki_septik_train_y, sni_sp_bab_train_y, sni_iplt_train_y, sni_drainase_train_y]

# First, recast the class data to int64
for database in class_database:
    database = database.astype("int64")

# Count the number of each class (it's binary, only 1 or 0)
for data_idx in range(len(class_database)):
    data = class_database[data_idx]
    print("------------------------------------------------------------------")
    print("Currently processing: " + class_columns[data_idx])
    print(data.value_counts())
    print(data.head())

------------------------------------------------------------------
Currently processing: Tangki Septik_Class
Tangki Septik_Class
0    68563
1    11437
Name: count, dtype: int64
75220    0
48955    0
44966    0
13568    0
92727    0
Name: Tangki Septik_Class, dtype: object
------------------------------------------------------------------
Currently processing: Sarana Pembuangan BAB_Class
Sarana Pembuangan BAB_Class
0    79463
1      537
Name: count, dtype: int64
75220    0
48955    0
44966    0
13568    0
92727    0
Name: Sarana Pembuangan BAB_Class, dtype: object
------------------------------------------------------------------
Currently processing: IPLT/IPAL_Class
IPLT/IPAL_Class
0    62411
1    17589
Name: count, dtype: int64
75220    0
48955    0
44966    0
13568    1
92727    0
Name: IPLT/IPAL_Class, dtype: object
------------------------------------------------------------------
Currently processing: Saluran Drainase_Class
Saluran Drainase_Class
1    42679
0    37321
Name: count,

In [17]:
# We hope to see a 50:50 split, 40:60 split, or 20:80 split. If not, we need to perform undersampling/oversampling
# That said, we need to perform undersampling of "0" class to data "Tangki Septik" and "Sistem Pembuangan BAB"

# So, we need to combine the x and y data first
# We will process "Tangki Septik" data first
# Concat the database and then split it based on the majority class and minority class
sni_tangki_septik_train = pd.concat([sni_tangki_septik_train_x, sni_tangki_septik_train_y], axis=1)

minority_class = sni_tangki_septik_train[sni_tangki_septik_train["Tangki Septik_Class"] == "1"]
majority_class = sni_tangki_septik_train[sni_tangki_septik_train["Tangki Septik_Class"] == "0"]

minority_size = len(minority_class)
print(minority_size)

undersampled_majority_class = majority_class.sample(n=round(minority_size * 1.65), random_state=42) # To make the split 60:40

sni_tangki_septik_train = pd.concat([minority_class, undersampled_majority_class], axis=0)

# print(sni_tangki_septik_train.info())
# print(sni_tangki_septik_train["Tangki Septik_Class"].value_counts())

# Then split it again as x and y
sni_tangki_septik_train_x = sni_tangki_septik_train.drop("Tangki Septik_Class", axis=1)
sni_tangki_septik_train_y = sni_tangki_septik_train["Tangki Septik_Class"]

11437


In [18]:
# Do the same for "Sistem Pembuangan BAB" data first
# Concat the database and then split it based on the majority class and minority class
sni_sp_bab_train = pd.concat([sni_sp_bab_train_x, sni_sp_bab_train_y], axis=1)

minority_class = sni_sp_bab_train[sni_sp_bab_train["Sarana Pembuangan BAB_Class"] == "1"]
majority_class = sni_sp_bab_train[sni_sp_bab_train["Sarana Pembuangan BAB_Class"] == "0"]

minority_size = len(minority_class)
print(minority_size)

undersampled_majority_class = majority_class.sample(n=round(minority_size * 1.65), random_state=42) # To make the split 60:40

sni_sp_bab_train = pd.concat([minority_class, undersampled_majority_class], axis=0)

print(sni_sp_bab_train.info())
print(sni_sp_bab_train["Sarana Pembuangan BAB_Class"].value_counts())

sni_sp_bab_train_x = sni_sp_bab_train.drop("Sarana Pembuangan BAB_Class", axis=1)
sni_sp_bab_train_y = sni_sp_bab_train["Sarana Pembuangan BAB_Class"]

537
<class 'pandas.core.frame.DataFrame'>
Index: 1423 entries, 51435 to 83472
Data columns (total 15 columns):
 #   Column                                                   Non-Null Count  Dtype 
---  ------                                                   --------------  ----- 
 0   Sarana Pembuangan BAB_Penggunaan_Berfungsi               1423 non-null   int64 
 1   Sarana Pembuangan BAB_Penggunaan_Tidak Berfungsi         1423 non-null   int64 
 2   Sarana Pembuangan BAB_Bangunan Terawat_Tidak             1423 non-null   int64 
 3   Sarana Pembuangan BAB_Bangunan Terawat_Ya                1423 non-null   int64 
 4   Sarana Pembuangan BAB_Keberadaan Dinding Bangunan_Tidak  1423 non-null   int64 
 5   Sarana Pembuangan BAB_Keberadaan Dinding Bangunan_Ya     1423 non-null   int64 
 6   Sarana Pembuangan BAB_Keberadaan Ventilasi Pintu_Tidak   1423 non-null   int64 
 7   Sarana Pembuangan BAB_Keberadaan Ventilasi Pintu_Ya      1423 non-null   int64 
 8   Sarana Pembuangan BAB_Penerangan y

In [19]:
# Exploratory Data Analysis and Data Preprocessing completed
# We will not use PCA or any other feature selection methods as the number of features are not that many
# Next, we will create machine learning models for each of the data.
# For this assignment, we will create in total 12 models, 3 for each data

# First model is Logistic Regression.
# Reason for choosing this model is because it is simple and perfect for binary classification (1 or 0)
# The domain knowledge source also said that the data is linearly separable, so this model is perfect for this data
# By data, I mean all of the data, not just one of them ("Tangki Septik", "Sistem Pembuangan BAB", "Saluran Drainase", and "IPLT/IPAL")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

model_tangki_septik = LogisticRegression()
model_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

model_sp_bab = LogisticRegression()
model_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

model_drainase = LogisticRegression()
model_drainase.fit(sni_drainase_train_x, sni_drainase_train_y)

model_iplt = LogisticRegression()
model_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

In [27]:
# Now, we need to test the model using the test data and then evaluate the models

# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = model_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik Logistic Regression Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = model_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB Logistic Regression Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Saluran Drainase" data
sni_drainase_test_y_pred = model_drainase.predict(sni_drainase_test_x)
accuracy = accuracy_score(sni_drainase_test_y, sni_drainase_test_y_pred)
conf_matrix = confusion_matrix(sni_drainase_test_y, sni_drainase_test_y_pred)
class_report = classification_report(sni_drainase_test_y, sni_drainase_test_y_pred)

print("Saluran Drainase Logistic Regression Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = model_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL Logistic Regression Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")


Tangki Septik Logistic Regression Evaluation Results
Accuracy:  0.19925
Confusion Matrix: 
 [[ 1128 16015]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.07      0.12     17143
           1       0.15      1.00      0.26      2857

    accuracy                           0.20     20000
   macro avg       0.58      0.53      0.19     20000
weighted avg       0.88      0.20      0.14     20000

---------------------------------------------
Sistem Pembuangan BAB Logistic Regression Evaluation Results
Accuracy:  0.99185
Confusion Matrix: 
 [[19709   163]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     19872
           1       0.44      1.00      0.61       128

    accuracy                           0.99     20000
   macro avg       0.72      1.00      0.80     20000
weighted avg       1.00      0.99      0.99     200

In [28]:
# From the results above, we can see that for "Saluran Drainase" data, the model is perfect with 100% accuracy
# Therefore, we can assume that the data is 100% linearly separable and hyperparameter tuning is not needed

# Next, we need to use hyperparameter tuning for the other 3 models as it is still not quite there yet
# We will use GridSearchCV for this task
# We will also use 5-fold cross validation to make sure that the model is not overfitting
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

# For "Tangki Septik" data
grid_search = GridSearchCV(estimator=model_tangki_septik, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

best_params = grid_search.best_params_

best_model_tangki_septik = LogisticRegression(**best_params)
best_model_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

# For "Sistem Pembuangan BAB" data
grid_search = GridSearchCV(estimator=model_sp_bab, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

best_params = grid_search.best_params_

best_model_sp_bab = LogisticRegression(**best_params)
best_model_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

# For "IPLT/IPAL" data
grid_search = GridSearchCV(estimator=model_iplt, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_iplt_train_x, sni_iplt_train_y)

best_params = grid_search.best_params_

best_model_iplt = LogisticRegression(**best_params)
best_model_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 36 candidates, totalling 180 fits




Fitting 5 folds for each of 36 candidates, totalling 180 fits


In [29]:
# Now we will re-evaluate the models using the test data just like before
# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = best_model_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik Logistic Regression with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = best_model_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB Logistic Regression with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = best_model_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL Logistic Regression with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")


Tangki Septik Logistic Regression with Hyperparameter Tuning Evaluation Results
Accuracy:  0.18405
Confusion Matrix: 
 [[  824 16319]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.05      0.09     17143
           1       0.15      1.00      0.26      2857

    accuracy                           0.18     20000
   macro avg       0.57      0.52      0.18     20000
weighted avg       0.88      0.18      0.12     20000

---------------------------------------------
Sistem Pembuangan BAB Logistic Regression with Hyperparameter Tuning Evaluation Results
Accuracy:  0.99625
Confusion Matrix: 
 [[19797    75]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19872
           1       0.63      1.00      0.77       128

    accuracy                           1.00     20000
   macro avg       0.82      1.00      0.89     200

In [30]:
# After parameter tuning, we can see that the accuracy for the 3 models have mixed results
# For the "Sistem Pembuangan BAB" data, the accuracy has increased from 0.99185 to 0.99625
# While for the "Tangki Septik" data, the accuracy has decreased to 0.18405 from 0.19925
# The same goes to "IPLT/IPAL" data, the accuracy has decreased to 0.3298 from 0.3449

# We will now attempt to use other classification models to see if we can get better results
# We will use K-Nearest Neighbors (KNN) for the second model
# Reason is that KNN is a non-parametric model and it is also a lazy learner
# This means that it does not make any assumptions about the data distribution and it does not learn anything from the training data
from sklearn.neighbors import KNeighborsClassifier

# For "Tangki Septik" data
knn_tangki_septik = KNeighborsClassifier(n_neighbors=5)
knn_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

# For "Sistem Pembuangan BAB" data
knn_sp_bab = KNeighborsClassifier(n_neighbors=5)
knn_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

# For "IPLT/IPAL" data
knn_iplt = KNeighborsClassifier(n_neighbors=5)
knn_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

In [31]:
# Now, we need to test the model using the test data and then evaluate the models

# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = knn_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik KNN Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = knn_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB KNN Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = knn_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL KNN Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

Tangki Septik KNN Evaluation Results
Accuracy:  0.958
Confusion Matrix: 
 [[16303   840]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.95      0.97     17143
           1       0.77      1.00      0.87      2857

    accuracy                           0.96     20000
   macro avg       0.89      0.98      0.92     20000
weighted avg       0.97      0.96      0.96     20000

---------------------------------------------
Sistem Pembuangan BAB KNN Evaluation Results
Accuracy:  0.98005
Confusion Matrix: 
 [[19473   399]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     19872
           1       0.24      1.00      0.39       128

    accuracy                           0.98     20000
   macro avg       0.62      0.99      0.69     20000
weighted avg       1.00      0.98      0.99     20000

------------------------------

In [33]:
# You can see from the results above that it's inverted to the results from the Logistic Regression model
# Both models for "Tangki Septik" and "IPLT/IPAL" have increased marginally in accuracy
# While the model for "Sistem Pembuangan BAB" has decreased in accuracy

# Now, we will perform hyperparameter tuning for the KNN model. Again we will use GridSearchCV since there's not many combinations to try
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15]}

# For "Tangki Septik" data
grid_search = GridSearchCV(estimator=knn_tangki_septik, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

best_params = grid_search.best_params_

best_knn_tangki_septik = grid_search.best_estimator_
best_knn_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

# For "Sistem Pembuangan BAB" data
grid_search = GridSearchCV(estimator=knn_sp_bab, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

best_params = grid_search.best_params_

best_knn_sp_bab = grid_search.best_estimator_
best_knn_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

# For "IPLT/IPAL" data
grid_search = GridSearchCV(estimator=knn_iplt, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_iplt_train_x, sni_iplt_train_y)

best_params = grid_search.best_params_

best_knn_iplt = grid_search.best_estimator_
best_knn_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Fitting 5 folds for each of 7 candidates, totalling 35 fits
Fitting 5 folds for each of 7 candidates, totalling 35 fits


In [34]:
# Now we will re-evaluate the models using the test data just like before
# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = best_knn_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik K-Nearest Neighbor Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = best_knn_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB K-Nearest Neighbor Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = best_knn_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL K-Nearest Neighbor Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

Tangki Septik K-Nearest Neighbor Classifier with Hyperparameter Tuning Evaluation Results
Accuracy:  0.96025
Confusion Matrix: 
 [[16348   795]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.95      0.98     17143
           1       0.78      1.00      0.88      2857

    accuracy                           0.96     20000
   macro avg       0.89      0.98      0.93     20000
weighted avg       0.97      0.96      0.96     20000

---------------------------------------------
Sistem Pembuangan BAB K-Nearest Neighbor Classifier with Hyperparameter Tuning Evaluation Results
Accuracy:  0.98575
Confusion Matrix: 
 [[19587   285]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     19872
           1       0.31      1.00      0.47       128

    accuracy                           0.99     20000
   macro avg       0.65      0.

In [35]:
# Overall, there is a slight increase in accuracy for all models except for the "IPLT/IPAL" model which shows no change

# Moving on to the next model, we will use the Random Forest Classifier
# Reason for choosing this method is because comparing the previous 2 methods, at first it is thought that the data is linearly separable
# Which is most likely that it is not (except for "Saluran Drainase" data) since Logistic Regression is outperformed by KNN
# So the hypothesis is that the data is non-linearly separable and that there's no independence of features, which is why we will use Random Forest Classifier
# Also, this classifier is also of ensemble method, which means it will combine multiple decision trees for a better accuracy
from sklearn.ensemble import RandomForestClassifier

# For "Tangki Septik" data
rf_tangki_septik = RandomForestClassifier()
rf_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

# For "Sistem Pembuangan BAB" data
rf_sp_bab = RandomForestClassifier()
rf_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

# For "IPLT/IPAL" data
rf_iplt = RandomForestClassifier()
rf_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

In [36]:
# Now, we need to test the model using the test data and then evaluate the models

# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = rf_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik Random Forest Classifier Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = rf_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB Random Forest Classifier Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = rf_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL Random Forest Classifier Evaluation Results")
print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")


Tangki Septik Random Forest Classifier Evaluation Results
Accuracy:  0.98115
Confusion Matrix: 
 [[16766   377]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     17143
           1       0.88      1.00      0.94      2857

    accuracy                           0.98     20000
   macro avg       0.94      0.99      0.96     20000
weighted avg       0.98      0.98      0.98     20000

---------------------------------------------
Sistem Pembuangan BAB Random Forest Classifier Evaluation Results
Accuracy:  0.9979
Confusion Matrix: 
 [[19830    42]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19872
           1       0.75      1.00      0.86       128

    accuracy                           1.00     20000
   macro avg       0.88      1.00      0.93     20000
weighted avg       1.00      1.00      1.0

In [37]:
# You can see from the results above that it has really good performance for all models for all data compared to the previous two methods

# Now, we will perform hyperparameter tuning for the KNN model. Again we will use GridSearchCV since there's not many combinations to try
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}

# For "Tangki Septik" data
grid_search = GridSearchCV(estimator=rf_tangki_septik, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

best_params = grid_search.best_params_
best_score_tangki_septik = grid_search.best_score_

best_rf_tangki_septik = grid_search.best_estimator_
best_rf_tangki_septik.fit(sni_tangki_septik_train_x, sni_tangki_septik_train_y)

# For "Sistem Pembuangan BAB" data
grid_search = GridSearchCV(estimator=rf_sp_bab, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

best_params = grid_search.best_params_
best_score_sp_bab = grid_search.best_score_

best_rf_sp_bab = grid_search.best_estimator_
best_rf_sp_bab.fit(sni_sp_bab_train_x, sni_sp_bab_train_y)

# For "IPLT/IPAL" data
grid_search = GridSearchCV(estimator=rf_iplt, param_grid=param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)
grid_search.fit(sni_iplt_train_x, sni_iplt_train_y)

best_params = grid_search.best_params_
best_score_iplt = grid_search.best_score_

best_rf_iplt = grid_search.best_estimator_
best_rf_iplt.fit(sni_iplt_train_x, sni_iplt_train_y)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [38]:
# Now we will re-evaluate the models using the test data just like before
# For "Tangki Septik" data
sni_tangki_septik_test_y_pred = best_rf_tangki_septik.predict(sni_tangki_septik_test_x)
accuracy = accuracy_score(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
conf_matrix = confusion_matrix(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)
class_report = classification_report(sni_tangki_septik_test_y, sni_tangki_septik_test_y_pred)

print("Tangki Septik Random Forest Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Best Score: ", best_score_tangki_septik)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "Sistem Pembuangan BAB" data
sni_sp_bab_test_y_pred = best_rf_sp_bab.predict(sni_sp_bab_test_x)
accuracy = accuracy_score(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
conf_matrix = confusion_matrix(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)
class_report = classification_report(sni_sp_bab_test_y, sni_sp_bab_test_y_pred)

print("Sistem Pembuangan BAB Random Forest Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Best Score: ", best_score_sp_bab)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

# For "IPLT/IPAL" data
sni_iplt_test_y_pred = best_rf_iplt.predict(sni_iplt_test_x)
accuracy = accuracy_score(sni_iplt_test_y, sni_iplt_test_y_pred)
conf_matrix = confusion_matrix(sni_iplt_test_y, sni_iplt_test_y_pred)
class_report = classification_report(sni_iplt_test_y, sni_iplt_test_y_pred)

print("IPLT/IPAL Random Forest Classifier with Hyperparameter Tuning Evaluation Results")
print("Accuracy: ", accuracy)
print("Best Score: ", best_score_iplt)
print("Confusion Matrix: \n", conf_matrix)
print("Classification Report: \n", class_report)
print("---------------------------------------------")

Tangki Septik Random Forest Classifier with Hyperparameter Tuning Evaluation Results
Accuracy:  0.98115
Best Score:  1.0
Confusion Matrix: 
 [[16766   377]
 [    0  2857]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     17143
           1       0.88      1.00      0.94      2857

    accuracy                           0.98     20000
   macro avg       0.94      0.99      0.96     20000
weighted avg       0.98      0.98      0.98     20000

---------------------------------------------
Sistem Pembuangan BAB Random Forest Classifier with Hyperparameter Tuning Evaluation Results
Accuracy:  0.9979
Best Score:  1.0
Confusion Matrix: 
 [[19830    42]
 [    0   128]]
Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     19872
           1       0.75      1.00      0.86       128

    accuracy                           1.00     20000
   macro

In [40]:
# After all the model and hyperparameter tuning process has completed, the following are the best models for each data
# For "Tangki Septik" data, the best model is the Random Forest Classifier with Hyperparameter Tuning with accuracy of 0.98115
# For "Sistem Pembuangan BAB" data, the best model is the Random Forest Classifier with Hyperparameter Tuning with accuracy of 0.9979
# For "IPLT/IPAL" data, the best model is the Random Forest Classifier with Hyperparameter Tuning with accuracy of 0.945
# For "Saluran Drainase" data, the best model is the Logistic Regression with accuracy of 1.0

# Now, we need to save the models so that we can load them at a later time.
import pickle

# For "Tangki Septik" data, we save the Random Forest Classifier with Hyperparameter Tuning model
pickle.dump(best_rf_tangki_septik, open("saved_models/best_rf_tangki_septik.pkl", "wb"))
# For "Sistem Pembuangan BAB" data, we save the Random Forest Classifier with Hyperparameter Tuning model
pickle.dump(best_rf_sp_bab, open("saved_models/best_rf_sp_bab.pkl", "wb"))
# For "IPLT/IPAL" data, we save the Random Forest Classifier with Hyperparameter Tuning model
pickle.dump(best_rf_iplt, open("saved_models/best_rf_iplt.pkl", "wb"))
# For "Saluran Drainase" data, we save the Logistic Regression model
pickle.dump(model_drainase, open("saved_models/model_drainase.pkl", "wb"))

In [41]:
# This section of the code is used to load the saved models
tangki_septik_model = pickle.load(open("saved_models/best_rf_tangki_septik.pkl", "rb"))
sp_bab_model = pickle.load(open("saved_models/best_rf_sp_bab.pkl", "rb"))
iplt_model = pickle.load(open("saved_models/best_rf_iplt.pkl", "rb"))
drainase_model = pickle.load(open("saved_models/model_drainase.pkl", "rb"))

In [42]:
print(sni_drainase_train_x.columns)

Index(['Saluran Drainase_Hierarki Drainase_Primer',
       'Saluran Drainase_Hierarki Drainase_Sekunder',
       'Saluran Drainase_Hierarki Drainase_Tersier',
       'Saluran Drainase_Jenis Drainase_Terbuka',
       'Saluran Drainase_Jenis Drainase_Tertutup',
       'Saluran Drainase_Bentuk Penampang_Persegi',
       'Saluran Drainase_Bentuk Penampang_Segitiga',
       'Saluran Drainase_Bentuk Penampang_Setengah Lingkaran',
       'Saluran Drainase_Bentuk Penampang_Trapesiun',
       'Saluran Drainase_Perkerasan Tepi Drainase_Batuan',
       'Saluran Drainase_Perkerasan Tepi Drainase_Kerikil Halus',
       'Saluran Drainase_Perkerasan Tepi Drainase_Lempung Kepasiran',
       'Saluran Drainase_Perkerasan Tepi Drainase_Pasir Halus',
       'Saluran Drainase_Perkerasan Tepi Drainase_Tanah',
       'Saluran Drainase_Kondisi Drainase_Terawat',
       'Saluran Drainase_Kondisi Drainase_Tidak Terawat',
       'Saluran Drainase_Bau Tidak Sedap_Tidak',
       'Saluran Drainase_Bau Tidak Sedap_Y