In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import numpy as np

In [4]:
n_samples = 5000
data = {
        'provinsi_id': np.random.randint(1, 35, n_samples),  # 34 provinsi
        'kabupaten_id': np.random.randint(1, 515, n_samples),  # ~514 kab/kota
        'kepadatan_penduduk': np.random.lognormal(5, 1.5, n_samples),
        'tingkat_pendidikan': np.random.normal(8.5, 2.5, n_samples),  # rata2 tahun sekolah
        'tingkat_pengangguran': np.random.gamma(2, 3, n_samples),  # %
        'akses_air_bersih': np.random.beta(7, 2, n_samples) * 100,  # %
        'akses_listrik': np.random.beta(8, 1.5, n_samples) * 100,  # %
        'fasilitas_kesehatan': np.random.poisson(15, n_samples),  # per 10k penduduk
        'jalan_aspal': np.random.beta(5, 3, n_samples) * 100,  # %
        'luas_sawah': np.random.exponential(5000, n_samples),  # hektar
        'pendapatan_perkapita': np.random.lognormal(14.5, 0.8, n_samples),  # rupiah/bulan
        'jenis_wilayah': np.random.choice(['urban', 'rural'], n_samples, p=[0.6, 0.4])
    }
    

In [5]:
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,provinsi_id,kabupaten_id,kepadatan_penduduk,tingkat_pendidikan,tingkat_pengangguran,akses_air_bersih,akses_listrik,fasilitas_kesehatan,jalan_aspal,luas_sawah,pendapatan_perkapita,jenis_wilayah
0,27,380,275.993366,6.195379,12.731975,58.209852,68.085994,11,67.566386,18600.020621,1.825657e+06,rural
1,16,84,113.498011,11.855878,2.730515,70.076751,61.047407,18,67.486958,2723.657763,1.519617e+06,rural
2,27,411,306.020230,8.628373,5.077963,93.178346,83.270381,20,58.413336,2758.631273,3.729450e+06,rural
3,7,221,41.560935,7.674385,14.370874,80.675507,93.424962,18,74.057577,9732.912606,1.542648e+06,urban
4,16,78,103.633391,9.445138,7.324625,93.532929,77.530889,14,55.556559,6065.250459,2.729549e+06,rural
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,32,116,239.783711,11.221832,2.313656,82.490191,99.067337,16,78.154494,2051.844637,2.083741e+06,rural
4996,16,156,644.652838,10.447609,9.021351,83.821680,87.806195,18,69.114678,4923.503051,1.982096e+06,urban
4997,19,109,98.635075,10.793040,6.688788,89.109776,87.243733,8,54.806571,4185.865302,3.810731e+06,urban
4998,14,330,117.584664,10.882580,12.044552,54.383087,76.643606,15,66.339085,13840.997466,5.837477e+05,rural


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   provinsi_id           5000 non-null   int32  
 1   kabupaten_id          5000 non-null   int32  
 2   kepadatan_penduduk    5000 non-null   float64
 3   tingkat_pendidikan    5000 non-null   float64
 4   tingkat_pengangguran  5000 non-null   float64
 5   akses_air_bersih      5000 non-null   float64
 6   akses_listrik         5000 non-null   float64
 7   fasilitas_kesehatan   5000 non-null   int32  
 8   jalan_aspal           5000 non-null   float64
 9   luas_sawah            5000 non-null   float64
 10  pendapatan_perkapita  5000 non-null   float64
 11  jenis_wilayah         5000 non-null   object 
dtypes: float64(8), int32(3), object(1)
memory usage: 410.3+ KB


In [10]:
pendapatan = df['pendapatan_perkapita']<1800000
air = df['akses_air_bersih']<80
listrik = df['akses_listrik'] <85
pendidikan = df['tingkat_pendidikan']<8

In [11]:
df['kemiskinan'] = (pendapatan.astype(int) & (air.astype(int) + listrik.astype(int)+pendidikan.astype(int)>=2)).astype(int)

In [12]:
df

Unnamed: 0,provinsi_id,kabupaten_id,kepadatan_penduduk,tingkat_pendidikan,tingkat_pengangguran,akses_air_bersih,akses_listrik,fasilitas_kesehatan,jalan_aspal,luas_sawah,pendapatan_perkapita,jenis_wilayah,kemiskinan
0,27,380,275.993366,6.195379,12.731975,58.209852,68.085994,11,67.566386,18600.020621,1.825657e+06,rural,0
1,16,84,113.498011,11.855878,2.730515,70.076751,61.047407,18,67.486958,2723.657763,1.519617e+06,rural,1
2,27,411,306.020230,8.628373,5.077963,93.178346,83.270381,20,58.413336,2758.631273,3.729450e+06,rural,0
3,7,221,41.560935,7.674385,14.370874,80.675507,93.424962,18,74.057577,9732.912606,1.542648e+06,urban,0
4,16,78,103.633391,9.445138,7.324625,93.532929,77.530889,14,55.556559,6065.250459,2.729549e+06,rural,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,32,116,239.783711,11.221832,2.313656,82.490191,99.067337,16,78.154494,2051.844637,2.083741e+06,rural,0
4996,16,156,644.652838,10.447609,9.021351,83.821680,87.806195,18,69.114678,4923.503051,1.982096e+06,urban,0
4997,19,109,98.635075,10.793040,6.688788,89.109776,87.243733,8,54.806571,4185.865302,3.810731e+06,urban,0
4998,14,330,117.584664,10.882580,12.044552,54.383087,76.643606,15,66.339085,13840.997466,5.837477e+05,rural,1


In [43]:
numeric_cols = ['kepadatan_penduduk','tingkat_pendidikan',
                'tingkat_pengangguran','akses_air_bersih','akses_listrik',
                'fasilitas_kesehatan','jalan_aspal',
                'luas_sawah','pendapatan_perkapita']
cat_cols = ['jenis_wilayah']

In [44]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(), cat_cols)])

In [45]:
X = df[numeric_cols + cat_cols]

In [46]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X = df[numeric_cols + cat_cols]
X_processed = pipeline.fit_transform(X)

In [47]:
y = df['kemiskinan']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [49]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
model = keras.Sequential([
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(8, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid")
])

In [50]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

In [51]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5593 - loss: 0.7049 - val_accuracy: 0.8180 - val_loss: 0.4719
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8147 - loss: 0.4470 - val_accuracy: 0.8210 - val_loss: 0.3700
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8276 - loss: 0.3572 - val_accuracy: 0.8470 - val_loss: 0.3164
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8564 - loss: 0.3007 - val_accuracy: 0.8710 - val_loss: 0.2831
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8748 - loss: 0.2809 - val_accuracy: 0.8810 - val_loss: 0.2565
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8971 - loss: 0.2437 - val_accuracy: 0.8920 - val_loss: 0.2351
Epoch 7/50
[1m125/125[0m 

In [52]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("Confusion Matrix:\n", cm)

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Accuracy: 0.934
Precision: 0.8372093023255814
Recall: 0.7912087912087912
Confusion Matrix:
 [[790  28]
 [ 38 144]]


In [54]:
import joblib
joblib.dump(preprocessor, "preprocessor.pkl")
joblib.dump(preprocessor, "BPS.pkl")

['BPS.pkl']