In [None]:
import json

import numpy as np
import pandas as pd

import shap
from SALib.analyze import rbd_fast
from SALib.sample.morris import sample

from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler

from sqlalchemy import create_engine

In [82]:
cs = "mssql+pyodbc://smart-economy:ddsK^yiNTQ#^*2Gj@smart-economy-db.database.windows.net:1433/smart-economy?driver=ODBC+Driver+17+for+SQL+Server"
connection = create_engine(cs, connect_args={"check_same_thread": True})

df = pd.read_sql_query("SELECT [form] FROM [kuisioner]", connection)
df = pd.DataFrame([json.loads(row[1]) for row in df.itertuples()])

In [None]:
# df = pd.read_json("smarteconomy_dbo_kuisioner.json", orient="records")["form"].values
# df = pd.DataFrame([json.loads(row) for row in df])

In [83]:
df

Unnamed: 0,doc_id,x0_nama,x0_jenis_kelamin,x0_no_hp,x0_status_di_kelompok,x0_domisili,x0_kelas_bangunan_usaha,x1_4,x1_19,x1_29_1,...,y2_114_a,y2_114_b,y2_115_a,y2_115_b,y2_116_a,y2_116_b,y2_117_a,y2_117_b,y2_118_a,y2_118_b
0,,OJANG,L,085723556067,1,MEKARJAYA,PETANI,5,2,0,...,2,3,2,2,2,3,2,3,2,3
1,,IWAN,L,084758574854,1,CIANAGA,PETANI,1,1,0,...,3,4,2,4,2,4,3,4,2,4
2,,DIAH,P,085647382635,1,CIHAMERANG,PETANI,1,2,50000,...,3,4,3,4,3,4,3,4,3,4
3,,YULI,P,085647382635,1,MEKARJAYA,PEDAGANG,5,1,0,...,3,4,3,5,3,4,3,4,3,4
4,,RATIH,P,085647382635,1,MEKARJAYA,PETANI,1,1,0,...,3,3,3,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,376,UEN,P,085721895756,1,MEKARJAYA,PETANI,1,3,50000,...,4,3,4,3,4,3,4,3,4,3
365,377,IMAM JULIANSYAH,L,081221858565,1,MEKARJAYA,TANI,1,3,50000,...,2,3,2,3,2,3,2,3,1,3
366,378,SITI MARLINA,P,085860952788,1,MEKARJAYA,PEDAGANG,5,3,50000,...,4,5,3,4,2,3,2,3,4,5
367,379,HERI,L,085611123434,1,MEKARJAYA,PETANI,1,3,50000,...,2,3,2,3,1,3,1,3,2,3


In [102]:
def summarize_variables(data: pd.DataFrame, var_name: str):
    IGNORED_COLS = [
        "x1_29_1",
        "x1_29_2",
        "x1_29_3",
        "x1_29_4",
        "x1_30_1",
        "x1_30_2",
        "x1_30_3",
        "x1_30_4",
    ]

    # pick the column with the var_name prefix and not in ignored columns
    indexes = [
        i
        for i, col in enumerate(df.columns)
        if col.startswith(var_name)
        and not col.endswith("_alasan")
        and not col.endswith("_tahun")
        and not col.endswith("_info")
        and not col.endswith("_frekuensi")
        and not col.endswith("_lainnya")
        and col not in IGNORED_COLS
    ]

    # get all row with column indexes
    summarize_df = df.iloc[:, indexes].astype(float).fillna(0)

    # get the mean of each row
    return summarize_df.mean(axis=1)


In [103]:
def analyze_sensitivity(model, features, bounds):
    # create problem space
    problem = {
        "num_vars": len(features),
        "names": features,
        "bounds": bounds,
    }

    # create samples
    param_values = sample(problem, 1000, num_levels=4, optimal_trajectories=None, seed=42)
    
    # run prediction
    preds = model.predict(param_values)

    # run sensitivity analysis
    sens = rbd_fast.analyze(problem, param_values, preds, conf_level=0.95, seed=42)

    return pd.DataFrame(sens, index=sens["names"])

In [104]:
df_summarize = pd.DataFrame({
    "X1": summarize_variables(df, "x1"),
    "X2": summarize_variables(df, "x2"),
    "X3": summarize_variables(df, "x3"),
    "X4": summarize_variables(df, "x4"),
    "X5": summarize_variables(df, "x5"),
    "Y1": summarize_variables(df, "y1"),
    "Y2": summarize_variables(df, "y2"),
})

df_summarize.head()

Unnamed: 0,X1,X2,X3,X4,X5,Y1,Y2
0,1.75,3.0,2.047619,2.75,2.403846,2.777778,2.304348
1,1.0625,2.0,2.02381,2.4,2.615385,2.666667,2.728261
2,1.25,2.0,2.095238,2.45,2.826923,3.0,2.73913
3,1.4375,2.333333,1.547619,2.35,3.0,2.666667,2.858696
4,1.25,2.166667,1.595238,3.0,2.826923,2.666667,2.728261


In [105]:
X = df_summarize[["X1", "X2", "X3", "X4", "X5", "Y1"]].values
y = pd.cut(df_summarize["Y2"].values, 3, labels=[1, 2, 3])

In [153]:
fold_nums = []
sensitivities = []

kfold = KFold(n_splits=10, shuffle=True, random_state=42)
for i, (train, test) in enumerate(kfold.split(X, y)):
    print("Running analysis for fold {}".format(i))
    
    # get the data
    X_train = X[train]
    y_train = y[train]

    # build the model
    model = MLPClassifier(max_iter=200, activation="relu", hidden_layer_sizes=(20), alpha=0.001, solver="adam", random_state=42)

    # fit model
    model.fit(X_train, y_train)

    # create problem space
    features = ["X1", "X2", "X3", "X4", "X5", "Y1"]
    bounds = [
        [X_train[:, 0].min(), X_train[:, 0].max()],
        [X_train[:, 1].min(), X_train[:, 1].max()],
        [X_train[:, 2].min(), X_train[:, 2].max()],
        [X_train[:, 3].min(), X_train[:, 3].max()],
        [X_train[:, 4].min(), X_train[:, 4].max()],
        [X_train[:, 5].min(), X_train[:, 5].max()],
    ]

    # run sensitivity analysis
    sensi = analyze_sensitivity(model, features, bounds)["S1"].values

    # save run data
    fold_nums.append(i)
    sensitivities.append(sensi)

Running analysis for fold 0


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 1


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 2


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 3


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 4


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 5


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 6


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 7


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 8


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Running analysis for fold 9


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


In [154]:
scaler = MinMaxScaler()

sensi_ori = np.nan_to_num(np.array(sensitivities))
sensi_scaled = scaler.fit_transform(sensi_ori)
sensi_means = np.mean(sensi_scaled, axis=0)
sensi_means

array([0.40580542, 0.44120659, 0.32161526, 0.48572695, 0.57395125,
       0.34470136])

In [155]:
df_summ = pd.DataFrame(sensi_scaled, columns=["X1", "X2", "X3", "X4", "X5", "Y1"])
df_summ.loc["mean"] = df_summ.mean()
df_summ.style.background_gradient(cmap='Blues')

Unnamed: 0,X1,X2,X3,X4,X5,Y1
0,0.611613,0.382663,0.117733,0.902365,0.486707,0.163315
1,0.134442,0.887634,0.143084,0.327466,0.595782,0.167626
2,0.798211,0.198014,0.145436,1.0,0.577676,0.115993
3,0.237876,0.421198,1.0,0.0,0.857583,0.298063
4,0.425935,0.0,0.362843,0.070188,1.0,0.998549
5,0.216272,0.508631,0.379559,0.613645,0.630343,0.222392
6,0.0,0.508814,0.0,0.659471,0.0,0.0
7,0.534125,0.382872,0.435241,0.563253,0.698114,1.0
8,0.09958,1.0,0.070488,0.320312,0.426953,0.342646
9,1.0,0.12224,0.561769,0.400569,0.466355,0.13843
