In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
import statsmodels.api as sm
from scipy import stats
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tools import eval_measures
import itertools
import warnings

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('data/cdv.csv')
print(df.columns.tolist())

['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'Y']


In [3]:
# df.describe()

In [4]:
X = df.drop(columns=['Y'])
df['Y'] = df['Y'].astype('category')
y_cat = df['Y'].cat.reorder_categories([2,1,3], ordered=True)
print(y_cat.value_counts())

n = len(df['Y'])
p = df.shape[1]
k = p - 1

Y
2    9
1    8
3    4
Name: count, dtype: int64


#### Cek multikolinearitas

In [5]:
X_const = add_constant(X)
vif_df = pd.DataFrame()
vif_df["Variable"] = X_const.columns
vif_df["VIF"] = [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])]
print(vif_df)

   Variable         VIF
0     const  310.528334
1        X1    1.659555
2        X2    1.321321
3        X3    1.370177
4        X4    2.297288
5        X5    2.571857
6        X6    2.002548
7        X7    1.737764
8        X8    2.139399
9        X9    2.152826
10      X10    1.870077


#### Standarisasi Variabel Rasio

In [6]:
rasio_vars = ['X1', 'X4', 'X5', 'X6', 'X7', 'X8']
ctg_vars = ['X2','X3','X9','X10']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[rasio_vars])
X_scaled_df = pd.DataFrame(X_scaled, columns=rasio_vars, index=X.index)

Isi X: X_rasio scaled & kombinasi X_ctg

### V1

In [7]:
category_combinations = [
    ['X10'],
    ['X2'],
    ['X3'],
    ['X9'],
    ['X10', 'X2'],
    ['X2', 'X3'],
    ['X10', 'X3'],
    ['X9','X10'],
    ['X9','X3'],
    ['X9','X2'],
    ['X10', 'X2', 'X3'],
    ['X10', 'X2', 'X3'],
]

In [8]:
results_list = []
for combo in category_combinations:
    X_combo = pd.concat([X_scaled_df, X[combo]], axis=1)

    try:
        X_const = sm.add_constant(X_combo)
        model = sm.MNLogit(y_cat, X_const)
        result = model.fit()
        ll_full = result.llf
        ll_null = result.llnull
        pseudo_r2 = 1 - (ll_full / ll_null)
        
        if not pd.isna(pseudo_r2):
            summary_text = result.summary().as_text()
            results_list.append({
                'Kombinasi': combo,
                "Predictors Model": list(X_combo.columns),
                'Jumlah Variabel': len(X_combo.columns),
                'Pseudo R² (McFadden)': pseudo_r2,
                'LogLik': ll_full,
                'AIC': result.aic,
                'BIC': result.bic,
                "Summary": summary_text
            })

    except Exception as e:
        print(f"Model gagal untuk kombinasi {combo}: {e}")
        continue

Optimization terminated successfully.
         Current function value: nan
         Iterations 13
Optimization terminated successfully.
         Current function value: 0.561501
         Iterations 9
Optimization terminated successfully.
         Current function value: nan
         Iterations 17
Optimization terminated successfully.
         Current function value: nan
         Iterations 17
         Current function value: 0.311299
         Iterations: 35
Optimization terminated successfully.
         Current function value: nan
         Iterations 22
Optimization terminated successfully.
         Current function value: nan
         Iterations 22


In [9]:
results_df = pd.DataFrame(results_list)
results_df = results_df.drop(columns=["Summary"])
results_df.sort_values(by="Pseudo R² (McFadden)", ascending=False, inplace=True)
results_df.reset_index(drop=True, inplace=True)

print("\n===== HASIL REGRESI MULTINOMIAL =====\n")
results_df


===== HASIL REGRESI MULTINOMIAL =====



Unnamed: 0,Kombinasi,Predictors Model,Jumlah Variabel,Pseudo R² (McFadden),LogLik,AIC,BIC
0,"[X2, X3]","[X1, X4, X5, X6, X7, X8, X2, X3]",8,0.70257,-6.537278,49.074555,67.875959
1,[X2],"[X1, X4, X5, X6, X7, X8, X2]",7,0.463516,-11.791512,55.583023,72.295382


In [10]:
print(results_list[0]["Summary"])
print("\033[34m" + "="*78 + "\033[0m")
print(results_list[1]["Summary"])

                          MNLogit Regression Results                          
Dep. Variable:                      Y   No. Observations:                   21
Model:                        MNLogit   Df Residuals:                        5
Method:                           MLE   Df Model:                           14
Date:                Mon, 08 Dec 2025   Pseudo R-squ.:                  0.4635
Time:                        19:02:16   Log-Likelihood:                -11.792
converged:                       True   LL-Null:                       -21.979
Covariance Type:            nonrobust   LLR p-value:                    0.1187
       Y=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.1976      0.986     -0.200      0.841      -2.129       1.734
X1            -1.8652      1.154     -1.616      0.106      -4.128       0.397
X4             0.4107      0.941      0.436      0.6

Tidak ada variabel yang signifikan, sehingga dilakukan pengombinasian pada variabel rasio dengan variabel kategorik X2 dan X3 saja

### V2

In [11]:
def build_mnlogit_model(X, y):
    X_model = sm.add_constant(X, has_constant='add')
    model = sm.MNLogit(y, X_model)
    result = model.fit(method='newton', disp=False)
    return result

ctg_vars = ['X2', 'X3']
num_rasio_to_use = 1

results = []
for num_rasio in range(1, num_rasio_to_use + 1):
    for rasio_comb in itertools.combinations(rasio_vars, num_rasio):
        X_rasio_selected = X_scaled_df[list(rasio_comb)]

        for k in [1, 2]:
            for cat_comb in itertools.combinations(ctg_vars, k):
                X_cat_selected = df[list(cat_comb)]
                X_all = pd.concat([X_rasio_selected, X_cat_selected], axis=1)

                try:
                    result = build_mnlogit_model(X_all, y_cat)
                    
                    # Pseudo R² McFadden
                    llf = result.llf
                    llnull = result.llnull
                    pseudo_r2 = 1 - (llf / llnull)

                    if pd.notna(pseudo_r2):
                        # Signifikansi α=0.1
                        summary_table = result.summary2().tables[1]
                        pvals = summary_table["P>|z|"]
                        significance_bool = (pvals < 0.1)
                        if "const" in significance_bool.index:
                            jumlah_signif = significance_bool.drop("const").sum()
                        else:
                            jumlah_signif = significance_bool.sum()

                        results.append({
                            "kombinasi": list(rasio_comb) + list(cat_comb),
                            "predictors_in_model": list(sm.add_constant(X_all, has_constant='add').columns),
                            "n_vars": X_all.shape[1] + 1,  
                            "Pseudo_R2": pseudo_r2,
                            "AIC": result.aic,
                            "BIC": result.bic,
                            "n_significant": jumlah_signif,
                            "significance_table": significance_bool,
                            "summary": summary_table.to_string()
                        })

                except Exception as e:
                    continue

In [12]:
if results:
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by=["Pseudo_R2", "n_significant"], ascending=[False, False])
    # results_df = results_df.sort_values(by="n_significant", ascending=False)
    results_df = results_df.reset_index(drop=True)
    
    results_df_no_summary = results_df.drop(columns=["summary", "significance_table"])
    print("\n=== HASIL MODEL VALID ===\n")
    print(results_df_no_summary)
else:
    print("Tidak ada model berhasil dijalankan.")

Tidak ada model berhasil dijalankan.
