In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

## Data

In [2]:
df = pd.read_csv('data/cdv.csv')
print(df.columns.tolist())

['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'Y']


In [3]:
df.head(2)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
0,1.1,0,1,28.44,134,3.54,4.32,55.7,0,0,1
1,1.3,1,1,34.13,126,5.87,3.95,53.1,0,1,1


In [4]:
df.describe()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,3.666667,0.380952,0.380952,31.935238,133.761905,4.633333,4.604286,67.004762,0.47619,0.666667,1.809524
std,1.470147,0.497613,0.497613,7.573592,27.93368,2.248318,0.531522,8.52065,0.511766,0.483046,0.749603
min,1.1,0.0,0.0,20.78,96.0,1.38,3.9,52.5,0.0,0.0,1.0
25%,2.7,0.0,0.0,25.22,118.0,2.69,4.18,59.4,0.0,0.0,1.0
50%,3.6,0.0,0.0,30.88,128.0,4.4,4.52,69.1,0.0,1.0,2.0
75%,4.6,1.0,1.0,36.83,134.0,5.87,4.86,73.1,1.0,1.0,2.0
max,6.3,1.0,1.0,46.76,200.0,9.93,5.63,77.2,1.0,1.0,3.0


In [23]:
X = df[['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']].values
X = np.hstack((np.ones((X.shape[0], 1)), X))
y = df[['Y']].values.flatten()
y = y - 1 

n = len(y)
p = df.shape[1]
k = p - 1

| Variabel | Keterangan                              | Skala        |
|----------|----------------------------------------|-------------|
| X1       | Waktu tahan hidup pasien                | Rasio       |
| X2       | Jenis kelamin pasien                    | Nominal     |
| X3       | Intensitas merokok                      | Nominal     |
| X4       | Indeks massa tubuh                      | Rasio       |
| X5       | Tekanan darah sistolik (mmHg)           | Rasio       |
| X6       | Logaritme rasio albumin dan kreatinin urin | Rasio    |
| X7       | Logaritme trigliserida                  | Rasio       |
| X8       | Umur pasien                             | Rasio       |
| X9       | Status hipertensi                        | Nominal     |
| X10      | Status diabetes                          | Nominal     |
| Y        | Jenis penyakit kardiovaskular           | Nominal     |
|          | 1: stroke                               |             |
|          | 2: coronary heart disease               |             |
|          | 3: angina                               |             |

## Regresi Logistik Multinomial

### Uji Multikolinearitas

In [24]:
vif_data = pd.DataFrame()
vif_data["feature"] = ['Intercept'] + ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
vif_data["VIF"] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
print(vif_data)

      feature         VIF
0   Intercept  310.528334
1          X1    1.659555
2          X2    1.321321
3          X3    1.370177
4          X4    2.297288
5          X5    2.571857
6          X6    2.002548
7          X7    1.737764
8          X8    2.139399
9          X9    2.152826
10        X10    1.870077


### Bangun Model

In [25]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = sm.add_constant(X_scaled)

# model = sm.MNLogit(y, X_scaled)
model = LogisticRegression(solver='lbfgs', max_iter=500,  C=1.0)
result = model.fit(X_scaled, y)
print(result.coef_)
pred_proba = result.predict_proba(X_scaled)


[[ 2.14969272e-03  0.00000000e+00 -7.07939414e-01 -2.78839136e-01
   9.90986817e-02  1.87651575e-01  7.59938328e-01  1.28414728e-01
  -3.25041230e-01 -2.52099609e-01 -7.42270950e-01 -4.05493203e-01]
 [-1.69434051e-03  0.00000000e+00  2.74346024e-01 -9.92046354e-02
   3.97755079e-01 -1.09356306e-02 -8.49329988e-01 -5.70827290e-01
   2.73331445e-01  3.63121141e-01 -1.92541220e-01  9.18482339e-01]
 [-4.55352211e-04  0.00000000e+00  4.33593390e-01  3.78043772e-01
  -4.96853760e-01 -1.76715944e-01  8.93916607e-02  4.42412562e-01
   5.17097855e-02 -1.11021532e-01  9.34812171e-01 -5.12989137e-01]]


In [26]:
# model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=2000)
# model.fit(X, y)
# print("Koefisien model:", model.coef_)

### Uji Simultan

In [27]:
# print("Log-Likelihood:", result.llf)
log_likelihood = 0
for i in range(len(y)):
    true_class = y[i]  # Kelas sebenarnya untuk data ke-i
    log_likelihood += np.log(pred_proba[i, true_class])  # Ambil probabilitas untuk kelas yang benar

print("Log-Likelihood (LLF):", log_likelihood)

Log-Likelihood (LLF): -6.152193919680184


### Uji Parsial

In [28]:
from sklearn.utils import resample

hessian = np.dot(X_scaled.T, X_scaled)
X_resampled, y_resampled = resample(X_scaled, y, n_samples=500, random_state=42)

model_resampled = LogisticRegression(solver='lbfgs', max_iter=500, multi_class='multinomial',  C=1.0)
model_resampled.fit(X_resampled, y_resampled)
coefficients_resampled = model_resampled.coef_

coef_first = coefficients_resampled[0, 0]
var_first = np.var(coefficients_resampled[0, 0])

if var_first == 0:
    print("Varians koefisien pertama adalah nol. Wald stat tidak dapat dihitung.")
else:
    wald_stat = coef_first**2 / var_first
    print("Wald Stat untuk koefisien pertama:", wald_stat)

Varians koefisien pertama adalah nol. Wald stat tidak dapat dihitung.




### Uji Kesesuaian Model