In [19]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy as pt
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Load the birth-weight data and keep only the outcome (LOW) together with
# the predictor columns used in the rest of the notebook.
df = pd.read_csv("BIRTH_WEIGHT.csv").loc[
    :, ['LOW', 'AGE', 'LWT', 'RACE', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV']
]

categorical_vars = ['RACE', 'SMOKE', 'PTL', 'HT', 'UI', 'FTV']

# Print one LOW-by-predictor contingency table per listed variable.
for var in categorical_vars:
    print(f'\nLOW vs {var}')
    contingency = pd.crosstab(df['LOW'], df[var])
    print(contingency)


LOW vs RACE
RACE   1   2   3
LOW             
0     73  15  42
1     23  11  25

LOW vs SMOKE
SMOKE   0   1
LOW          
0      86  44
1      29  30

LOW vs PTL
PTL    0   1  2  3
LOW               
0    118   8  3  1
1     41  16  2  0

LOW vs HT
HT     0  1
LOW        
0    125  5
1     52  7

LOW vs UI
UI     0   1
LOW         
0    117  13
1     45  14

LOW vs FTV
FTV   0   1   2  3  4  6
LOW                     
0    64  36  23  3  3  1
1    36  11   7  4  1  0


In [21]:
# Build the response vector and design matrix from the formula; RACE is the
# only term wrapped in C(), so it is the only one explicitly dummy-coded.
y, X = pt.dmatrices(
    'LOW ~ AGE + LWT + C(RACE) + SMOKE + PTL + HT + UI + FTV',
    data=df,
    return_type='dataframe',
)

# Fit the logistic regression by maximum likelihood and show the summary.
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.531508
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                    LOW   No. Observations:                  189
Model:                          Logit   Df Residuals:                      179
Method:                           MLE   Df Model:                            9
Date:                Tue, 11 Nov 2025   Pseudo R-squ.:                  0.1439
Time:                        18:44:07   Log-Likelihood:                -100.45
converged:                       True   LL-Null:                       -117.34
Covariance Type:            nonrobust   LLR p-value:                 9.832e-05
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.4428      1.198      0.370      0.712      -1.906       2.792
C(RACE)[T.2]     1.2935

In [32]:
# Store the model's predicted probabilities for the fitted design matrix
# next to the observed outcome, then preview the first rows.
df['P_LOW'] = result.predict(X)
df.head()[['LOW', 'P_LOW']]

Unnamed: 0,LOW,P_LOW
0,0,0.171549
1,0,0.140005
2,0,0.322097
3,0,0.518443
4,0,0.513538


In [36]:
def classification_table(y_true, y_prob, cutoff):
    """Summarise binary-classification performance at one probability cutoff.

    Parameters
    ----------
    y_true : array-like
        Observed labels. Entries equal to 1 are counted as positives and
        entries equal to 0 as negatives; any other value contributes to
        none of the four confusion-matrix cells.
    y_prob : array-like
        Predicted probabilities, one per entry of ``y_true``.
    cutoff : float
        Observations with ``y_prob >= cutoff`` are classified as positive.

    Returns
    -------
    dict
        Keys ``'Cutoff'``, ``'Sensitivity'``, ``'Specificity'`` and
        ``'Misclassification Rate'``; the three rates are rounded to three
        decimals. A rate whose denominator is zero is reported as 0.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` do not contain the same number of
        observations.
    """
    # Flatten to plain NumPy arrays so observations are matched by position.
    # Pandas objects would otherwise be aligned on their index labels, which
    # silently miscounts when the indexes differ, and plain lists would not
    # support the element-wise comparison at all.
    actual = np.asarray(y_true).ravel()
    probabilities = np.asarray(y_prob, dtype=float).ravel()
    if actual.shape != probabilities.shape:
        raise ValueError(
            "y_true and y_prob must have the same number of observations "
            f"(got {actual.size} and {probabilities.size})"
        )

    y_pred = (probabilities >= cutoff).astype(int)

    tp = int(np.count_nonzero((actual == 1) & (y_pred == 1)))
    tn = int(np.count_nonzero((actual == 0) & (y_pred == 0)))
    fp = int(np.count_nonzero((actual == 0) & (y_pred == 1)))
    fn = int(np.count_nonzero((actual == 1) & (y_pred == 0)))

    total = tp + tn + fp + fn

    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    # Guarded like the two rates above so empty input does not divide by zero.
    misclassification = (fp + fn) / total if total > 0 else 0

    return {
        'Cutoff' : cutoff,
        'Sensitivity' : round(sensitivity, 3),
        'Specificity': round(specificity, 3),
        'Misclassification Rate': round(misclassification, 3)
    }

y_true = df['LOW']

# Evaluate the fitted probabilities at each candidate cutoff and collect
# one summary row per cutoff.
results = []
for cutoff in (0.3, 0.4, 0.55):
    results.append(classification_table(y_true, df['P_LOW'], cutoff))

pd.DataFrame(results)

Unnamed: 0,Cutoff,Sensitivity,Specificity,Misclassification Rate
0,0.3,0.678,0.669,0.328
1,0.4,0.475,0.823,0.286
2,0.55,0.288,0.923,0.275
