In [1]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
from statsmodels.api import Logit, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Read the dataset from the CSV file in the working directory and show
# its first five rows as the cell output.
csv_path = 'LR2.csv'
df = pd.read_csv(csv_path)

df.head()

Unnamed: 0,REMISS,CELL,SMEAR,INFIL,LI,BLAST,TEMP
0,1,0.8,0.83,0.66,1.9,1.1,1.0
1,1,0.9,0.36,0.32,1.4,0.74,0.99
2,0,0.8,0.88,0.7,0.8,0.18,0.98
3,0,1.0,0.87,0.87,0.7,1.05,0.99
4,1,0.9,0.75,0.68,1.3,0.52,0.98


In [3]:
# check for multi-collinearity before considering variables for modelling
def calculate_vif(df, add_intercept=False):
    """Return the variance inflation factor (VIF) of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate predictor columns; all of them must be numeric.
    add_intercept : bool, default False
        When True, a column of ones is prepended to the design matrix before
        the VIFs are computed, and VIFs are reported for the original columns
        only. ``variance_inflation_factor`` works on the matrix exactly as
        given and does not add an intercept itself, so without this the
        values are "uncentered" VIFs and are usually larger. The default
        keeps the historical (no-intercept) behaviour of this helper.

    Returns
    -------
    pandas.DataFrame
        Columns ``variables`` and ``VIF``, sorted by ``VIF`` in descending
        order. The index holds each column's original position in ``df``.
    """
    exog = df.values
    offset = 0
    if add_intercept:
        # Prepend the constant so column i of df sits at position i + 1.
        exog = np.column_stack([np.ones(exog.shape[0]), exog])
        offset = 1

    vif = pd.DataFrame({
        "variables": df.columns,
        "VIF": [
            variance_inflation_factor(exog, position + offset)
            for position in range(df.shape[1])
        ],
    })
    # Return a sorted copy rather than sorting in place.
    return vif.sort_values("VIF", ascending=False)

In [4]:
# Predictors kept after repeatedly dropping the column with the highest VIF:
#   1. INFIL was dropped first (highest VIF)
#   2. TEMP was dropped next (highest VIF, 36.9)
#   3. SMEAR was dropped last (highest VIF, 12.86)
y_var = ['REMISS']
x_var = ['CELL', 'LI', 'BLAST']

y = df.loc[:, y_var]
X = df.loc[:, x_var]

# VIF table for the remaining predictors, rendered as the cell output.
calculate_vif(X)

Unnamed: 0,variables,VIF
1,LI,7.636598
0,CELL,5.825588
2,BLAST,4.446984


In [5]:
def fit_logistic_reg_with_intercept(X, Y):
    """Fit a logistic regression of ``Y`` on ``X`` plus an intercept term.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictor columns; a constant column is added before fitting.
    Y : array-like
        Response passed to ``sm.Logit`` as the dependent variable.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(...).fit()``.
    """
    design = sm.add_constant(X)  # adding a constant
    # Use the Y argument; the previous body read the global ``y`` instead,
    # silently ignoring whatever response the caller passed in.
    return sm.Logit(Y, design).fit()

In [6]:
# Predictors dropped because their coefficients were not significant:
#   1. BLAST
#   2. CELL
# which leaves LI as the only predictor.
y_var = ['REMISS']
x_var = ['LI']

y = df.loc[:, y_var]
X = df.loc[:, x_var]

# Fit the intercept + LI logit model and print its summary table.
log_model = fit_logistic_reg_with_intercept(X, y)
model_summary = log_model.summary()
print(model_summary)

Optimization terminated successfully.
         Current function value: 0.482833
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 REMISS   No. Observations:                   27
Model:                          Logit   Df Residuals:                       25
Method:                           MLE   Df Model:                            1
Date:                Mon, 06 Jun 2022   Pseudo R-squ.:                  0.2414
Time:                        19:22:55   Log-Likelihood:                -13.036
converged:                       True   LL-Null:                       -17.186
Covariance Type:            nonrobust   LLR p-value:                  0.003967
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.7771      1.379     -2.740      0.006      -6.479      -1.075
LI             2.8973      1.

In [7]:
# Odds of REMISS = 1 at LI = 0.9, using the coefficients printed in the
# model summary (rounded there to four decimals).
intercept_coef = -3.7771
li_coef = 2.8973
li_value = 0.9

# The linear predictor is the log-odds; exponentiating it gives the odds.
log_odds = intercept_coef + li_coef * li_value
odds = np.exp(log_odds)
odds

0.3105128480132796

# Answers

Below are the answers:
  - The final regression equation is:
   **log(prob / (1 - prob)) = -3.7771 + 2.8973 * LI**, where prob = P(REMISS = 1)
      - Below variables are removed after VIF check
          - **INFIL, TEMP** and **SMEAR**
      - Below variables are removed after checking for significance based on p-value
          -  **BLAST** and **CELL**
      - Finally only **LI** variable came as significant
  - For **LI=0.9, Odds = exp(-3.7771 + 2.8973 * 0.9) = exp(-1.16953) = 0.31**