In [None]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
from statsmodels.api import Logit, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Read the dataset from the CSV file in the working directory.
df = pd.read_csv('LR2.csv')

# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# check for multi-collinearity before considering variables for modelling
def calculate_vif(df):
    """Return the variance inflation factor of every column, largest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate predictors. Column ``i`` is scored with
        ``variance_inflation_factor(df.values, i)``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variables`` (the column name) and ``VIF``, sorted by
        ``VIF`` in descending order.

    Notes
    -----
    The matrix is passed to statsmodels as-is; no intercept column is added
    by this function.
    """
    design = df.values
    scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(design.shape[1])
    ]
    table = pd.DataFrame({"variables": df.columns, "VIF": scores})
    return table.sort_values("VIF", ascending=False)

In [None]:
# VIF-based elimination: in each round the predictor with the highest VIF was dropped.
#   round 1 -> INFIL
#   round 2 -> TEMP  (VIF 36.9)
#   round 3 -> SMEAR (VIF 12.86)
# Predictors that remain after those rounds:
x_var = ['CELL', 'LI', 'BLAST']
y_var = ['REMISS']

X, y = df[x_var], df[y_var]

# Display the VIF table for the remaining predictors.
calculate_vif(X)

In [None]:
def fit_logistic_reg_with_intercept(X, Y):
    """Fit a logistic regression of ``Y`` on ``X`` plus an intercept term.

    Parameters
    ----------
    X : predictors; passed through ``sm.add_constant`` before fitting.
    Y : response; passed to ``sm.Logit`` as the dependent variable.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(...).fit()``.
    """
    X_with_const = sm.add_constant(X)  # add the intercept column
    # Fit against the Y argument. The earlier version referenced the
    # notebook-level name `y` here, so the parameter was silently ignored.
    log_reg_model = sm.Logit(Y, X_with_const).fit()
    return log_reg_model

In [None]:
# Significance-based elimination (one predictor dropped per refit):
#   round 1 -> BLAST, coefficient not significant
#   round 2 -> CELL, coefficient not significant
# Only LI is left as a predictor.
x_var = ['LI']
y_var = ['REMISS']

X, y = df[x_var], df[y_var]

log_model = fit_logistic_reg_with_intercept(X, y)
print(log_model.summary())

In [None]:
# Odds at LI = 0.9 from the single-predictor logit model.
# The coefficients are hand-copied, rounded values (presumably from the
# model summary printed earlier - re-check them if the model is refit).
INTERCEPT = -3.7771  # constant term
LI_COEF = 2.8973     # coefficient on LI
LI_VALUE = 0.9       # LI level at which the odds are evaluated

# Linear predictor on the log-odds scale, computed once and reused instead
# of repeating the formula in a separate expression whose value is discarded.
log_odds = INTERCEPT + LI_COEF * LI_VALUE

# Odds = exp(log-odds); the last expression is what the cell displays.
odds = np.exp(log_odds)
odds

# Answers

Below are the answers:
  - The final regression equation (on the log-odds scale) is:<br>
    **log(p / (1 - p)) = -3.7771 + 2.8973 * LI**, where p = P(REMISS = 1)
      - The following variables were removed after the VIF check
          - **INFIL, TEMP** and **SMEAR**
      - The following variables were removed after checking for significance based on p-value
          -  **BLAST** and **CELL**
      - Finally, only the **LI** variable remained significant
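  - For **LI = 0.9, Odds = exp(-3.7771 + 2.8973 * 0.9) = exp(-1.16953) = 0.31** (this is the odds of REMISS = 1, not an odds ratio; the corresponding probability is 0.31 / 1.31 ≈ 0.24)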