In [1]:
import pandas as pd 
import numpy as np
import statsmodels.api as sm
from statsmodels.api import Logit, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Read the raw data set from the working directory.
df = pd.read_csv("LR3.csv")

# Preview the first five rows (five is the default for head()).
df.head()

Unnamed: 0,UserID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# check for multi-collinearity before considering variables for modelling
def calculate_vif(df):
    """Return the variance inflation factor of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are the candidate explanatory variables.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variables`` (the column names of ``df``) and ``VIF``,
        with rows ordered from the largest VIF to the smallest.

    NOTE(review): the values are passed to ``variance_inflation_factor``
    exactly as given; no intercept column is appended here. Confirm that
    this is the intended design matrix.
    """
    design = df.values
    scores = [
        variance_inflation_factor(design, col_idx)
        for col_idx in range(df.shape[1])
    ]
    report = pd.DataFrame({"variables": df.columns, "VIF": scores})
    return report.sort_values("VIF", ascending=False)

In [4]:
def fit_logistic_reg_with_intercept(X, Y):
    """Fit a logistic regression of ``Y`` on ``X`` with an added constant.

    Parameters
    ----------
    X : explanatory variables, passed to ``sm.add_constant`` before fitting.
    Y : response variable, passed to ``sm.Logit`` as the dependent variable.

    Returns
    -------
    The object returned by ``sm.Logit(Y, X_with_const).fit()``.
    """
    X = sm.add_constant(X)  # adding a constant
    # Use the Y argument; the previous code read a global named `y`, which
    # ignored the caller's response and failed when no such global existed.
    log_reg_model = sm.Logit(Y, X).fit()
    return log_reg_model

In [5]:
# Compute the VIF for the numerical variables only.
x_var = ['Age', 'EstimatedSalary']
X = df.loc[:, x_var]
calculate_vif(X)

Unnamed: 0,variables,VIF
0,Age,4.575819
1,EstimatedSalary,4.575819


In [6]:
# Encode the categorical columns as dummy variables (first level dropped),
# after removing the UserID column.
cat_vars = ["Gender"]
df_without_id = df.drop(columns=['UserID'])
df_dummy = pd.get_dummies(df_without_id, columns=cat_vars, drop_first=True)

In [7]:
# Gender_Male is left out of the predictors because it was not significant
# based on its p-value (0.274).
y_var = ['Purchased']
x_vars = ['Age', 'EstimatedSalary']
y = df_dummy.loc[:, y_var]
X = df_dummy.loc[:, x_vars]
logistic_model = fit_logistic_reg_with_intercept(X, y)
print(logistic_model.summary())

Optimization terminated successfully.
         Current function value: 0.346314
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:              Purchased   No. Observations:                  400
Model:                          Logit   Df Residuals:                      397
Method:                           MLE   Df Model:                            2
Date:                Tue, 07 Jun 2022   Pseudo R-squ.:                  0.4688
Time:                        20:52:54   Log-Likelihood:                -138.53
converged:                       True   LL-Null:                       -260.79
Covariance Type:            nonrobust   LLR p-value:                 7.995e-54
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             -12.4340      1.300     -9.566      0.000     -14.982      -9.886
Age           

# Wald's Test for Age

In [8]:
# Run the per-term Wald tests on the fitted model and show the row for Age.
wald_test = logistic_model.wald_test_terms(scalar=False)
age_wald_row = wald_test.table.loc['Age']
print(age_wald_row)

statistic         [[81.23137211446682]]
pvalue           2.0077923075850262e-19
df_constraint                         1
Name: Age, dtype: object


# Answers

  - The logistic regression equation is
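       - logit(P(Purchased)) = ln(p / (1 - p)) = -12.4340 + 0.2335 * Age + 3.59e-05 * EstimatedSalary, i.e. P(Purchased) = 1 / (1 + exp(-(-12.4340 + 0.2335 * Age + 3.59e-05 * EstimatedSalary)))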
       - Both coefficients are positive, so the probability of purchase is expected to increase as Age and EstimatedSalary increase
  - Wald's test for Age: statistic 81.23, with a p-value of 2.01e-19