In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Probit, Logit
from scipy import stats
from gme import GMELogit

In [2]:
# Load the smoking dataset from the Excel workbook, using the sheet's first
# column as the row index.
# Note: the relative path depends on where this notebook lives — change it to
# match your system.
data = pd.read_excel(
    "../../data/smoking.xls",
    index_col=0,
)

In [3]:
# Preview the first rows (pandas shows 5 by default) to sanity-check that the
# columns were read as expected.
data.head()

Unnamed: 0,smoker,smkban,age,hsdrop,hsgrad,colsome,colgrad,black,hispanic,female
1,1,1,41,0,1,0,0,0,0,1
2,1,1,44,0,0,1,0,0,0,1
3,0,0,19,0,0,1,0,0,0,1
4,1,0,29,0,1,0,0,0,0,1
5,0,1,28,0,0,1,0,0,0,1


In [4]:
# Squared-age term. It is added to `data` here but is NOT part of the
# regressor list defined below.
data['age_sq'] = data['age'].pow(2)

# Intercept column: a constant 1 for every observation.
data['const'] = 1

# Names of the regressors, intercept first.
X = [
    'const', 'smkban', 'female', 'age',
    'hsdrop', 'hsgrad', 'colsome', 'colgrad',
    'black', 'hispanic',
]

# Condition number of the design matrix built from those columns
# (np.linalg.cond with its default norm); large values signal near-collinear
# regressors.
X_matrix = data.loc[:, X].to_numpy()
cond_num = np.linalg.cond(X_matrix)
print(f"Condition number: {cond_num}")

Condition number: 311.56108397256776


In [5]:
# Outcome variable: the `smoker` column.
y = data.loc[:, 'smoker']

# Design matrix: the intercept column followed by the nine regressors, in the
# order the fitted model will see them.
X = data.loc[
    :,
    [
        'const', 'smkban', 'female', 'age',
        'hsdrop', 'hsgrad', 'colsome', 'colgrad',
        'black', 'hispanic',
    ],
]


In [6]:
# Fit the model
# GMELogit is imported from the local `gme` package; judging by the header of
# its summary output it is a generalized-maximum-entropy logit estimator —
# NOTE(review): confirm against the gme package.
model = GMELogit()
# X already contains the intercept as the 'const' column; y is the `smoker`
# column of `data`.
model.fit(X, y)

In [7]:
# Show the fitted model's report: overall fit statistics followed by one line
# per coefficient. Coefficients are labelled X0..X9 by position — presumably in
# the column order of the design matrix passed to fit() (X0 = const, ...,
# X9 = hispanic); verify against the gme package.
model.summary()

Generalized Maximum Entropy (Logit)
Number of obs: 10000
Degrees of freedom: 10
Entropy for probs: 5251.1
Normalized entropy: 0.7576
Ent. ratio stat: 3360.8
P Val for LR: 0.0000
Criterion F (log L) = -5251.0949
Pseudo R2: 0.2424

Coefficients:
X0: -1.6968 (std err: 0.2005, t-stat: -8.4632, p-value: 0.0000)
X1: -0.2507 (std err: 0.0531, t-stat: -4.7187, p-value: 0.0000)
X2: -0.1887 (std err: 0.0524, t-stat: -3.5990, p-value: 0.0006)
X3: -0.0075 (std err: 0.0019, t-stat: -3.9065, p-value: 0.0002)
X4: 1.9311 (std err: 0.1924, t-stat: 10.0350, p-value: 0.0000)
X5: 1.5233 (std err: 0.1895, t-stat: 8.0379, p-value: 0.0000)
X6: 1.1801 (std err: 0.1917, t-stat: 6.1570, p-value: 0.0000)
X7: 0.4248 (std err: 0.1853, t-stat: 2.2925, p-value: 0.0437)
X8: -0.1495 (std err: 0.0844, t-stat: -1.7700, p-value: 0.1535)
X9: -0.5848 (std err: 0.0781, t-stat: -7.4922, p-value: 0.0000)


In [12]:
# Build the prediction inputs for "Mr. A": two rows that share every regressor
# value (const = 1, female = 0, age = 20, hsdrop = 1, all other indicators 0)
# and differ only in smkban — row 0 has smkban = 1, row 1 has smkban = 0.
# The columns are listed in the same order as the design matrix used for
# fitting. This cell only constructs the inputs; no probability is computed here.
test_predictions_df = pd.DataFrame(
    [
        [1, 1, 0, 20, 1, 0, 0, 0, 0, 0],  # row 0: smkban = 1
        [1, 0, 0, 20, 1, 0, 0, 0, 0, 0],  # row 1: smkban = 0
    ],
    columns=[
        'const', 'smkban', 'female', 'age',
        'hsdrop', 'hsgrad', 'colsome', 'colgrad',
        'black', 'hispanic',
    ],
)

In [13]:
# Predicted probability for each row of test_predictions_df (row 0: smkban = 1,
# row 1: smkban = 0) — presumably P(smoker = 1); confirm against the gme
# package's predict_proba.
model.predict_proba(test_predictions_df)

0    0.458729
1    0.521306
dtype: float64