In [1]:
import math
from platform import python_version

import pandas as pd
import statsmodels.api as sm
from sklearn import __version__ as sklearn_ver
from sklearn.linear_model import LogisticRegression
from statsmodels import __version__ as statsmodels_ver

# Record the interpreter and library versions this notebook was run with.
# A single print with sep='\n' emits the same four lines (plus the blank
# line produced by the leading '\n' before "pandas").
print(
    f'python: {python_version()}',
    f'\npandas: {pd.__version__}',
    f'sklearn: {sklearn_ver}',
    f'statsmodels: {statsmodels_ver}',
    sep='\n',
)

python: 3.9.7

pandas: 1.5.2
sklearn: 1.1.2
statsmodels: 0.13.2


## Simple example

In [2]:
# Load the example dataset; the path is relative to the notebook's location.
# NOTE(review): the preview shows an `Unnamed: 0` column, presumably a row
# index that was saved into the spreadsheet — it is not used by the models below.
df = pd.read_excel(io='../data/data.xlsx')
df.head(n=3)

Unnamed: 0,target,age,smoke
0,0,20,0
1,1,21,0
2,0,22,0


#### sklearn

In [3]:
# Fit a one-feature logistic regression of `target` on `age` with scikit-learn.
X = df[['age']]
y = df['target']

# NOTE: LogisticRegression() is used with its defaults, which (per the
# scikit-learn docs) apply L2 regularisation, so coefficients can differ
# slightly from an unpenalised maximum-likelihood fit.
model = LogisticRegression()
model.fit(X, y)

# probability of death
age_to_predict = 30
# The model was fitted on a DataFrame, so predict from a DataFrame that carries
# the same column name. Passing a bare nested list ([[30]]) makes scikit-learn
# emit "X does not have valid feature names" because it cannot check the columns.
X_new = pd.DataFrame({'age': [age_to_predict]})
# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (target == 1 for a 0/1 target).
model.predict_proba(X_new)[:, 1]



array([0.12675046])

In [4]:
# Pull the fitted intercept and the single slope out of the sklearn model.
theta0, theta1 = model.intercept_[0], model.coef_[0, 0]

print(f'theta1={round(theta1,3)}; theta0 ={round(theta0,3)}')

# probability of death from math formula: sigmoid(theta0 + theta1 * age),
# which should reproduce the predict_proba value for the same age
round(1 / (1 + math.exp(-(theta0 + theta1 * age_to_predict))), 8)

theta1=0.056; theta0 =-3.618


0.12675046

#### statsmodels

In [5]:
# Refit the age-only model with statsmodels to get the full inference table
# (standard errors, z-scores, confidence intervals).
# `sm` (statsmodels.api) is imported with the other dependencies in the first cell.
y = df['target']

# add_constant prepends a `const` column so the model estimates an intercept.
X = sm.add_constant(df[['age']])

# NOTE: this rebinds `model` from the sklearn estimator above to a statsmodels
# Logit; re-run the sklearn cells first if they are needed again.
model = sm.Logit(y, X)

result = model.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.534205
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                   81
Model:                          Logit   Df Residuals:                       79
Method:                           MLE   Df Model:                            1
Date:                Sun, 19 Mar 2023   Pseudo R-squ.:                  0.2251
Time:                        20:43:01   Log-Likelihood:                -43.271
converged:                       True   LL-Null:                       -55.842
Covariance Type:            nonrobust   LLR p-value:                 5.323e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.6184      0.861     -4.202      0.000      -5.306      -1.931
age            0.0563      0.

In [6]:
# numbers for the blog post :)
# Linear predictor (log-odds) at age 30, using the rounded coefficients
# printed in the statsmodels summary above (const = -3.6184, age = 0.0563).
-3.6184 + 0.0563 * 30

-1.9293999999999998

In [7]:
# Linear predictor (log-odds) at age 64 with the same rounded coefficients;
# the value is close to zero, i.e. a predicted probability near 0.5.
-3.6184 + 0.0563 * 64

-0.015199999999999658

In [8]:
# Linear predictor (log-odds) at age 100 with the same rounded coefficients.
-3.6184 + 0.0563 * 100

2.0116

In [9]:
# create model for 2 variables: target ~ const + age + smoke
y = df['target']
# Build the design matrix in one step; add_constant supplies the `const` column.
X = sm.add_constant(df[['age', 'smoke']])

model = sm.Logit(y, X)
result = model.fit()

print(result.summary())

Optimization terminated successfully.
         Current function value: 0.516661
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                   81
Model:                          Logit   Df Residuals:                       78
Method:                           MLE   Df Model:                            2
Date:                Sun, 19 Mar 2023   Pseudo R-squ.:                  0.2506
Time:                        20:43:01   Log-Likelihood:                -41.850
converged:                       True   LL-Null:                       -55.842
Covariance Type:            nonrobust   LLR p-value:                 8.377e-07
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.9205      0.920     -4.261      0.000      -5.724      -2.117
age            0.0568      0.