In [1]:
from Fairness_aware_model import FairnessAwareModel
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split

In [2]:
# dataCar_clean.csv carries a precomputed 'train' indicator column, so the same
# train/test partition can be reproduced in R if needed. The earlier random
# split via sklearn's train_test_split(test_size=0.2) is no longer used.
dataCar = pd.read_csv('dataCar_clean.csv')
train = dataCar[dataCar['train'].eq(1)]
test = dataCar[dataCar['train'].eq(0)]

# Modèle baseline avec la méthode bfgs

In [3]:
# Baseline logistic GLM fitted on the training rows; veh_age and agecat are
# wrapped in C(...) so the formula treats them as categorical.
mod = smf.glm(
    formula="clm ~ veh_value + veh_body + C(veh_age) + area + gender + C(agecat)",
    data=train,
    family=sm.families.Binomial(),
)
res = mod.fit(method="bfgs")
# Per the author's note, this matches the fit obtained in R.
print(res.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                    clm   No. Observations:                54285
Model:                            GLM   Df Residuals:                    54257
Model Family:                Binomial   Df Model:                           27
Link Function:                  Logit   Scale:                          1.0000
Method:                          bfgs   Log-Likelihood:                -13435.
Date:                Mon, 14 Nov 2022   Deviance:                       26870.
Time:                        09:14:21   Pearson chi2:                 5.43e+04
No. Iterations:                     0   Pseudo R-squ. (CS):           0.002612
Covariance Type:            nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -1.9774      0.48

In [4]:
# Names of the protected attributes (columns of dataCar).
protected_attributes = ["gender", "agecat"]
# Candidate response columns extracted as NumPy arrays (presumably claim
# indicator, claim count and claim cost -- confirm against the dataset docs).
# NOTE(review): the previous comment said "Standard scaling for regression",
# but no scaling is applied anywhere in this cell.
binary_answer, numclaim, reg_claim = (
    dataCar[target].values for target in ("clm", "numclaims", "claimcst0")
)

In [5]:
def hot_encoder(data, optional_columns=()):
    """One-hot encode the categorical columns of ``data``.

    A column is encoded when its dtype is ``object`` or when its name appears
    in ``optional_columns`` (e.g. integer-coded categories).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode; it is not modified.
    optional_columns : collection of str, optional
        Extra column names to encode regardless of dtype. Defaults to none.

    Returns
    -------
    pandas.DataFrame
        The non-encoded columns followed by one ``<column>_<level>`` indicator
        column per level, stored as ``uint8`` 0/1 values.
    """
    categorical_cols = [
        column
        for column in data.columns
        if data[column].dtype == object or column in optional_columns
    ]

    # pandas >= 2.0 emits bool indicators by default (uint8 before). Mixing
    # bool with float columns turns the frame into an object array under
    # np.array(...), which breaks numeric model fitting, so pin the dtype.
    return pd.get_dummies(data, columns=categorical_cols, dtype=np.uint8)

# One-hot encode dataCar (gender and agecat are forced to be categorical), then
# remove the response columns, 'exposure', and one indicator per categorical
# variable (presumably the reference levels, since an intercept is added later).
dataCar_encoded = hot_encoder(dataCar, ["gender", "agecat"]).drop(
    columns=[
        "clm", "numclaims", "claimcst0",
        "veh_body_BUS", "area_A", "exposure",
        "gender_F", "agecat_1",
    ]
)

In [6]:
# Split the encoded design data with the same 'train' flag used for the raw data.
train_encoded = dataCar_encoded[dataCar_encoded['train'].eq(1)]
test_encoded = dataCar_encoded[dataCar_encoded['train'].eq(0)]

# Binary claim indicator: full series plus its train/test parts.
y = dataCar['clm']
y_train = dataCar.loc[dataCar['train'].eq(1), 'clm']
y_test = dataCar.loc[dataCar['train'].eq(0), 'clm']

# The split flag itself is not a predictor.
X_train_encoded = train_encoded.drop(columns=["train"])
X_test_encoded = test_encoded.drop(columns=["train"])

In [7]:
# Replace each protected attribute name by the position of its column in dataCar.
# Entries that are already integer positions are kept as-is, so re-running this
# cell is harmless (the original lookup raised IndexError on a second run because
# the names had already been overwritten by integers).
# NOTE(review): these positions refer to the columns of dataCar, not to the
# one-hot encoded matrix (with a prepended intercept column) that is passed to
# FairnessAwareModel.fit -- confirm which indexing the model expects.
protected_attributes = [
    attr if isinstance(attr, (int, np.integer)) else dataCar.columns.get_loc(attr)
    for attr in protected_attributes
]

In [8]:
fam = FairnessAwareModel(
    regularization=0,
    protected_attributes=protected_attributes,
    family="binomial",
    offset=1,
)
# The model needs an explicit intercept, so a column of ones is prepended to
# the encoded training features.
fam.fit(
    np.column_stack([np.ones(len(X_train_encoded)), np.array(X_train_encoded)]),
    y_train.values,
)

         Current function value: 13442.990079
         Iterations: 43
         Function evaluations: 1485
         Gradient evaluations: 55


# Comparaison avec `statsmodels`

In [9]:
# Reference fit: plain logistic regression on the same encoded features, with
# the constant column added by statsmodels.
model2 = sm.Logit(
    endog=y_train,
    exog=sm.add_constant(X_train_encoded),
).fit()

Optimization terminated successfully.
         Current function value: 0.247637
         Iterations 8


In [12]:
# Print the coefficient table of the statsmodels Logit fit (model2).
print(model2.summary())

                           Logit Regression Results                           
Dep. Variable:                    clm   No. Observations:                54285
Model:                          Logit   Df Residuals:                    54259
Method:                           MLE   Df Model:                           25
Date:                Mon, 14 Nov 2022   Pseudo R-squ.:                0.004681
Time:                        09:15:08   Log-Likelihood:                -13443.
converged:                       True   LL-Null:                       -13506.
Covariance Type:            nonrobust   LLR p-value:                 1.596e-15
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -1.8913      0.489     -3.865      0.000      -2.850      -0.932
veh_value          0.0574      0.020      2.906      0.004       0.019       0.096
veh_age            0.0078      0.021

In [10]:
# Side-by-side table of the coefficients from FairnessAwareModel (fam.beta) and
# from statsmodels (model2.params); rows are labelled by the index of model2.params.
pd.DataFrame(
    {
        "variable": ["intercept", *X_train_encoded.columns],
        "beta": fam.beta,
        "beta_statsnodel": model2.params,
    }
)

Unnamed: 0,variable,beta,beta_statsnodel
const,intercept,-1.89124,-1.891278
veh_value,veh_value,0.057412,0.057412
veh_age,veh_age,0.007808,0.007808
veh_body_CONVT,veh_body_CONVT,-1.866971,-1.866944
veh_body_COUPE,veh_body_COUPE,-0.347521,-0.347482
veh_body_HBACK,veh_body_HBACK,-0.589881,-0.589844
veh_body_HDTOP,veh_body_HDTOP,-0.434935,-0.434897
veh_body_MCARA,veh_body_MCARA,0.054582,0.054627
veh_body_MIBUS,veh_body_MIBUS,-0.648116,-0.648079
veh_body_PANVN,veh_body_PANVN,-0.35023,-0.350192
