In [102]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_tweedie_deviance

In [62]:
data = pd.read_csv('/Users/arthurrastelli/Desktop/Assignment/Assignment.csv')
data = data.rename(columns= {'duree': 'expo','nbrtotan': 'freq','nbrtotc': 'nclaims'})
data["sev"] = data['chargtot']/data['freq']
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163657 entries, 0 to 163656
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   AGEPH     163657 non-null  int64  
 1   CODPOSS   163657 non-null  int64  
 2   expo      163657 non-null  float64
 3   lnexpo    163657 non-null  float64
 4   nclaims   163657 non-null  int64  
 5   freq      163657 non-null  float64
 6   chargtot  163657 non-null  float64
 7   agecar    163657 non-null  object 
 8   sexp      163657 non-null  object 
 9   fuelc     163657 non-null  object 
 10  split     163657 non-null  object 
 11  usec      163657 non-null  object 
 12  fleetc    163657 non-null  object 
 13  sportc    163657 non-null  object 
 14  coverp    163657 non-null  object 
 15  powerc    163657 non-null  object 
 16  sev       18345 non-null   float64
dtypes: float64(5), int64(3), object(9)
memory usage: 21.2+ MB


Unnamed: 0,AGEPH,CODPOSS,expo,lnexpo,nclaims,freq,chargtot,agecar,sexp,fuelc,split,usec,fleetc,sportc,coverp,powerc,sev
0,64,1000,1.0,0.0,0,0.0,0.0,2-5,Female,Petrol,Once,Private,No,No,MTPL+,66-110,
1,28,1000,0.046575,-3.066684,1,21.470588,155.974606,6-10,Female,Petrol,Twice,Private,No,No,MTPL,66-110,7.264571
2,58,1000,0.40274,-0.909465,0,0.0,0.0,>10,Female,Petrol,Thrice,Private,No,No,MTPL,<66,
3,37,1030,0.169863,-1.772763,0,0.0,0.0,2-5,Female,Petrol,Once,Professional,No,No,MTPL+++,66-110,
4,29,1030,1.0,0.0,0,0.0,0.0,6-10,Female,Petrol,Once,Private,No,No,MTPL+,<66,


Bin CODPOSS

In [63]:
def categorize_postal_code(code):
    if 1000 <= code <= 1299:
        return 'BHG'
    elif 1300 <= code <= 1499:
        return 'WB'
    elif 1500 <= code <= 1999 or 3000 <= code <= 3499:
        return 'VB'
    elif 2000 <= code <= 2999:
        return 'ANT'
    elif 3500 <= code <= 3999:
        return 'LIM'
    elif 4000 <= code <= 4999:
        return 'LUI'
    elif 5000 <= code <= 5999:
        return 'NAM'
    elif 6000 <= code <= 6599 or 7000 <= code <= 7999:
        return 'HEN'
    elif 6600 <= code <= 6999:
        return 'LUX'
    elif 8000 <= code <= 8999:
        return 'WV'
    elif 9000 <= code <= 9999:
        return 'OV'
    else:
        return 'Unknown'
    
data['region'] = data['CODPOSS'].apply(categorize_postal_code)
data

Unnamed: 0,AGEPH,CODPOSS,expo,lnexpo,nclaims,freq,chargtot,agecar,sexp,fuelc,split,usec,fleetc,sportc,coverp,powerc,sev,region
0,64,1000,1.000000,0.000000,0,0.000000,0.000000,2-5,Female,Petrol,Once,Private,No,No,MTPL+,66-110,,BHG
1,28,1000,0.046575,-3.066684,1,21.470588,155.974606,6-10,Female,Petrol,Twice,Private,No,No,MTPL,66-110,7.264571,BHG
2,58,1000,0.402740,-0.909465,0,0.000000,0.000000,>10,Female,Petrol,Thrice,Private,No,No,MTPL,<66,,BHG
3,37,1030,0.169863,-1.772763,0,0.000000,0.000000,2-5,Female,Petrol,Once,Professional,No,No,MTPL+++,66-110,,BHG
4,29,1030,1.000000,0.000000,0,0.000000,0.000000,6-10,Female,Petrol,Once,Private,No,No,MTPL+,<66,,BHG
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163652,50,9960,1.000000,0.000000,0,0.000000,0.000000,2-5,Male,Petrol,Once,Private,No,No,MTPL+,<66,,OV
163653,86,9960,1.000000,0.000000,0,0.000000,0.000000,6-10,Male,Petrol,Once,Private,No,No,MTPL,<66,,OV
163654,64,9960,1.000000,0.000000,0,0.000000,0.000000,2-5,Male,Petrol,Twice,Private,No,No,MTPL,<66,,OV
163655,58,9970,1.000000,0.000000,1,1.000000,3177.994988,>10,Male,Petrol,Once,Private,Yes,No,MTPL,<66,3177.994988,OV


GENERALIZED LINEAR MODELS: FREQUENCY

In [64]:
formula1 = '''
freq ~
AGEPH + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(usec, Treatment(reference="Private")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [65]:
glm_fit1 = smf.glm(formula=formula1, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit1.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               163657
Model:                            GLM   Df Residuals:                   163632
Model Family:                 Poisson   Df Model:                           24
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -91817.
Date:                Mon, 20 May 2024   Deviance:                   1.4391e+05
Time:                        18:44:01   Pearson chi2:                 5.98e+07
No. Iterations:                     7   Pseudo R-squ. (CS):            0.02750
Covariance Type:            nonrobust                                         
                                                              coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------

REMOVE THE USEC VARIABLE AS IT IS INSIGNIFICANT

In [66]:
formula2 = '''
freq ~
AGEPH + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [67]:
glm_fit2 = smf.glm(formula=formula2, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit2.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               163657
Model:                            GLM   Df Residuals:                   163633
Model Family:                 Poisson   Df Model:                           23
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -91817.
Date:                Mon, 20 May 2024   Deviance:                   1.4392e+05
Time:                        18:44:05   Pearson chi2:                 5.97e+07
No. Iterations:                     7   Pseudo R-squ. (CS):            0.02749
Covariance Type:            nonrobust                                         
                                                        coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------

BIN THE AGEPH variable

In [68]:
bins = [19,29,39,49,59,69,79,89,99]
labels=['0-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-99']

data['age_bins'] = pd.cut(data['AGEPH'],bins=bins,labels=labels,right=False)

In [55]:
formula3 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [69]:
glm_fit3 = smf.glm(formula=formula3, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit3.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               163638
Model:                            GLM   Df Residuals:                   163608
Model Family:                 Poisson   Df Model:                           29
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -91513.
Date:                Mon, 20 May 2024   Deviance:                   1.4332e+05
Time:                        18:44:12   Pearson chi2:                 5.65e+07
No. Iterations:                     9   Pseudo R-squ. (CS):            0.03051
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

REMOVE THE INSIGNIFICANT VARIABLES
1. agecat 70-79

In [85]:
data = data[data['age_bins'] != '70-79']

In [86]:
formula4 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [87]:
glm_fit4 = smf.glm(formula=formula4, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit4.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               154947
Model:                            GLM   Df Residuals:                   154919
Model Family:                 Poisson   Df Model:                           27
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -86910.
Date:                Mon, 20 May 2024   Deviance:                   1.3626e+05
Time:                        18:48:54   Pearson chi2:                 5.63e+07
No. Iterations:                     9   Pseudo R-squ. (CS):            0.03143
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

REMOVE INSIGNIFICANT VARIABLES
2. WB

In [88]:
data = data[data['region']!="WB"]

In [89]:
formula5 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [90]:
glm_fit5 = smf.glm(formula=formula5, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit5.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               154947
Model:                            GLM   Df Residuals:                   154919
Model Family:                 Poisson   Df Model:                           27
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -86910.
Date:                Mon, 20 May 2024   Deviance:                   1.3626e+05
Time:                        18:49:15   Pearson chi2:                 5.63e+07
No. Iterations:                     9   Pseudo R-squ. (CS):            0.03143
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

REMOVE INSIGNIFICANT VARIABLES: 
3. LUI

In [91]:
data = data[data['region']!="LUI"]

In [92]:
formula6 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [93]:
glm_fit6 = smf.glm(formula=formula6, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit6.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               133532
Model:                            GLM   Df Residuals:                   133505
Model Family:                 Poisson   Df Model:                           26
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -76581.
Date:                Mon, 20 May 2024   Deviance:                   1.2081e+05
Time:                        18:50:56   Pearson chi2:                 5.55e+07
No. Iterations:                    10   Pseudo R-squ. (CS):            0.03346
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

REMOVE INSIGNIFICANT VARIABLES

In [94]:
data = data[data['region']!="ANT"]

In [96]:
formula7 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [97]:
glm_fit7 = smf.glm(formula=formula7, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit7.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               113979
Model:                            GLM   Df Residuals:                   113953
Model Family:                 Poisson   Df Model:                           25
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -65745.
Date:                Mon, 20 May 2024   Deviance:                   1.0369e+05
Time:                        18:52:20   Pearson chi2:                 3.89e+07
No. Iterations:                    10   Pseudo R-squ. (CS):            0.03756
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

REMOVE THE >110 In POWERC

In [98]:
data = data[data['powerc']!=">110"]

In [99]:
formula8 = '''
freq ~
C(age_bins, Treatment(reference="20-29")) + 
C(agecar, Treatment(reference="6-10")) + 
C(sexp, Treatment(reference="Male")) + 
C(fuelc, Treatment(reference="Petrol")) + 
C(split, Treatment(reference="Once")) + 
C(fleetc, Treatment(reference="No")) + 
C(sportc, Treatment(reference="No")) + 
C(powerc, Treatment(reference="<66")) +
C(region, Treatment(reference="HEN"))
'''

In [100]:
glm_fit8 = smf.glm(formula=formula8, data=data, exposure=data['expo'], family = sm.families.Poisson(link=sm.families.links.log())).fit()
print(glm_fit8.summary())



                 Generalized Linear Model Regression Results                  
Dep. Variable:                   freq   No. Observations:               112849
Model:                            GLM   Df Residuals:                   112824
Model Family:                 Poisson   Df Model:                           24
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -65230.
Date:                Mon, 20 May 2024   Deviance:                   1.0293e+05
Time:                        18:54:29   Pearson chi2:                 3.89e+07
No. Iterations:                    10   Pseudo R-squ. (CS):            0.03773
Covariance Type:            nonrobust                                         
                                                         coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------

In [106]:
prediction = glm_fit8.predict()
diff = data['freq']-prediction

ValueError: operands could not be broadcast together with shapes (112863,) (112849,) 