## Предобработка данных

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import Pool, CatBoostRegressor

In [2]:
# Load the dataset.
# low_memory=False makes pandas parse the file in one pass instead of in chunks,
# so each column gets a single consistently inferred dtype.

df = pd.read_csv('../data/freMPL-R.csv', low_memory=False)

In [3]:
# Keep only sub-datasets 5-9, then drop the selector column, any column that is
# entirely NaN and duplicated rows, and renumber the index from zero.
df = (
    df.loc[df["Dataset"].isin([5, 6, 7, 8, 9])]
    .drop(columns="Dataset")
    .dropna(axis="columns", how="all")
    .drop_duplicates()
    .reset_index(drop=True)
)
# Negative claim amounts are replaced with zero.
df.loc[df["ClaimAmount"] < 0, "ClaimAmount"] = 0
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Exposure           115155 non-null  float64
 1   LicAge             115155 non-null  int64  
 2   RecordBeg          115155 non-null  object 
 3   RecordEnd          59455 non-null   object 
 4   Gender             115155 non-null  object 
 5   MariStat           115155 non-null  object 
 6   SocioCateg         115155 non-null  object 
 7   VehUsage           115155 non-null  object 
 8   DrivAge            115155 non-null  int64  
 9   HasKmLimit         115155 non-null  int64  
 10  BonusMalus         115155 non-null  int64  
 11  ClaimAmount        115155 non-null  float64
 12  ClaimInd           115155 non-null  int64  
 13  ClaimNbResp        115155 non-null  float64
 14  ClaimNbNonResp     115155 non-null  float64
 15  ClaimNbParking     115155 non-null  float64
 16  Cl

In [4]:
# Number of rows per distinct VehUsage value
df["VehUsage"].value_counts()

Private+trip to office    59834
Private                   38839
Professional              14302
Professional run           2180
Name: VehUsage, dtype: int64

In [5]:
def SeriesFactorizer(series):
    """Encode a series as integer codes and report the code-to-value mapping.

    Parameters
    ----------
    series : pandas.Series or array-like
        Values to encode; passed straight to ``pd.factorize``.

    Returns
    -------
    codes : numpy.ndarray
        Integer code for every element, numbered in order of first appearance.
    reference : dict
        Mapping ``code -> original value``. It is also printed as a side effect
        so the encoding is visible in the notebook output.
    """
    codes, categories = pd.factorize(series)
    # enumerate yields (code, value) pairs, which is exactly the lookup we want.
    reference = dict(enumerate(categories))
    print(reference)
    return codes, reference

In [6]:
# Replace Gender labels with integer codes; GenderRef keeps the code -> label lookup.
df["Gender"], GenderRef = SeriesFactorizer(df["Gender"])

{0: 'Male', 1: 'Female'}


In [7]:
# Replace MariStat labels with integer codes; MariStatRef keeps the code -> label lookup.
df["MariStat"], MariStatRef = SeriesFactorizer(df["MariStat"])

{0: 'Other', 1: 'Alone'}


In [8]:
# Keep only the first four characters of each SocioCateg value (e.g. 'CSP5').
df['SocioCateg'] = df['SocioCateg'].str[:4]

In [9]:
# Re-check column dtypes after the Gender / MariStat encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 115155 entries, 0 to 115154
Data columns (total 20 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Exposure           115155 non-null  float64
 1   LicAge             115155 non-null  int64  
 2   RecordBeg          115155 non-null  object 
 3   RecordEnd          59455 non-null   object 
 4   Gender             115155 non-null  int64  
 5   MariStat           115155 non-null  int64  
 6   SocioCateg         115155 non-null  object 
 7   VehUsage           115155 non-null  object 
 8   DrivAge            115155 non-null  int64  
 9   HasKmLimit         115155 non-null  int64  
 10  BonusMalus         115155 non-null  int64  
 11  ClaimAmount        115155 non-null  float64
 12  ClaimInd           115155 non-null  int64  
 13  ClaimNbResp        115155 non-null  float64
 14  ClaimNbNonResp     115155 non-null  float64
 15  ClaimNbParking     115155 non-null  float64
 16  Cl

In [10]:
# Squared driver age as an extra feature. Computed with vectorized arithmetic
# instead of a per-row Python lambda; values and dtype are the same.
df['DrivAgeSq'] = df['DrivAge'] ** 2
df.head()

Unnamed: 0,Exposure,LicAge,RecordBeg,RecordEnd,Gender,MariStat,SocioCateg,VehUsage,DrivAge,HasKmLimit,...,ClaimAmount,ClaimInd,ClaimNbResp,ClaimNbNonResp,ClaimNbParking,ClaimNbFireTheft,ClaimNbWindscreen,OutUseNb,RiskArea,DrivAgeSq
0,0.083,332,2004-01-01,2004-02-01,0,0,CSP5,Professional,46,0,...,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,2116
1,0.916,333,2004-02-01,,0,0,CSP5,Professional,46,0,...,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,9.0,2116
2,0.55,173,2004-05-15,2004-12-03,0,0,CSP5,Private+trip to office,32,0,...,0.0,0,0.0,2.0,0.0,0.0,0.0,0.0,7.0,1024
3,0.089,364,2004-11-29,,1,0,CSP5,Private+trip to office,52,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2704
4,0.233,426,2004-02-07,2004-05-01,0,0,CSP6,Private,57,0,...,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,3249


In [11]:
# Total claims per record: the claim indicator plus every per-type claim
# counter, added left to right.
claim_count_columns = ["ClaimNbResp", "ClaimNbNonResp", "ClaimNbParking", "ClaimNbFireTheft", "ClaimNbWindscreen"]
claims_total = df["ClaimInd"]
for column_name in claim_count_columns:
    claims_total = claims_total + df[column_name]
df["ClaimsCount"] = claims_total
# Records with a zero claim amount are treated as having no claims.
df.loc[df["ClaimAmount"] == 0, "ClaimsCount"] = 0
# The per-type counters are now folded into ClaimsCount, so remove them.
df.drop(columns=claim_count_columns, inplace=True)

In [12]:
# Recode a ClaimsCount of exactly 11 to 10; every other value is left untouched.
# NOTE(review): looks like a cap on the largest observed count — confirm that no
# value above 11 can occur, since such values would not be changed by this line.
df.loc[df.ClaimsCount == 11, 'ClaimsCount'] = 10

In [13]:
# RiskArea is read as float (e.g. 9.0); store it as int64 instead.
# NOTE(review): presumably required because RiskArea is later passed to CatBoost
# as a categorical feature, which is not expected to be float — confirm.
df["RiskArea"] = df["RiskArea"].astype('int64')

In [14]:
# Severity dataset: only records with at least one claim, plus the average
# amount per claim as a new AvgClaim column.
dfAC = df.loc[df["ClaimsCount"] > 0].assign(
    AvgClaim=lambda claims: claims["ClaimAmount"] / claims["ClaimsCount"]
)

## Разделение набора данных на обучающую и тестовую выборки

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Frequency dataset: features are all columns except the claim outcome columns
# and the record dates. 80/20 train/test split; no separate validation set is made.
frequency_excluded = ["ClaimInd", "ClaimAmount", "ClaimsCount", "RecordBeg", "RecordEnd"]
X_c = df.drop(columns=frequency_excluded)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_c, df["ClaimsCount"], test_size=0.2, random_state=1
)

In [17]:
# Average-claim dataset: same exclusions as the frequency features plus the
# AvgClaim target itself. 80/20 train/test split; no separate validation set is made.
severity_excluded = ["ClaimInd", "ClaimAmount", "ClaimsCount", "RecordBeg", "RecordEnd", "AvgClaim"]
X_ac = dfAC.drop(columns=severity_excluded)
X_train_ac, X_test_ac, y_train_ac, y_test_ac = train_test_split(
    X_ac, dfAC["AvgClaim"], test_size=0.2, random_state=1
)

In [18]:
# Feature columns that are passed to CatBoost as categorical (cat_features)
cat_columns = ["Gender", "MariStat", "SocioCateg", "VehUsage", "RiskArea"]

In [19]:
def _build_pool(features, target=None):
    """Wrap a feature frame in a CatBoost Pool weighted by Exposure.

    The Exposure column is removed from the model inputs and supplied as the
    per-row ``weight`` instead. ``target`` is left as None for pools that are
    only used for prediction.
    """
    return Pool(data=features.drop(columns="Exposure"),
                label=target,
                cat_features=cat_columns,
                weight=features["Exposure"])


# Claim-frequency pools
train_pool_c = _build_pool(X_train_c, y_train_c)
test_pool_c = _build_pool(X_test_c)

# Average-claim pools
train_pool_ac = _build_pool(X_train_ac, y_train_ac)
test_pool_ac = _build_pool(X_test_ac)

In [20]:
# Claim-count model with default CatBoost settings; training log output is silenced.
model_c = (
    CatBoostRegressor(logging_level="Silent")
    .fit(train_pool_c)
)

In [21]:
# Average-claim model with default CatBoost settings; training log output is silenced.
model_ac = (
    CatBoostRegressor(logging_level="Silent")
    .fit(train_pool_ac)
)

In [22]:
# Predict on the held-out test pools with the matching fitted model
pred_c = model_c.predict(test_pool_c)
pred_ac = model_ac.predict(test_pool_ac)

In [23]:
from sklearn.metrics import r2_score as R2, mean_absolute_error as MAE, mean_squared_error as MSE

In [24]:
# Evaluate each model on its test split: R2, MAE and MSE, in that order.
r2_c, mae_c, mse_c = (metric(y_test_c, pred_c) for metric in (R2, MAE, MSE))
r2_ac, mae_ac, mse_ac = (metric(y_test_ac, pred_ac) for metric in (R2, MAE, MSE))

print(f'ClaimsCount:\nR2 = {r2_c}\nMAE = {mae_c}\nMSE = {mse_c}')
print(f'\nAvgClaim model:\nR2 = {r2_ac}\nMAE = {mae_ac}\nMSE = {mse_ac}')

ClaimsCount:
R2 = -0.017285104594594092
MAE = 0.447560585637332
MSE = 0.5809544937283284

AvgClaim model:
R2 = -0.034295315130244086
MAE = 1140.6234235176398
MSE = 9026741.961362759


In [25]:
# Persist both fitted models in CatBoost's binary "cbm" format.
# Make sure the target directory exists first so that saving does not fail on a
# fresh checkout where ./models has not been created yet.
Path("./models").mkdir(parents=True, exist_ok=True)
model_c.save_model("./models/ClaimsCount_model.cbm", format="cbm")
model_ac.save_model("./models/AvgClaim_model.cbm", format="cbm")