In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the claims extract, using the first CSV column as the index.
# 'Policy Holder date of birth' is deliberately NOT listed in `parse_dates`:
# it contains an out-of-range year ('19/05/1070'), so pandas cannot turn it into
# datetimes at read time and would silently hand it back as plain strings anyway.
# Keeping it as text makes the `.str` lookup below reliable; the column is
# repaired and converted with `pd.to_datetime` further down.
df = pd.read_csv('DataFrame_Claims_Spain.csv', encoding='latin_1', index_col=0,
                 parse_dates=['Claim Incident date', 'FE_Declaration_date', 'Initial coverage date',
                              'First claim decision date', 'Last claim decisión date'],
                 infer_datetime_format=True, dtype={'Age at signature': np.int64})

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Preview the first five rows of the loaded claims table.
df.head()

Unnamed: 0,Risk code,Claim Incident date,FE_Declaration_date,Initial coverage date,First claim decision date,Last claim decisión date,Insured amount,Initial_Instalment_Amount,Policy Holder date of birth,Age at signature,Sexo,Refused decision reason code,Trad_Refusal_reason,Refusal_Category,Claim_Status_Level_0,Refusal_Flag,Local Partner name categories,Insured NIF categories,Claim Number categories
2,5,2009-04-22,2010-01-15,2006-10-04,2010-01-21,2010-09-28,2896.44,241.37,1945-07-22 00:00:00,61,Femme,139.0,Waiting Period,Administrative,Claim refused,Yes,4,0,56529
3,5,2009-04-22,2010-01-15,2004-08-05,2010-01-21,2010-09-28,1080.0,90.0,1945-07-22 00:00:00,59,Femme,139.0,Waiting Period,Administrative,Claim refused,Yes,4,0,56528
5,5,2009-04-13,2013-10-09,2007-10-23,2013-10-09,2014-06-20,5899.44,983.24,1945-11-13 00:00:00,61,Homme,139.0,Waiting Period,Administrative,Claim refused,Yes,65,1,79211
6,5,2009-12-22,2010-04-05,2008-11-25,2010-04-12,2010-10-15,410.58,68.43,1945-12-04 00:00:00,62,Homme,139.0,Waiting Period,Administrative,Claim refused,Yes,15,2,5822
7,13,2010-04-09,2010-07-06,2010-02-19,2010-07-14,2011-01-14,790.02,131.67,1946-02-03 00:00:00,64,Homme,139.0,Waiting Period,Administrative,Claim refused,Yes,4,3,8935


The column `'Policy Holder date of birth'` contains a corrupted date: one entry has the year 1070.

In [4]:
# Show the birth dates that contain the impossible year 1070.
# The column holds strings at this point. `na=False` makes missing values count
# as "no match", so the boolean mask never contains NaN, and `regex=False`
# searches for the literal text. `.loc` selects rows and column in one step
# instead of chaining two indexing operations.
df.loc[df['Policy Holder date of birth'].str.contains('1070', regex=False, na=False),
       'Policy Holder date of birth']

33949    19/05/1070
Name: Policy Holder date of birth, dtype: object

In [5]:
# Row 33949 carries the typo '19/05/1070' (found by the lookup above): restore
# the year 1970. The corrected value is written in the same
# 'YYYY-MM-DD HH:MM:SS' layout as the rows displayed by `df.head()`, so the
# column does not mix day-first and ISO strings when it is later passed to
# `pd.to_datetime` (recent pandas versions infer one format from the first
# value and reject entries that do not follow it).
df.loc[33949, 'Policy Holder date of birth'] = '1970-05-19 00:00:00'
# Overwrite the age at signature of the same row.
df.loc[33949, 'Age at signature'] = 29

Convert the date-of-birth column to the `datetime` dtype

In [6]:
# The column is still of object dtype at this point (see the `dtype: object`
# output of the lookup above), so convert it explicitly now that the bad year
# has been fixed.
df['Policy Holder date of birth'] = pd.to_datetime(df['Policy Holder date of birth'])

Convert the categorical columns to the `category` dtype

In [7]:
def convert_to_int_object(col):
    """Cast every element of a Series to a Python ``int``, keeping missing values.

    Parameters
    ----------
    col : pandas.Series
        Values to convert, e.g. float codes such as ``139.0`` mixed with NaN.

    Returns
    -------
    pandas.Series
        Series of ``object`` dtype sharing ``col``'s index. Each element is
        ``int(x)`` when that conversion succeeds and ``np.nan`` otherwise.
    """
    converted = []
    # `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
    for _, value in col.items():
        try:
            converted.append(int(value))
        except (TypeError, ValueError, OverflowError):
            # ValueError: NaN or non-numeric text; TypeError: None and other
            # objects `int()` cannot handle; OverflowError: +/- infinity.
            converted.append(np.nan)
    return pd.Series(converted, index=col.index, dtype=object)

In [8]:
# Reason codes are displayed as floats (e.g. 139.0): turn them into integers,
# leaving missing codes as NaN, before the categorical cast below.
df['Refused decision reason code'] = convert_to_int_object(df['Refused decision reason code'])

# Cast every categorical column to the pandas `category` dtype.
for categorical_column in (
    'Risk code',
    'Sexo',
    'Refused decision reason code',
    'Trad_Refusal_reason',
    'Refusal_Category',
    'Claim_Status_Level_0',
    'Refusal_Flag',
    'Local Partner name categories',
    'Insured NIF categories',
):
    df[categorical_column] = df[categorical_column].astype('category')

Drop the columns that should not be used during classification

In [9]:
# The refusal flag is the label to predict.
target = df['Refusal_Flag']

# Columns left out of the feature matrix: the label itself, the columns that
# (judging by their names) describe the claim decision, and the insured-person
# and claim-number identifiers.
excluded_columns = [
    'Refusal_Flag',
    'Refused decision reason code', 'Claim_Status_Level_0',
    'Trad_Refusal_reason', 'Refusal_Category',
    'First claim decision date', 'Last claim decisión date',
    'Insured NIF categories', 'Claim Number categories',
]
data = df.drop(excluded_columns, axis='columns')

Exercise: Remove the claims with an administrative refusal using the `'Refusal_Category'` column

Encode the label

In [10]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the refusal flag, then map each flag value to its integer code.
label_encoder = LabelEncoder().fit(target)
label = label_encoder.transform(target)

Define the different parts of the pipeline:

* One-hot encode the categorical features;
* Ordinal encode the binary categorical features;
* Standard scale the numerical features;
* Engineer two features from the dates: (i) the time to declaration and (ii) the contract seniority.

In [11]:
# Column groups, one per preprocessing branch of the column transformer.

# Categorical columns that are one-hot encoded.
one_hot_encoded_cat_features = [
    'Risk code',
    'Local Partner name categories',
]

# Binary categorical column that is ordinal encoded.
ordinal_encoded_features = [
    'Sexo',
]

# Numerical columns that are standard scaled.
standard_scaled_features = [
    'Insured amount',
    'Initial_Instalment_Amount',
    'Age at signature',
]

# Pair of date columns handed to `time_to_declaration`.
time_to_declaration_features = [
    'FE_Declaration_date',
    'Claim Incident date',
]

# Pair of date columns handed to `contract_seniority`.
contract_seniority_features = [
    'Claim Incident date',
    'Initial coverage date',
]

In [12]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

from sklearn.impute import SimpleImputer

from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

from function_FunctionTransformer import time_to_declaration
from function_FunctionTransformer import contract_seniority

In [13]:
# Assemble the per-column preprocessing. Every entry is a
# `(transformer, columns)` pair, which is the order `make_column_transformer`
# expects; the reverse `(columns, transformer)` order was only valid in the
# very first 0.20 release of scikit-learn and has since been removed.
preprocessor = make_column_transformer(
    # Binary categorical column -> integer codes.
    (OrdinalEncoder(), ordinal_encoded_features),
    # Multi-valued categorical columns -> missing values replaced by -1, then
    # one-hot encoded; `handle_unknown='ignore'` avoids errors on categories
    # that were not seen while fitting.
    (make_pipeline(
        SimpleImputer(strategy='constant', fill_value=-1),
        OneHotEncoder(handle_unknown='ignore')),
     one_hot_encoded_cat_features),
    # Date pair -> `time_to_declaration` feature, rescaled with MinMaxScaler.
    (make_pipeline(
        FunctionTransformer(func=time_to_declaration, validate=False),
        MinMaxScaler()),
     time_to_declaration_features),
    # Date pair -> `contract_seniority` feature, rescaled with MinMaxScaler.
    (make_pipeline(
        FunctionTransformer(func=contract_seniority, validate=False),
        MinMaxScaler()),
     contract_seniority_features),
    # Numerical columns -> standardised, then missing values replaced by the
    # median of the (already scaled) column.
    (make_pipeline(
        StandardScaler(),
        SimpleImputer(strategy='median')),
     standard_scaled_features),
    n_jobs=-1
)



Use a `RandomForestClassifier` to classify the claims within a 3-fold cross-validation. We will report the balanced accuracy and the ROC AUC (the `balanced_accuracy` and `roc_auc` scorers).

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

In [15]:
# Chain the preprocessing with a 100-tree random forest. The fixed
# `random_state` makes the forest reproducible, so that re-running the
# notebook yields the same scores instead of slightly different ones each time.
pipe = make_pipeline(
    preprocessor,
    RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
)

In [None]:
# 3-fold cross-validation of the whole pipeline (preprocessing + forest),
# scored with ROC AUC and balanced accuracy. Note that n_jobs=-1 is requested
# here as well as in the column transformer and in the random forest.
scores = cross_validate(pipe, data, label, scoring=['roc_auc', 'balanced_accuracy'], cv=3, n_jobs=-1)

Convert the scores to a dataframe to have a nice display

In [None]:
# `scores` is rebound from the raw `cross_validate` result to a DataFrame;
# the cells below rely on the DataFrame form.
scores = pd.DataFrame(scores)
scores

Compute the mean performance

In [None]:
# Column-wise mean of `scores`, transposed into a single-row table.
scores.mean().to_frame().T

As well as the standard deviation of those performances

In [None]:
# Column-wise standard deviation of `scores`, transposed into a single-row table.
scores.std().to_frame().T

Exercise: Create a new feature containing the number of claims already declared by the customer (use the `'Insured NIF categories'` column) and rerun the classification

Exercise: Create a new feature containing the month in which the claim occurred (use the `'Claim Incident date'` column) and rerun the classification