In [12]:
import pandas as pd

df = pd.read_csv('cleaner_dataframe.csv', index_col=0,
                 parse_dates=['FE_Declaration_date',
                              'Claim Incident date',
                              'Initial coverage date',
                              'First claim decision date',
                              'Last claim decisión date',
                              'Policy Holder date of birth'])
df.loc[:, 'Age policy at claim'] = pd.to_timedelta(df.loc[:, 'Age policy at claim'])
df.loc[:, 'Delay declaration'] = pd.to_timedelta(df.loc[:, 'Delay declaration'])
df.loc[:, 'Age client at claim'] = pd.to_timedelta(df.loc[:, 'Age client at claim'])
df.head()

Unnamed: 0,Risk code,Claim Incident date,FE_Declaration_date,Initial coverage date,First claim decision date,Last claim decisión date,Insured amount,Initial_Instalment_Amount,Policy Holder date of birth,Age at signature,...,Trad_Refusal_reason,Refusal_Category,Claim_Status_Level_0,Refusal_Flag,Local Partner name categories,Insured NIF categories,Claim Number categories,Age policy at claim,Delay declaration,Age client at claim
7,13,2010-04-09,2010-07-06,2010-02-19,2010-07-14,2011-01-14,790.02,131.67,1946-02-03,64,...,Waiting Period,Administrative,Claim refused,Yes,4,3,8935,49 days,88 days,23441 days
9,1,2010-01-22,2010-02-22,2010-01-11,2010-03-02,2011-03-24,6000.0,111.22,1946-08-18,63,...,Waiting Period,Administrative,Claim accepted after refusal,Yes,35,4,59206,11 days,31 days,23168 days
11,3,2009-11-16,2011-06-27,2009-07-21,2011-07-07,2012-01-11,3337.08,73.6,1946-09-28,62,...,Waiting Period,Administrative,Claim refused,Yes,35,6,65416,118 days,588 days,23060 days
12,13,2010-04-09,2010-05-18,2009-08-11,2010-05-20,2010-11-29,610.74,101.79,1947-02-21,62,...,Waiting Period,Administrative,Claim refused,Yes,3,7,8072,241 days,39 days,23058 days
14,13,2009-07-31,2010-07-27,2009-07-30,2010-08-03,2011-02-04,599.7,99.95,1947-09-11,61,...,Waiting Period,Administrative,Claim refused,Yes,15,9,6499,1 days,361 days,22604 days


In [25]:
seniority = (df['Claim Incident date'] -
             df['Initial coverage date']).rename('seniority')
time_to_declaration = (df['FE_Declaration_date'] -
                       df['Claim Incident date']).rename('declaration time')

In [27]:
df = pd.concat([df, seniority, time_to_declaration], axis=1)

In [32]:
timedelta_cols = ['Age policy at claim', 'Delay declaration', 'Age client at claim',
                  'seniority', 'declaration time']
for col in timedelta_cols:
    df.loc[:, col] = df.loc[:, col].dt.days

In [34]:
df.columns

Index(['Risk code', 'Claim Incident date', 'FE_Declaration_date',
       'Initial coverage date', 'First claim decision date',
       'Last claim decisión date', 'Insured amount',
       'Initial_Instalment_Amount', 'Policy Holder date of birth',
       'Age at signature', 'Sexo', 'Refused decision reason code',
       'Trad_Refusal_reason', 'Refusal_Category', 'Claim_Status_Level_0',
       'Refusal_Flag', 'Local Partner name categories',
       'Insured NIF categories', 'Claim Number categories',
       'Age policy at claim', 'Delay declaration', 'Age client at claim',
       'seniority', 'declaration time'],
      dtype='object')

# Make a model for numerical data

In [35]:
num_cols = ['Insured amount', 'Initial_Instalment_Amount',
            'Age at signature', 'Age policy at claim',
            'Delay declaration', 'Age client at claim',
            'seniority', 'declaration time']

In [38]:
from sklearn.model_selection import train_test_split
X = df.drop(columns='Refusal_Flag')
y = df['Refusal_Flag']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42
)

In [47]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

pipe = make_pipeline(
    StandardScaler(), RandomForestClassifier(n_estimators=100)
)

In [48]:
pipe.fit(X_train[num_cols], y_train)
y_pred = pipe.predict(X_test[num_cols])

In [49]:
balanced_accuracy_score(y_test, y_pred)

0.609351862302786

In [50]:
from sklearn.model_selection import cross_val_score

In [52]:
cross_val_score(pipe, X[num_cols], y, cv=3, n_jobs=-1,
                scoring='balanced_accuracy')

array([0.36224379, 0.43797341, 0.42971412])