## Read the dataset

In [None]:
import pandas as pd

# Load the cleaned claims table. The first CSV column becomes the index and
# the columns listed below are parsed as datetimes.
# NOTE(review): 'Last claim decisión date' contains an accented 'ó' -- confirm
# that it matches the CSV header exactly.
date_cols = ['FE_Declaration_date',
             'Claim Incident date',
             'Initial coverage date',
             'First claim decision date',
             'Last claim decisión date',
             'Policy Holder date of birth']
df = pd.read_csv('cleaner_dataframe.csv', index_col=0, parse_dates=date_cols)

# Convert the duration columns to timedeltas (presumably they are read back
# from the CSV as strings -- verify). Assign with ``df[col] = ...`` so the
# column is replaced together with its dtype: ``df.loc[:, col] = ...`` tries
# to write into the existing column in place on pandas >= 2.0, which can
# leave the original dtype and break the ``.dt`` accessor used later.
for col in ['Age policy at claim', 'Delay declaration', 'Age client at claim']:
    df[col] = pd.to_timedelta(df[col])
df.head()

## Add some new features based on the dates

In [None]:
# Time elapsed between the start of coverage and the incident.
seniority = df['Claim Incident date'].sub(
    df['Initial coverage date']
).rename('seniority')

# Time elapsed between the incident and its declaration.
time_to_declaration = df['FE_Declaration_date'].sub(
    df['Claim Incident date']
).rename('declaration time')

In [None]:
# Display the newly computed 'declaration time' series.
time_to_declaration

In [None]:
# Append the two derived series to the dataframe as new columns.
df = pd.concat(
    objs=[df, seniority, time_to_declaration],
    axis='columns',
)

In [None]:
# Preview the first rows, now including the two derived columns.
df.head()

## Convert timedelta into integer

In [None]:
# Duration columns to express as a whole number of days.
timedelta_cols = ['Age policy at claim', 'Delay declaration', 'Age client at claim',
                  'seniority', 'declaration time']
# Assign with ``df[col] = ...`` so each column is replaced together with its
# dtype. ``df.loc[:, col] = ...`` may instead try to write the integers into
# the existing timedelta column in place (pandas >= 2.0), leaving the column
# with an unexpected dtype.
for col in timedelta_cols:
    df[col] = df[col].dt.days

In [None]:
# Preview the first rows after converting the durations to days.
df.head()

In [None]:
# List every column name available in the dataframe.
df.columns

## Make a model for numerical data

### Create a machine-learning pipeline

First, we can define the numerical columns.

In [None]:
# Names of the numerical feature columns used by the first model.
num_cols = [
    'Insured amount',
    'Initial_Instalment_Amount',
    'Age at signature',
    'Age policy at claim',
    'Delay declaration',
    'Age client at claim',
    'seniority',
    'declaration time',
]

Select only the numerical columns, split into the data and target, and finally into a training and testing set.

In [None]:
from sklearn.model_selection import train_test_split
# Target vector, then the feature matrix restricted to the numerical columns.
y = df['Refusal_Flag']
X = df.drop(columns='Refusal_Flag').loc[:, num_cols]

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score

# Standardize the features, then fit a logistic regression on them.
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='lbfgs', max_iter=1000),
)

In [None]:
# Train on the training set and predict the held-out test set in one chain
# (``fit`` returns the fitted pipeline).
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Balanced accuracy of the numerical-only model on the test set.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

### Use cross-validation instead of single train-test split

Instead of relying on a single split, we can use cross-validation. For a classifier, passing an integer `cv` makes scikit-learn use stratified folds.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validation of the pipeline, scored with balanced accuracy and
# run on all available cores.
scores = cross_val_score(
    pipe,
    X[num_cols],
    y,
    scoring='balanced_accuracy',
    cv=3,
    n_jobs=-1,
)

In [None]:
# Report the mean and standard deviation of the cross-validation scores.
print(f"The balanced accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")

### Make a grid-search to optimize hyper-parameters

We might want to optimize the hyper-parameters of the classifier. To do so, we can use either a grid-search or a randomized-search.

In [None]:
# Print the name of every tunable parameter of the pipeline, one per line
# (iterating the parameter dict yields its keys).
for param in pipe.get_params():
    print(param)

Let's try to optimize the parameter `C` which is defined as `logisticregression__C`. Note that you can always use the method `get_params()` to find the name of the parameter to optimize.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the regularization parameter C of the logistic
# regression step.
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
# Rank the candidates with the same metric that is reported throughout the
# notebook. Without ``scoring`` the search falls back to the classifier's
# default score (plain accuracy), so the selected C would be optimized for a
# different metric than the one being evaluated.
grid_search = GridSearchCV(
    pipe, param_grid=param_grid, cv=5, scoring='balanced_accuracy'
)
grid_search.fit(X_train, y_train)
grid_search.best_params_

Now that `grid_search` has been fitted, it behaves like any other estimator, with its parameters fixed to the best ones found during `fit`.

In [None]:
# Predict the test set with the fitted search object and score the result.
pred = grid_search.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=pred)

We can also use the grid-search directly inside a cross-validation: the hyper-parameters are then re-tuned on the training part of each fold (nested cross-validation).

In [None]:
# 5-fold cross-validation of the whole grid-search, scored with balanced
# accuracy and run on all available cores.
scores = cross_val_score(
    grid_search,
    X,
    y,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Report the mean and standard deviation of the cross-validation scores.
print(f"The balanced accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")

Instead of using a grid-search, we could use a randomized search. In this case, we need to provide a distribution to sample from and specify how many candidates we want to try.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal

# Sample C from a reciprocal (log-uniform) distribution over [0.001, 10].
param_distributions = {
    'logisticregression__C': reciprocal(0.001, 10)
}
# ``scoring`` makes the search select C with the metric reported in this
# notebook rather than the classifier's default score (plain accuracy);
# ``random_state`` makes the 10 sampled candidates reproducible across runs.
random_search = RandomizedSearchCV(
    pipe, param_distributions=param_distributions,
    n_iter=10, cv=5, scoring='balanced_accuracy', random_state=42
)
# Assign to ``_`` so the notebook does not display the fitted estimator.
_ = random_search.fit(X_train, y_train)

We can check which parameter values have been tried:

In [None]:
# Tabulate the search results: one row per sampled candidate.
results = pd.DataFrame.from_dict(random_search.cv_results_)
results

In [None]:
# Show the best parameter setting found by the randomized search.
random_search.best_params_

## Make a pipeline using categorical columns

In [None]:
# Names of the categorical feature columns.
cat_cols = [
    'Risk code',
    'Sexo',
]

In [None]:
# Target vector, then the feature matrix restricted to the categorical columns.
y = df['Refusal_Flag']
X = df.drop(columns='Refusal_Flag').loc[:, cat_cols]

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categories (ignoring categories unseen during fit), then
# fit a logistic regression on the encoded features.
pipe = make_pipeline(
    OneHotEncoder(handle_unknown='ignore'),
    LogisticRegression(solver='lbfgs', max_iter=1000),
)
# Train and predict in one chain (``fit`` returns the fitted pipeline).
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [None]:
# Balanced accuracy of the categorical-only model on the test set.
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

We could apply the same methods as before (cross-validation and grid-search). Instead of repeating the same analysis, we will combine the categorical and numerical pipelines using a `ColumnTransformer`.

## Combine categorical and numerical data

Split into train-test data.

In [None]:
# Target vector, then every remaining column as candidate features.
y = df['Refusal_Flag']
X = df.drop('Refusal_Flag', axis='columns')

# Hold out a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Define the numerical pipeline of the preprocessor.

In [None]:
# Numerical feature columns, and the transformer applied to them.
num_cols = [
    'Insured amount',
    'Initial_Instalment_Amount',
    'Age at signature',
    'Age policy at claim',
    'Delay declaration',
    'Age client at claim',
    'seniority',
    'declaration time',
]
pipe_num = StandardScaler()

Define the categorical pipeline of the preprocessor.

In [None]:
# Categorical feature columns, and the transformer applied to them
# (categories unseen during fit are ignored at transform time).
cat_cols = [
    'Risk code',
    'Sexo',
]
pipe_cat = OneHotEncoder(handle_unknown='ignore')

Create a `ColumnTransformer` to dispatch the categorical columns to the categorical pipeline and the numerical columns to the numerical pipeline.

In [None]:
from sklearn.compose import make_column_transformer

# Route each group of columns to its own transformer.
preprocessor = make_column_transformer(
    (pipe_num, num_cols),
    (pipe_cat, cat_cols),
)

Finally, add a classifier after the preprocessor.

In [None]:
# Full model: column-wise preprocessing followed by a logistic regression.
model = make_pipeline(
    preprocessor,
    LogisticRegression(solver='lbfgs', max_iter=1000),
)

In [None]:
# Fit the combined model on the training set. The result is assigned to `_`
# so the notebook does not display the returned estimator.
_ = model.fit(X_train, y_train)

In [None]:
# Predict the test set with the combined model and score the result.
y_pred = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

You can use cross-validation as before:

In [None]:
# 5-fold cross-validation of the combined model, scored with balanced
# accuracy and run on all available cores.
scores = cross_val_score(
    model,
    X,
    y,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Report the mean and standard deviation of the cross-validation scores.
print(f"The balanced accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")

We can also run a hyper-parameter search (here, a randomized search) to tune some parameters of the combined model.

In [None]:
# Print the name of every tunable parameter of the combined model, one per
# line (iterating the parameter dict yields its keys).
for param in model.get_params():
    print(param)

In [None]:
# Randomized search over the combined model. ``param_distributions`` is the
# distribution for ``logisticregression__C`` defined earlier in the notebook.
# ``scoring`` makes the inner search optimize the metric that is reported for
# this model (instead of the default plain accuracy), ``cv=5`` matches the
# earlier searches explicitly, and ``random_state`` makes the sampled
# candidates reproducible across runs.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distributions,
    n_iter=10,
    cv=5,
    scoring='balanced_accuracy',
    random_state=42,
)

In [None]:
# 5-fold cross-validation of the whole randomized search, scored with
# balanced accuracy and run on all available cores.
scores = cross_val_score(
    random_search,
    X,
    y,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,
)

In [None]:
# Report the mean and standard deviation of the cross-validation scores.
print(f"The balanced accuracy is: {scores.mean():.3f} +/- {scores.std():.3f}")