In [None]:
import catboost as cb
import numpy as np
import optuna
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import brier_score_loss
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Opt in to pandas' future string-dtype inference before any data is read.
pd.options.future.infer_string = True

# Load the training data, forcing the listed columns to the string dtype
# (presumably so that code-like values are not parsed as numbers -- confirm
# against the data dictionary).
df = pd.read_csv(
    '../data/train.csv',
    dtype={
        col: 'str'
        for col in (
            'attendance_category',
            'stated_gender',
            'treatment_function_code',
            'palliative_care_description',
        )
    },
)

## EDA

In [None]:
# (rows, columns) of the loaded training frame
df.shape

In [None]:
# preview the first rows of the training frame
df.head()

In [None]:
# per-column dtype and non-null count summary
df.info()

In [None]:
# number of missing values in each column
df.isna().sum()

In [None]:
# number of distinct values in each column
df.nunique()

In [None]:
# list every column name in the training frame
df.columns

## Preprocessing

In [None]:
# Columns used as model inputs, in the order they are passed to the models.
feats = [
    'organisation_code_provider',
    'age_at_arrival',
    'index_of_multiple_deprivation',
    'stated_gender',
    'arrival_mode_desc',
    'attendance_category',
    'long_term_condition_count_number',
    'gp_practice_code',
    'care_home_status',
    'living_alone',
    'disability_count_number',
    'segmentation_bridges_to_health',
]

# Kept as a one-element list so that `feats + target` concatenates cleanly and
# `df[target]` selects a one-column DataFrame.
target = ['frequent_attender']

In [None]:
# all columns the modelling steps rely on: the features followed by the target
feats + target

In [None]:
# Keep only rows where every model feature is present: any row with a missing
# value in one of the `feats` columns is dropped. The target column is not
# part of the subset. NOTE: this rebinds `df`, so re-running the EDA cells
# after this one shows the filtered frame, not the raw file.
df = df.dropna(subset=feats)

In [None]:
# Feature matrix and target; `target` is a list, so `y` is a one-column DataFrame.
X, y = df[feats], df[target]

# Hold out 30% of the rows for testing; the fixed seed keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# (rows, columns) of the training partition produced by the split above
X_train.shape

## Baseline Models

In [None]:
# Naive baseline: every row is assigned the class proportions observed in the
# training labels (DummyClassifier's "prior" strategy, spelled out explicitly).

clf = DummyClassifier(strategy="prior")
# The dummy model ignores feature values, but it still expects a feature
# matrix; string columns are excluded as in the original fit call.
clf.fit(X_train.select_dtypes(exclude='str'), np.ravel(y_train))
# Score on the test *features* (the labels were previously passed here, which
# only worked because the dummy model looks at nothing but the row count).
dummy_probs = clf.predict_proba(X_test.select_dtypes(exclude='str'))[:, 1]
dummy_bsl = brier_score_loss(np.ravel(y_test), dummy_probs)

print(f"Brier Score: {dummy_bsl:.3f}")

In [None]:
# Simple logistic regression baseline: categorical columns are one-hot encoded
# and numeric columns are standardised before fitting.
# (Pipeline, ColumnTransformer, OneHotEncoder and StandardScaler are imported
# in the notebook's top import cell.)

# `cat_feats` is reused by the CatBoost cells further down to mark categorical columns.
cat_feats = ['organisation_code_provider', 'stated_gender', 'arrival_mode_desc', 
            'gp_practice_code', 'living_alone', 'attendance_category', 'care_home_status']
num_feats = ['age_at_arrival', 'index_of_multiple_deprivation', 'long_term_condition_count_number', 
            'segmentation_bridges_to_health', 'disability_count_number']

# handle_unknown='ignore': categories unseen during fit are encoded as all
# zeros at predict time instead of raising.
categorical_transformer = Pipeline(steps=[("encoder", OneHotEncoder(handle_unknown='ignore'))])
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

col_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_feats),
        ("cat", categorical_transformer, cat_feats),
    ]
)

preprocessor = Pipeline(steps=[("col_transformer", col_transformer)])

# The final step is a classifier, so it is named accordingly.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), 
           ("classifier", LogisticRegression(max_iter=1000))]
)

# `y_train` is a one-column DataFrame; flatten it to the 1-D shape the
# estimator expects.
clf.fit(X_train, np.ravel(y_train))

# Probability of the positive class (second column of predict_proba).
log_probs = clf.predict_proba(X_test)[:, 1]
log_bsl = brier_score_loss(np.ravel(y_test), log_probs)
print(f"Brier Score: {log_bsl:.3f}")

## CatBoost

In [None]:
# simple model (no folds) -- superseded by the cross-validated `objective` defined in the next cell

# def objective(trial):

#     params = {
#         'iterations': 500,
#         'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
#         "depth": trial.suggest_int("depth", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.05, 1.0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
#     }

#     clf = cb.CatBoostClassifier(**params, silent=True, cat_features=cat_feats)
#     clf.fit(X_train, y_train)
#     probs = clf.predict_proba(X_test)
#     bsl = brier_score_loss(y_test, probs)

#     return bsl

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)

In [None]:
def objective(trial):
    """Return the mean cross-validated Brier score for one Optuna trial.

    A CatBoost classifier is fitted on each of 5 folds of the *training*
    partition (``X_train`` / ``y_train``) with hyperparameters suggested by
    ``trial``. The hold-out rows in ``X_test`` are deliberately not used here,
    so the later evaluation on them is not contaminated by tuning.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample ``learning_rate``, ``depth``,
        ``subsample``, ``colsample_bylevel`` and ``min_data_in_leaf``.

    Returns
    -------
    float
        Mean Brier score across the folds (lower is better).
    """
    n_splits = 5

    cv_scores = np.zeros(n_splits)

    # Fixed seed: every trial is scored on the same folds, so differences in
    # score come from the hyperparameters and not from the split.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    params = {
        # Fixed, not suggested -- therefore NOT part of study.best_params.
        'iterations': 150,
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.3, log=True),
        'depth': trial.suggest_int('depth', 5, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 0.8),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
    }

    clf = cb.CatBoostClassifier(**params, silent=True)

    # train and validate the model on each fold
    for idx, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
        # train/val split on folds
        dtrain = cb.Pool(X_train.iloc[train_idx], y_train.iloc[train_idx], cat_features=cat_feats)
        dval = cb.Pool(X_train.iloc[val_idx], y_train.iloc[val_idx], cat_features=cat_feats)

        # Fit with early stopping after 10 rounds without improvement.
        # NOTE: the fold used for early stopping is also the fold that is
        # scored below, so fold scores are slightly optimistic.
        clf.fit(
            dtrain,
            eval_set=dval,
            early_stopping_rounds=10,
        )

        # positive-class probabilities on the validation fold
        probs = clf.predict_proba(dval)[:, 1]

        # Brier score for this fold
        cv_scores[idx] = brier_score_loss(dval.get_label(), probs)

    return cv_scores.mean()

# Seeded sampler so the sequence of suggested hyperparameters is reproducible.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

In [None]:
# Report the winning trial: its sampled hyperparameters and its objective value.
print(f"Best Hyperparameters: {study.best_params}")
print("Best Brier Score: {:.3f}".format(study.best_value))

## Evaluation

In [None]:
# Brier skill score reference: the Brier score of a constant forecast equal to
# the observed positive rate in the test labels.
y_test_arr = np.ravel(y_test)
mean_prob = float(y_test_arr.mean())
# Build the constant forecast as a float array. np.full_like(y_test, ...)
# would inherit the labels' integer/bool dtype and truncate the rate
# (e.g. 0.2 -> 0), giving a wrong reference score.
ref_bsl = brier_score_loss(y_test_arr, np.full(y_test_arr.shape, mean_prob, dtype=float))

In [None]:
# Refit CatBoost on the whole training partition with the best hyperparameters
# found by the study, then score it on the hold-out partition.
best_params = study.best_params

dtrain = cb.Pool(X_train, y_train, cat_features=cat_feats)
dtest = cb.Pool(X_test, y_test, cat_features=cat_feats)

# `iterations` is a fixed entry in the objective's params rather than a
# suggested one, so it is not included in study.best_params. Restate it here;
# otherwise CatBoost falls back to its default iteration count and the final
# model no longer matches the configuration that was tuned.
clf = cb.CatBoostClassifier(**best_params, iterations=150, silent=True, random_state=42)

clf.fit(dtrain)

# positive-class probabilities on the hold-out partition
cb_probs = clf.predict_proba(dtest)[:, 1]
cb_bsl = brier_score_loss(dtest.get_label(), cb_probs)
# Skill relative to the constant-forecast reference `ref_bsl` computed in the
# previous cell: 1 is perfect, 0 means no better than the reference.
cb_bss = 1 - (cb_bsl/ref_bsl)

print(f"Brier Score: {cb_bsl:.3f}")
print(f"Reference Brier Score: {ref_bsl:.3f}")
print(f"Brier Skill Score: {cb_bss:.3f}")

In [None]:
# feature importances of the fitted CatBoost model, requested in prettified (table) form
clf.get_feature_importance(prettified=True)

## Model Score

In [None]:
# Load the scoring data. The path uses the same `../data` directory as the
# training file read at the top of the notebook: both reads run from the same
# working directory, so 'data/test.csv' could not resolve alongside
# '../data/train.csv'.
# NOTE(review): confirm the file really lives next to train.csv.
test_df = pd.read_csv(
    '../data/test.csv',
    dtype={
        'attendance_category': 'str',
        'stated_gender': 'str',
    },
)

In [None]:
# Score the final model on the separate test file.
test_labels = test_df['frequent_attender'].to_numpy()

# positive-class probabilities for every test row
test_probs = clf.predict_proba(test_df[feats])[:, 1]

# Reference forecast: the observed positive rate, repeated for every row.
# It must be a float array -- np.full_like(labels, rate) would inherit the
# labels' integer/bool dtype and truncate the rate (e.g. 0.2 -> 0).
test_mean_prob = float(test_labels.mean())
test_ref_bsl = brier_score_loss(test_labels, np.full(test_labels.shape, test_mean_prob, dtype=float))

test_bsl = brier_score_loss(test_labels, test_probs)
# Skill relative to the constant reference: 1 is perfect, 0 is no better than the reference.
test_bss = 1 - (test_bsl/test_ref_bsl)

print(f"Brier Score: {test_bsl:.3f}")
print(f"Reference Brier Score: {test_ref_bsl:.3f}")
print(f"Brier Skill Score: {test_bss:.3f}")