# Replication of Recommender Systems for Insurance Marketing

Done by Luke Strassburg

In [19]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
import xgboost as xgb
import lightgbm as lgb

from sklearn.preprocessing import LabelEncoder
from utils.pandas_utils import group_categoricals_tail
import utils.pimpmatplotlib as pm
from utils.xgbextras import stopping_at
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    average_precision_score,
    fbeta_score
)
from xgboost.callback import EarlyStopping
from lightgbm.callback import early_stopping


### Table 1: Insurance Product Possession: Period One

In [23]:
# Where the feature configuration and the sample dataset live.
CONFIG_FOLDER = os.path.join("config")
CONFIG_FILE = "features.csv"
DATA_FOLDER = os.path.join("..", "sample_data")
DATA_FILE = "dataset.csv"
# Name of the config column that assigns each dataset column its role.
MODEL_TYPE = "model"

# Read the feature configuration; only empty strings are treated as missing.
source_config = os.path.join(CONFIG_FOLDER, CONFIG_FILE)
features = pd.read_csv(source_config, keep_default_na=False, na_values=[""])

# Split the configured columns by role.
role = features[MODEL_TYPE]
index = features.loc[role == "index", "column"].tolist()
predictors = features.loc[role == "predictor", "column"].tolist()
labels = features.loc[role == "label", "column"].tolist()

# Predictors that the config flags as categorical.
is_categorical_predictor = (features["categorical"] == 1) & (features["column"].isin(predictors))
categorical = features.loc[is_categorical_predictor, "column"].tolist()

# Only the index, label and SET columns are needed for this table.
source_file = os.path.join(DATA_FOLDER, DATA_FILE)
data = pd.read_csv(
    source_file,
    usecols=index + labels + ["SET"],
    sep=";",
    decimal=".",
    encoding="latin1",
    keep_default_na=False,
    na_values=[""],
)

# HAS_<product>_1 columns for products A-I and L.
period_one_products = ["HAS_{}_1".format(product) for product in "ABCDEFGHIL"]
term1 = data[period_one_products]

# Column-wise mean of each selected HAS_*_1 column.
col_means = term1.mean()

col_means

HAS_A_1    0.7810
HAS_B_1    0.2193
HAS_C_1    0.0110
HAS_D_1    0.2068
HAS_E_1    0.0193
HAS_F_1    0.0223
HAS_G_1    0.0243
HAS_H_1    0.0073
HAS_I_1    0.0204
HAS_L_1    0.0321
dtype: float64

### Table 2: Purchases Within Periods

### Table 11: Machine Learning Models

#### LightGBM

In [None]:
# Config
# Input locations (feature configuration and dataset) and output locations
# (trained models, predictions and plots).
CONFIG_FOLDER = os.path.join("config")
CONFIG_FILE = "features.csv"
DATA_FOLDER = os.path.join("..", "sample_data")
DATA_FILE = "dataset.csv"
MODELS_FOLDER = os.path.join("models")
RESULTS_FOLDER = os.path.join("results")

# MODEL_TYPE is the config column that holds each column's role;
# TAG is used in plot titles, model file names and prediction column names.
MODEL_TYPE = "model"
TAG = "lgb"

PREDICTIONS_FILE = "_".join([MODEL_TYPE, TAG, "pred.csv"])

SAVE = True

# lightgbm params
# use an optimisation method to find the best params
SEED = 17
LGB_PARAMS = {
    "objective": "binary",
    # The native LightGBM API names this parameter "metric"; "eval_metric"
    # belongs to the scikit-learn wrapper and is ignored (with a warning) by
    # lgb.train. "binary_logloss" is the metric the binary objective already
    # defaults to, so the early-stopping criterion is unchanged.
    "metric": "binary_logloss",
    "seed": SEED,
    "verbose": 0,
    "max_depth": 8,
    "num_leaves": 22,
    "min_data_in_leaf": 500,
    "colsample_bytree": 0.75,
    # NOTE(review): LightGBM appears to apply row subsampling only when
    # "bagging_freq" is > 0, so this value probably has no effect as written.
    # Left unchanged to keep the replicated results comparable - confirm.
    "subsample": 0.75,
    "learning_rate": 0.1,
}


if __name__ == "__main__":
    # Train one binary LightGBM model per label column listed in the feature
    # config, score it on the "test" rows, print metrics, draw the plots and
    # write all test-set predictions to a single CSV file.

    # Create the output folders up front so that a missing directory cannot
    # make save_model()/to_csv() fail after the models have been trained.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)
    if SAVE:
        os.makedirs(MODELS_FOLDER, exist_ok=True)

    pmp = pm.PimpPlot(save=SAVE, folder=os.path.join(RESULTS_FOLDER, "plots"))

    # Load config: each row assigns a dataset column its role.
    source_config = os.path.join(CONFIG_FOLDER, CONFIG_FILE)
    features = pd.read_csv(source_config, keep_default_na=False, na_values=[""])
    index = features.loc[features[MODEL_TYPE] == "index", "column"].tolist()
    predictors = features.loc[features[MODEL_TYPE] == "predictor", "column"].tolist()
    labels = features.loc[features[MODEL_TYPE] == "label", "column"].tolist()
    categorical = features.loc[
        (features["categorical"] == 1) & (features["column"].isin(predictors)),
        "column",
    ].tolist()

    # Load data (only the configured columns plus the SET split marker).
    source_file = os.path.join(DATA_FOLDER, DATA_FILE)
    data = pd.read_csv(source_file, usecols=index + labels + predictors + ["SET"],
                       sep=";", decimal=".", encoding="latin1",
                       keep_default_na=False, na_values=[""])

    # Preprocessing lightgbm
    # NOTE(review): the return value is ignored, so this helper presumably
    # modifies `data` in place - confirm against utils.pandas_utils.
    group_categoricals_tail(data, categorical)

    # Label encoder categorical variables (one fitted encoder kept per column).
    label_encoding = {}
    for col in categorical:
        unique_values = data[col].unique().tolist()
        label_encoding[col] = LabelEncoder()
        label_encoding[col].fit(sorted(unique_values))
        data[col] = label_encoding[col].transform(data[col].values)

    # Split the dataset: row positions of each subset, keyed by SET value.
    indexes = {
        set_name: np.where(data["SET"] == set_name)[0]
        for set_name in ("train", "valid", "test")
    }

    # Get only relevant features (everything that is not a label, index or SET).
    excluded = set(labels + index + ["SET"])
    lgb_features = [x for x in sorted(data.columns.tolist()) if x not in excluded]

    # np.where yields row positions, so select rows positionally (.iloc)
    # rather than by index label.
    feature_frame = data[lgb_features]
    d = {}
    for set_name, set_indexes in indexes.items():
        subset = feature_frame.iloc[set_indexes]
        if set_name == "test":
            # The test set is only used for prediction, so keep the raw array.
            d[set_name] = subset.values
        else:
            # free_raw_data=False keeps the raw data so the same Dataset can be
            # reused with a new label for every model.
            d[set_name] = lgb.Dataset(subset,
                                      feature_name=lgb_features,
                                      categorical_feature=categorical,
                                      free_raw_data=False)

    predictions = {}
    for label in labels:
        print("----------------------------", end="\n")
        print(label, end="\n")
        print("----------------------------", end="\n\n")

        print("Creating the Dataset...")
        label_values = data[label].values
        y_test = label_values[indexes["test"]]
        for set_name in ("train", "valid"):
            d[set_name].set_label(label_values[indexes[set_name]])
        print("Done!", end="\n\n")

        print("Training lightgbm...")
        # Stops once the validation metric has not improved for 5 rounds.
        bst = lgb.train(params=LGB_PARAMS,
                        train_set=d["train"],
                        num_boost_round=3000,
                        valid_sets=[d["valid"]],
                        callbacks=[early_stopping(stopping_rounds=5)])
        print("Done!", end="\n\n")

        print("Predictions and plots LGB...")
        pred_label = "{0}_PRED_{1}".format(label, TAG.upper())
        predictions[pred_label] = bst.predict(d["test"])

        print("Logloss: {}".format(log_loss(y_test, predictions[pred_label])), end="\n\n")

        print(predictions[pred_label])

        # Hard class predictions at the fixed 0.5 cut-off.
        threshold_preds = (predictions[pred_label] > 0.5).astype(int)

        # zero_division=0 returns the same 0.0 as the default but without an
        # UndefinedMetricWarning when a rare label gets no positive prediction.
        auc_score = roc_auc_score(y_test, predictions[pred_label])
        accuracy = accuracy_score(y_test, threshold_preds)
        precision = precision_score(y_test, threshold_preds, zero_division=0)
        recall = recall_score(y_test, threshold_preds, zero_division=0)
        pr_auc = average_precision_score(y_test, predictions[pred_label])
        f2_score = fbeta_score(y_test, threshold_preds, beta=2, zero_division=0)

        print(f"AUC {label}: {auc_score:.4f}")
        print(f"Accuracy {label}: {accuracy:.4f}")
        print(f"Precision {label}: {precision:.4f}")
        print(f"Recall {label}: {recall:.4f}")
        print(f"PR_AUC {label}: {pr_auc:.4f}")
        print(f"F2-Score {label}: {f2_score:.4f}")

        title = "_".join([TAG, label]) if TAG else label
        pmp.plot_roc(y_test, predictions[pred_label], title)
        pmp.plot_distributions(y_test, predictions[pred_label], title)
        # NOTE(review): this threshold is derived from the test labels
        # themselves and is only used for the confusion-matrix plot.
        threshold = pmp.find_threshold_max_f1(y_test, predictions[pred_label], title, N = 100)
        binary_predictions = np.where(predictions[pred_label] >= threshold, 1, 0)
        pmp.plot_confusion_matrix(y_test, binary_predictions, [0, 1], title)
        print("Done!", end="\n\n")

        if SAVE:
            print("Saving...")
            bst.save_model(os.path.join(MODELS_FOLDER, title + ".model"))
            print("Done!", end="\n\n")

    # One column of test-set probabilities per label.
    predictions = pd.DataFrame(predictions)
    predictions.to_csv(os.path.join(RESULTS_FOLDER, PREDICTIONS_FILE), sep=";", index=False)

#### XGBoost

In [None]:
# Config
# Input locations (feature configuration and dataset) and output locations
# (trained models, predictions and plots).
CONFIG_FOLDER = os.path.join("config")
CONFIG_FILE = "features.csv"
DATA_FOLDER = os.path.join("..", "sample_data")
DATA_FILE = "dataset.csv"
MODELS_FOLDER = os.path.join("models")
RESULTS_FOLDER = os.path.join("results")

# MODEL_TYPE is the config column that holds each column's role;
# TAG is used in plot titles, model file names and prediction column names.
MODEL_TYPE = "model"
TAG = "xgb"

PREDICTIONS_FILE = "_".join([MODEL_TYPE, TAG, "pred.csv"])

SAVE = True

# XGBoost params
# use an optimisation method to find the best params
SEED = 17
XGB_PARAMS = {
    "learning_rate": 0.1,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "seed": SEED,
    # XGBoost's logging parameter is "verbosity"; the former "verbose" key is
    # not a booster parameter and was only reported as unused.
    "verbosity": 0,
    "max_depth": 8,
    "min_child_weight": 3,
    "colsample_bytree": 0.75,
    "subsample": 0.75,
    "gamma": 5,
}


if __name__ == "__main__":
    # Train one binary XGBoost model per label column listed in the feature
    # config, score it on the "test" rows, print metrics, draw the plots and
    # write all test-set predictions to a single CSV file.

    # Create the output folders up front so that a missing directory cannot
    # make save_model()/to_csv() fail after the models have been trained.
    os.makedirs(RESULTS_FOLDER, exist_ok=True)
    if SAVE:
        os.makedirs(MODELS_FOLDER, exist_ok=True)

    pmp = pm.PimpPlot(save=SAVE, folder=os.path.join(RESULTS_FOLDER, "plots"))

    # Load config: each row assigns a dataset column its role.
    source_config = os.path.join(CONFIG_FOLDER, CONFIG_FILE)
    features = pd.read_csv(source_config, keep_default_na=False, na_values=[""])
    print(features.columns)
    index = features.loc[features[MODEL_TYPE] == "index", "column"].tolist()
    print(index)
    predictors = features.loc[features[MODEL_TYPE] == "predictor", "column"].tolist()
    labels = features.loc[features[MODEL_TYPE] == "label", "column"].tolist()
    categorical = features.loc[
        (features["categorical"] == 1) & (features["column"].isin(predictors)),
        "column",
    ].tolist()

    # Load data (only the configured columns plus the SET split marker).
    source_file = os.path.join(DATA_FOLDER, DATA_FILE)
    data = pd.read_csv(source_file, usecols=index + labels + predictors + ["SET"],
                       sep=";", decimal=".", encoding="latin1",
                       keep_default_na=False, na_values=[""])

    # Preprocessing XGBoost
    # NOTE(review): the return value is ignored, so this helper presumably
    # modifies `data` in place - confirm against utils.pandas_utils.
    group_categoricals_tail(data, categorical)
    # One-hot encode the categorical predictors.
    data = pd.get_dummies(data, columns=categorical).copy()

    # Split the dataset: row positions of each subset, keyed by SET value.
    indexes = {
        set_name: np.where(data["SET"] == set_name)[0]
        for set_name in ("train", "valid", "test")
    }

    # Get only relevant features (everything that is not a label, index or SET).
    excluded = set(labels + index + ["SET"])
    xgb_features = [x for x in sorted(data.columns.tolist()) if x not in excluded]

    # np.where yields row positions, so select rows positionally (.iloc)
    # rather than by index label. The DMatrix objects are built once and
    # relabelled for every model.
    feature_frame = data[xgb_features]
    d = {
        set_name: xgb.DMatrix(feature_frame.iloc[set_indexes])
        for set_name, set_indexes in indexes.items()
    }

    predictions = {}

    print(labels)
    for label in labels:
        print("----------------------------", end="\n")
        print(label, end="\n")
        print("----------------------------", end="\n\n")

        print("Creating the DMatrix...")
        label_values = data[label].values
        for set_name, set_indexes in indexes.items():
            d[set_name].set_label(label_values[set_indexes])
        y_test = d["test"].get_label()
        print("Done!", end="\n\n")

        print("Training XGB...")
        # Stops once the validation metric has not improved for 10 rounds.
        bst = xgb.train(params=XGB_PARAMS,
                        num_boost_round=3000,
                        dtrain=d["train"], evals=[(d["valid"], "val")],
                        callbacks=[EarlyStopping(rounds=10)])
        print("Done!", end="\n\n")

        print("Predictions and plots XGB...")
        pred_label = "{0}_PRED_{1}".format(label, TAG.upper())
        # xgb.train returns the model from the last round, and the native
        # Booster.predict uses every tree by default. Restrict prediction to
        # the best early-stopping iteration so the extra, non-improving rounds
        # do not influence the reported scores.
        predictions[pred_label] = bst.predict(
            d["test"], iteration_range=(0, bst.best_iteration + 1)
        )

        print("Logloss: {}".format(log_loss(y_test, predictions[pred_label])), end="\n\n")

        # Hard class predictions at the fixed 0.5 cut-off.
        threshold_preds = (predictions[pred_label] > 0.5).astype(int)

        # zero_division=0 returns the same 0.0 as the default but without an
        # UndefinedMetricWarning when a rare label gets no positive prediction.
        auc_score = roc_auc_score(y_test, predictions[pred_label])
        accuracy = accuracy_score(y_test, threshold_preds)
        precision = precision_score(y_test, threshold_preds, zero_division=0)
        recall = recall_score(y_test, threshold_preds, zero_division=0)
        pr_auc = average_precision_score(y_test, predictions[pred_label])
        f2_score = fbeta_score(y_test, threshold_preds, beta=2, zero_division=0)

        print(f"AUC {label}: {auc_score:.4f}")
        print(f"Accuracy {label}: {accuracy:.4f}")
        print(f"Precision {label}: {precision:.4f}")
        print(f"Recall {label}: {recall:.4f}")
        print(f"PR_AUC {label}: {pr_auc:.4f}")
        print(f"F2-Score {label}: {f2_score:.4f}")

        title = "_".join([TAG, label]) if TAG else label
        pmp.plot_roc(y_test, predictions[pred_label], title)
        pmp.plot_distributions(y_test, predictions[pred_label], title)
        # NOTE(review): this threshold is derived from the test labels
        # themselves and is only used for the confusion-matrix plot.
        threshold = pmp.find_threshold_max_f1(y_test, predictions[pred_label], title, N = 100)
        binary_predictions = np.where(predictions[pred_label] >= threshold, 1, 0)
        pmp.plot_confusion_matrix(y_test, binary_predictions, [0, 1], title)
        print("Done!", end="\n\n")

        if SAVE:
            print("Saving...")
            # NOTE(review): the booster is saved as returned by xgb.train, i.e.
            # presumably including the rounds after the best iteration.
            bst.save_model(os.path.join(MODELS_FOLDER, title + ".model"))
            print("Done!", end="\n\n")

    # One column of test-set probabilities per label.
    predictions = pd.DataFrame(predictions)
    predictions.to_csv(os.path.join(RESULTS_FOLDER, PREDICTIONS_FILE), sep=";", index=False)