In [None]:
from collections import Counter
import numpy as np
import pandas as pd

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
import dataframe_image as dfi

# Preprocessing and Model Selection
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report

In [None]:
# Models to be tested
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Set the list of models to compete

# Fixed seed handed to every estimator that accepts one, so that a fresh
# "Restart & Run All" reproduces the same cross-validation scores.
MODEL_SEED = 42

# Display name -> unfitted estimator. Insertion order matters: models are
# cross-validated in this order and the plotting cell looks positions up in it.
models = {
    "Logistic Reg.": LogisticRegression(),
    "XGBoost": XGBClassifier(random_state=MODEL_SEED),
    "LightGBM": LGBMClassifier(random_state=MODEL_SEED),
    "CatBoost": CatBoostClassifier(logging_level="Silent", random_seed=MODEL_SEED),
    # "Neural Network": MLPClassifier(),
    "Random Forest": RandomForestClassifier(random_state=MODEL_SEED),
    # "SVM": SVC(class_weight="balanced", probability=True),
    "K-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_SEED),
}

In [None]:
# Directory holding the already-split, preprocessed CSV files.
preprocessed_path = "../data/preprocessed/"

train_df = pd.read_csv(preprocessed_path + "train.csv")
test_df = pd.read_csv(preprocessed_path + "test.csv")
val_df = pd.read_csv(preprocessed_path + "validate.csv")


# Kept as a one-element list so the same object works for column selection
# and for drop().
target = ["TX_FRAUD"]

# Selecting with a list yields an (n, 1) column vector; flatten it to the 1-D
# label array scikit-learn expects (avoids a DataConversionWarning on each fit).
y_train = train_df[target].values.ravel()
y_val = val_df[target].values.ravel()
y_test = test_df[target].values.ravel()

# Feature matrices: every column except the target.
X_train = train_df.drop(target, axis=1)
X_val = val_df.drop(target, axis=1)
X_test = test_df.drop(target, axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature scaler; `scaler` is what the pipeline cell plugs in as the
# "preprocessor" step of every model pipeline.
# NOTE(review): `scaler` is assigned a fresh StandardScaler again in the
# pipeline-construction cell, so this assignment looks redundant — confirm
# before removing.
scaler = StandardScaler()

In [None]:
# Data Augmentation ##########################################################
# It is well reported that the dataset is imbalanced. SMOTE is used to increase
# the number of fraud cases in the training set, followed by random
# under-sampling of the majority class.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbpipeline

# Both resamplers are stochastic; seed them so reruns give the same training set.
RESAMPLING_SEED = 42

# define the resampling steps
# SMOTE: synthesise minority samples until minority/majority == 0.3.
over = SMOTE(sampling_strategy=0.3, random_state=RESAMPLING_SEED)
# Under-sampler: drop majority samples until minority/majority == 1 (balanced).
under = RandomUnderSampler(sampling_strategy=1, random_state=RESAMPLING_SEED)

In [None]:
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate

# Create pipelines for each model

# Label of this experiment; reused in figure titles and output file names.
tag = "balanced"
pipelines = {}

scaler = StandardScaler()

for model_name, model in models.items():
    # Each pipeline gets its own unfitted copies of the scaler and samplers, so
    # fitting one pipeline can never overwrite state held by another one.
    pipeline = imbpipeline(
        [
            ("preprocessor", clone(scaler)),
            ("oversampling", clone(over)),
            ("undersampling", clone(under)),
            ("classifier", model),
        ]
    )

    pipelines[model_name] = pipeline

# Short metric name -> scikit-learn scorer; cross_validate reports each one
# under the key "test_<name>".
scoring = {
    "acc": "accuracy",
    "f1": "f1",
    "recall": "recall",
    "precision": "precision",
    "auc": "roc_auc",
}
scores = {}

# Perform cross-validation and compare F1 scores
for model_name, pipeline in pipelines.items():
    print(f"########## {model_name} ##########")

    # np.ravel guarantees a 1-D label vector even if y_train was loaded as an
    # (n, 1) column, which would otherwise trigger conversion warnings.
    scores[model_name] = cross_validate(
        pipeline, X_train, np.ravel(y_train), cv=5, scoring=scoring
    )
    f1_scores = scores[model_name]["test_f1"]
    mean_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    # Report the fold-to-fold spread alongside the mean so close models can be
    # told apart from noisy ones.
    print(f"{model_name} - Mean F1 score: {mean_f1:.4f} (std: {std_f1:.4f})")

In [None]:
# Format the scoring so it can be plotted
# Long format: one row per (model, CV fold), with the model identified by the
# "model_name" column. All per-model frames are concatenated in a single call
# instead of growing an empty frame inside a loop, which keeps the score
# columns numeric and avoids repeated copies.
metric_frames = [
    pd.DataFrame(cv_result).assign(model_name=name)
    for name, cv_result in scores.items()
]

if metric_frames:
    # ignore_index gives every row a unique label (folds would otherwise repeat
    # the labels 0..k-1 once per model).
    metrics = pd.concat(metric_frames, ignore_index=True)
else:
    # Nothing was cross-validated: keep an empty frame with the expected
    # columns so downstream cells fail on missing data, not missing columns.
    metrics = pd.DataFrame(
        columns=[
            "fit_time",
            "score_time",
            "test_acc",
            "test_f1",
            "test_recall",
            "test_precision",
            "test_auc",
            "model_name",
        ]
    )

In [None]:
# Plot competition results

# Column in `metrics` -> label shown on the x axis of its panel.
metric_labels = {
    "test_f1": "F1",
    "test_precision": "Precision",
    "test_recall": "Recall",
    "test_acc": "Acc",
}

# Explicit category order (first appearance in `metrics`). It is passed to
# seaborn AND used to place the highlight marker, so both always agree on which
# y position belongs to which model.
model_order = metrics["model_name"].drop_duplicates().tolist()

fig, ax = plt.subplots(1, len(metric_labels), sharey=True)

for i, (metric, xlabel) in enumerate(metric_labels.items()):
    sns.pointplot(
        data=metrics,
        x=metric,
        y="model_name",
        order=model_order,
        linestyle="none",
        ax=ax[i],
    )
    ax[i].set_ylabel(None)
    ax[i].set_xlabel(xlabel)
    ax[i].set_xlim([0, 1])

    # Mean score per model across folds, best model first.
    ranking = metrics.groupby("model_name")[metric].mean().sort_values(ascending=False)

    # Mark the best model for this metric with a large red dot.
    ax[i].scatter(
        ranking.values[0],
        model_order.index(ranking.index[0]),
        color="r",
        marker="o",
        s=150,
    )

fig.suptitle(f"Model Selection - {tag} dataset")
plt.tight_layout()
fig.savefig(f"../figures/model_selection_{tag}.png")

In [None]:
# Average every score column over the CV folds of each model, then order the
# models from best to worst mean F1.
mean_scores_by_model = metrics.groupby("model_name").mean()
f1_ranking = mean_scores_by_model.sort_values(by="test_f1", ascending=False)

# The first row of the ranking is the model with the highest mean F1.
f1_winner = f1_ranking.index[0]

f1_ranking

In [None]:
# Save the F1 ranking table as an image, named after the current experiment tag.
f1_ranking_png = f"../figures/f1_ranking_{tag}.png"
dfi.export(f1_ranking, f1_ranking_png, table_conversion="matplotlib")

In [None]:
# Refit the pipeline with the best mean F1 on the full training set and report
# its per-class metrics on the test set.
best_pipeline = pipelines[f1_winner]
best_pipeline.fit(X_train, y_train)

y_pred = best_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Export the test-set classification report of the selected model as an image.
# The fitted winner is looked up explicitly: the bare name `pipeline` is only
# the leftover loop variable from the cross-validation cell (the last pipeline
# in `pipelines`), which is either unfitted or not the selected model.
y_pred = pipelines[f1_winner].predict(X_test)
report = pd.DataFrame(
    classification_report(y_test, y_pred, output_dict=True)
).transpose()
# NOTE(review): the output name is hard-coded to "catBoost" even though the
# winner is chosen dynamically — confirm whether it should follow `f1_winner`.
dfi.export(report, "../figures/default.catBoost.png", table_conversion="matplotlib")

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Two side-by-side panels sharing the y axis: raw counts on the left,
# row-normalised proportions on the right.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=(5, 3))
counts_ax, proportions_ax = ax

# Left panel: absolute number of test samples per (true, predicted) pair.
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, ax=counts_ax, colorbar=False
)
counts_ax.set_title("Counts")

# Right panel: each row divided by its total (normalize="true"), with the
# colour scale pinned to [0, 1].
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    normalize="true",
    values_format=".2f",
    ax=proportions_ax,
    im_kw=dict(vmin=0, vmax=1),
    colorbar=False,
)
proportions_ax.set_title("Proportions")
proportions_ax.set_ylabel(None)  # the shared y label is already on the left panel

fig.suptitle("Confusion Matrix")
fig.savefig("../figures/default.catBoost.cm.png")