Training process of Machine Learning Model

In [None]:
import pandas
import xgboost as xgb
import numpy
from calibration import calibrators, plots
from sklearn.model_selection import train_test_split

: 

Loading Datasets

In [None]:
# Read the four pre-processed CSV splits. The order of the paths below must
# match the order of the names on the left-hand side.
training_set, validation_set, testing_set, calibration_set = (
    pandas.read_csv(csv_path)
    for csv_path in (
        "../../data/processed_data/training_set.csv",
        "../../data/processed_data/validation_set.csv",
        "../../data/processed_data/testing_set.csv",
        "../../data/processed_data/calibration_set.csv",
    )
)

Splitting Calibration set

In [None]:
# Separate the calibration features from the `bad_client` label column.
X = calibration_set.drop(columns=['bad_client'])
Y = calibration_set['bad_client']

# Hold out 30% of the calibration rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=100, test_size=0.3)

# Re-attach the label column to each partition so both are full frames again.
train_calibration_set, test_calibration_set = (
    pandas.concat(parts, axis=1)
    for parts in ([x_train, y_train], [x_test, y_test])
)

Creating Baseline Model

In [None]:
# Baseline XGBoost classifier built without any explicit hyperparameters;
# it is the estimator handed to cross-validation and to the grid search below.
model = xgb.XGBClassifier()

In [None]:
from sklearn.metrics import roc_auc_score, make_scorer, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate 

Initial Comparison

In [None]:
# Features / label of the training split.
# NOTE: this rebinds x_train / y_train, which earlier held the calibration
# split; the calibration data stays available through train_calibration_set.
x_train = training_set.drop(columns=['bad_client'])
y_train = training_set['bad_client']

# Log loss is defined on predicted probabilities. A scorer built with
# make_scorer(log_loss, greater_is_better=False) is fed the hard labels from
# predict(), which makes the metric meaningless. The built-in 'neg_log_loss'
# scorer evaluates predict_proba() output and returns the negated loss
# (so that "higher is better" holds for model selection).
loss_function = 'neg_log_loss'

cv_results = cross_validate(
    estimator=model,
    X=x_train,
    y=y_train,
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=10),
    scoring=loss_function
)
# Mean held-out score over the 10 folds. This is the *negated* log loss
# measured on each fold's held-out part, not a training loss.
cv_score = numpy.mean(cv_results['test_score'])
# Flip the sign so the reported number is the actual (non-negative) log loss.
print('loss: %s' % -cv_score)


Fine Tuning the Model

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, log_loss

In [None]:
# Search space for the grid search. It is currently empty, so the search
# evaluates exactly one candidate: the estimator with its existing parameters.
hyperparams = {
}

# Features / label of the validation split used for the search.
X_validation = validation_set.drop(columns=['bad_client'])
Y_validation = validation_set['bad_client']

# Built-in scorer that computes log loss on predict_proba() output and returns
# it negated; a make_scorer(log_loss, ...) scorer would score hard labels.
loss_function = 'neg_log_loss'
tuned_model = GridSearchCV(
    estimator=model,
    param_grid=hyperparams,
    scoring=loss_function,
    n_jobs=-1,
    cv=StratifiedKFold(n_splits=5)
)
# Fitting model
# NOTE(review): the search, and the final refit that produces best_estimator_,
# only ever see the validation split — the training split is not used to fit
# the model that is saved later. Confirm this is intended.
tuned_model.fit(X_validation, Y_validation)

# A fitted GridSearchCV is not subscriptable; its results are exposed as
# attributes. best_estimator_ is the winning candidate refit on all of
# (X_validation, Y_validation); best_score_ is its mean cross-validated score.
chosen_model = tuned_model.best_estimator_
# best_score_ is the negated log loss, so flip the sign to report the loss.
loss = -tuned_model.best_score_
print('hyperparameter validation loss: %s' % loss)

In [None]:
# Feature matrix of the calibration-training partition (label column removed).
x_train_calibration = train_calibration_set.drop(columns=['bad_client'])

# Hard class labels predicted by the tuned classifier.
y_train_calibration = chosen_model.predict(x_train_calibration)

# Column 1 of predict_proba (presumably P(bad_client == 1) — confirm the class
# order), converted to a plain Python list.
train_calibration_proba_matrix = chosen_model.predict_proba(x_train_calibration)
y_train_calibration_proba = train_calibration_proba_matrix[:, 1].tolist()

Checking Calibration quality of the Model

In [None]:
# Plot the uncalibrated model's predictions on the calibration-training
# partition against the true `bad_client` labels.
# NOTE(review): y_pred receives the hard labels from predict(); a calibration
# (reliability) plot normally needs probabilities, i.e.
# y_train_calibration_proba — confirm what calibration_plot expects.
plots.calibration_plot(
    y_true=train_calibration_set['bad_client'],
    y_pred=y_train_calibration,
)

Creating Training Calibration Dataset

In [None]:
# Pair the classifier's column-1 probabilities on the calibration-training
# partition with the matching ground-truth labels.
calibration_train_labels = train_calibration_set['bad_client']
calibration_train_dataset = calibrators.CalibrationDataset(
    true_classes=calibration_train_labels,
    decision_scores=y_train_calibration_proba,
)

Training the Calibration Algorithm (Platt Scaling)

In [None]:
# Build the Platt-scaling calibrator and train it on the calibration-training
# dataset (classifier scores paired with true labels).
platt_scaler = calibrators.PlattScaling()
platt_scaler.train(train_dataset=calibration_train_dataset)

Creating Testing Calibration Dataset

In [None]:
# Score the calibration-test partition with the tuned classifier.
x_test_calibration = test_calibration_set.drop(columns=['bad_client'])
test_calibration_proba_matrix = chosen_model.predict_proba(x_test_calibration)

# Keep only column 1 of the probability matrix, as a plain Python list.
predicted_test_calibration_scores = test_calibration_proba_matrix[:, 1].tolist()

In [None]:
# Pair the held-out classifier scores with their ground-truth labels so the
# trained calibrator can be evaluated on data it has not seen.
calibration_test_labels = test_calibration_set['bad_client']
calibration_test_dataset = calibrators.CalibrationDataset(
    true_classes=calibration_test_labels,
    decision_scores=predicted_test_calibration_scores,
)

Evaluating the Calibration Algorithm

In [None]:
# Ground-truth labels of the calibration-test dataset.
true_classes = calibration_test_dataset.true_classes

# Map the held-out classifier scores through the fitted Platt scaler.
held_out_scores = calibration_test_dataset.decision_scores
predicted_probs = platt_scaler.get_calibrated_prob(decision_scores=held_out_scores)

# Threshold the calibrated probabilities at 0.5 to obtain 0/1 labels.
is_positive = predicted_probs >= 0.5
predicted_classes = is_positive.astype(numpy.int_)

# NOTE(review): the plot receives the thresholded labels rather than the
# calibrated probabilities — confirm that this is what calibration_plot expects.
plots.calibration_plot(y_pred=predicted_classes, y_true=true_classes)

Testing ML model on a Testing set

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Features / label of the held-out testing split.
X_test = testing_set.drop(columns=['bad_client'])
Y_test = testing_set['bad_client']

def eval_f1_weighted(y_true, y_pred):
    """
    Weighted-average F1 score of ``y_pred`` against ``y_true``.

    This is a plain metric with the signature ``(y_true, y_pred)``. It must be
    wrapped with ``make_scorer`` before being passed as ``scoring=`` to
    scikit-learn, which expects a scorer called as ``(estimator, X, y)``.
    """
    return f1_score(y_true, y_pred, average='weighted')

# NOTE(review): cross_validate clones `chosen_model` and refits it on folds of
# the testing split, so this measures models trained on test data rather than
# the already-fitted classifier. For a true hold-out score, evaluate
# eval_f1_weighted(Y_test, chosen_model.predict(X_test)) instead — confirm intent.
cv_results = cross_validate(
    estimator=chosen_model,
    X=X_test,
    y=Y_test,  # the keyword is lowercase `y`; `Y=` raises TypeError
    cv=StratifiedKFold(n_splits=5),
    scoring=make_scorer(eval_f1_weighted),  # wrap the metric into a scorer
    n_jobs=-1
)
print('F1 Score: %s' % numpy.mean(cv_results['test_score']))

Saving model

In [None]:
import pickle

In [None]:
# Serialize the tuned classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open('../models/classifier.pkl', mode='wb') as model_file:
    pickle.dump(chosen_model, model_file)

Saving Calibration Model

In [None]:
# Serialize the trained Platt calibrator. The context manager guarantees the
# file is flushed and closed even if pickling fails part-way through.
with open("../calibrators/platt_calibrator.pkl", mode='wb') as calibrator_file:
    pickle.dump(platt_scaler, calibrator_file)