In [None]:
#HANA connector
import hana_ml.dataframe as dataframe
from notebook_hana_connector.notebook_hana_connector import NotebookConnectionContext
#Python stuff
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn import tree
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
#from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import plot_importance
from matplotlib import pyplot
import imblearn
import numpy as np

In [None]:
# Open the notebook's HANA connection; the id names a connection configured
# outside this notebook, so no credentials appear here.
HANA_CONNECTION_ID = 'hanapoc'
conn = NotebookConnectionContext(connectionId=HANA_CONNECTION_ID)

In [None]:
# Display settings: show at most 30 rows, but never hide columns.
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", None)
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.set_option("mode.chained_assignment", None)

Get data

In [None]:
# Build the customer feature query.
# Adjacent string literals are concatenated by Python with nothing in between,
# so every fragment except the last must end with a space to keep SQL tokens apart.
sql = (
    "select "
    "CUSTOMER, ORDERS_FIRST_N_WKS_I, ORDERS_FIRST_N_WKS_R, KITCHENS_FIRST_N_WKS_I, KITCHENS_FIRST_N_WKS_R, CAST(NET_VALUE_FIRST_N_WKS as int) as NET_VALUE_FIRST_N_WKS, CAST(NET_VALUE_AVG_FIRST_N_WKS as int) as NET_VALUE_AVG_FIRST_N_WKS, DISTRICTS_FIRST_N_WKS, "
    # NOTE(review): no ELSE branch, so any CUST_CLASS other than Z0/Z1 yields NULL.
    # Confirm that cannot occur; the later float->int cast would fail on missing values.
    "CASE WHEN CUST_CLASS = 'Z0' THEN '0' WHEN CUST_CLASS = 'Z1' then '1' end as CUST_CLASS_INDC, "
    "CASE WHEN (KITCHENS_FIRST_N_WKS_I - KITCHENS_FIRST_N_WKS_R) > 0 THEN '1' ELSE '0' end as KITCHEN_INDC, "
    # Target: '1' when the customer has more orders in total than in the first N weeks.
    "CASE WHEN (ORDERS_TOTAL_I - ORDERS_FIRST_N_WKS_I) > 0 THEN '1' else '0' end as REPEATCUSTOMER, "
    "CASE WHEN ORDERS_FIRST_N_WKS = '1' THEN '1' else '0' end as SINGLEORDER_FIRST_N_WKS, "
    "CASE WHEN ID_CREDAT = FIRST_ORDER THEN '1' else '0' end as INSTANTSHOP, "
    "DAYS_BETWEEN(ID_CREDAT, FIRST_ORDER) as TIMETOSHOP "
    "from TBL_BWH_CUSTOMERSUMMARY_ORDATE "
    # Alternative filter (disabled):
    #"where NET_VALUE_TOTAL > '0' and ID_CREDAT > '2018' and ID_CREDAT < '2021-10-01' "
    "where ORDERS_FIRST_N_WKS > '0' and FIRST_ORDER > '2019' and FIRST_ORDER < '2020-10-10' "
    # Random sample of at most 10000 rows; with no seed, the rows presumably
    # differ from run to run, so downstream numbers are not exactly reproducible.
    "ORDER BY RAND() LIMIT 10000"
)

# Run the query through the HANA connection and pull the result to the client;
# `customers` is used below as a pandas DataFrame.
cust_sel = conn.sql(sql)
customers = cust_sel.collect()
customers

In [None]:
# Check the column dtypes of the collected frame before casting. The SQL builds
# the indicator/flag columns from quoted '0'/'1' literals, so they presumably
# arrive as strings (object dtype) — TODO confirm against the output.
customers.dtypes

Formatting: select the feature columns and cast the indicator columns to integers

In [None]:
# Explanatory (feature) columns used by the model.
cols = [
    'ORDERS_FIRST_N_WKS_I',
    'ORDERS_FIRST_N_WKS_R',
    'KITCHENS_FIRST_N_WKS_I',
    'KITCHENS_FIRST_N_WKS_R',
    'NET_VALUE_FIRST_N_WKS',
    'NET_VALUE_AVG_FIRST_N_WKS',
    'DISTRICTS_FIRST_N_WKS',
    'CUST_CLASS_INDC',
    'KITCHEN_INDC',
    'SINGLEORDER_FIRST_N_WKS',
    'INSTANTSHOP',
]

# Flag columns that the query emits as '0'/'1' literals; each is converted
# to float first and then to int.
indicator_cols = ['CUST_CLASS_INDC', 'KITCHEN_INDC', 'SINGLEORDER_FIRST_N_WKS', 'INSTANTSHOP']

# Build the feature frame in one expression: select the columns, then overwrite
# the flag columns with their integer versions (column order is unchanged).
expla = customers[cols].assign(
    **{name: customers[name].astype(float).astype(int) for name in indicator_cols}
)

# Dependent variable (target), cast the same way as the flag columns.
dep = customers['REPEATCUSTOMER'].astype(float).astype(int)

Split into 70% train, 30% test

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train_test_parts = train_test_split(
    expla,
    dep,
    test_size=0.3,  # 70% training and 30% test
    random_state=1,
)
X_train, X_test, y_train, y_test = train_test_parts

Train XGBoost classification model

In [None]:
# Early stopping needs a validation set to monitor. Carve one out of the
# training data so the test set stays untouched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

# XGBClassifier is reached through the already-imported `xgboost` module,
# because the direct `from xgboost import XGBClassifier` is commented out.
model = xgboost.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=300,
    early_stopping_rounds=50,  # stop when validation logloss has not improved for 50 rounds
    learning_rate=0.005,
    scale_pos_weight=0.3,  # NOTE(review): hand-picked class weight — confirm against the class ratio
    max_depth=4,
)
# `verbose=50` prints the validation metric every 50 boosting rounds. This is
# the sklearn-wrapper equivalent of `verbose_eval`, which is not a constructor
# parameter of XGBClassifier.
model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=50)

Calculate accuracy using test dataset

In [None]:
# Predict on the held-out test rows and round each prediction.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))
# Share of test rows whose rounded prediction matches the true label.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (100.0 * accuracy))

Draw up confusion matrix using test dataset

In [None]:
# Confusion matrix on the test set. Passing `labels` pins the row/column order
# to the model's class order, so the matrix always matches the display labels
# (even if a class is absent from y_test or y_pred).
confm = confusion_matrix(y_test, y_pred, labels=model.classes_)

# ConfusionMatrixDisplay is reached through the already-imported
# `sklearn.metrics` module, because it is not imported by name in this notebook.
cmd_obj = metrics.ConfusionMatrixDisplay(confm, display_labels=model.classes_)
cmd_obj.plot()
cmd_obj.ax_.set(
    title='Confusion Matrix',
    xlabel='Predicted',
    ylabel='Actual',
)
plt.show()

Plot feature importance of trained model

In [None]:
# Plot the trained model's feature importances using xgboost's plot_importance helper.
plot_importance(model)