In [1]:
# !pip install imblearn

In [2]:
# !pip install lazypredict

In [3]:
import numpy as np
import pandas as pd
import gc
from datetime import datetime
import pickle
import os

In [4]:
import pyodbc

In [6]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from lazypredict.Supervised import LazyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, roc_curve,\
                            precision_recall_fscore_support, precision_score, recall_score

In [7]:
# from google.colab import drive
# drive.mount('/content/gdrive', force_remount=True)

In [8]:
# %cd '/content/gdrive/My Drive/CARD CHURN/'

In [9]:
# Cross validation
def Average_Score_Class1(model, X, y, size=0.2, cv=5, threshold=0.5):
    precision_class1=[]
    recall_class1=[]

    for n in range(1, cv+1):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size)

        df_predict = pd.DataFrame(model.predict_proba(X_test))
        df_predict.columns = ['proba_0','proba_1']

        i = threshold
        df_predict['predict'] = 0
        df_predict.loc[(df_predict['proba_1']>=i),['predict']] = 1
        df_predict['true_value'] = y_test

        precision_class1.append(precision_score(y_test, df_predict['predict'], pos_label=1))
        recall_class1.append(recall_score(y_test, df_predict['predict'], pos_label=1))

    return precision_class1, recall_class1

In [10]:
def Evaluation_Each_Class(model, X, y, list_of_thresholds, list_of_classes):

  threshold_list = []
  class_list = []
  precision_list = []
  recall_list = []
  f1_score_list = []

  for threshold in list_of_thresholds:
    for class_ in list_of_classes:
      threshold_list.append(threshold)
      class_list.append(class_)

      df_predict = pd.DataFrame(model.predict_proba(X))
      df_predict.columns = ['proba_0','proba_1']
      df_predict['true_value'] = y.values

      i = threshold
      df_predict['predict'] = 0
      df_predict.loc[(df_predict['proba_1']>=i),['predict']] = 1

      precision_list.append(precision_score(df_predict['true_value'], df_predict['predict'], pos_label=class_))
      recall_list.append(recall_score(df_predict['true_value'], df_predict['predict'], pos_label=class_))
      f1_score_list.append(f1_score(df_predict['true_value'], df_predict['predict'], pos_label=class_))

  df_evaluation = pd.DataFrame({'threshold':threshold_list,
                                'class':class_list,
                                'precision':precision_list,
                                'recall':recall_list,
                                'f1_score':f1_score_list})
  return df_evaluation

In [12]:
# Define period of sample data
# Feature data cover from 202312 to 202403
# Output data cover from 202402 tp 202405
list_month = [202312, 202401, 202402, 202403]

# Read Data

In [13]:
# Save the sample data
X = pd.read_csv('F:/PROJECT/CASA_Churn/X_202402_202405_2lead.csv')

In [14]:
# Save the sample data
y = pd.read_csv('F:/PROJECT/CASA_Churn/Y_202402_202405_2lead.csv')

# Build Model

## Train-Test Split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   test_size=0.2,
                                                   random_state=42)

In [16]:
y_train.value_counts()

Y
0    1144825
1      53732
dtype: int64

In [17]:
y_test.value_counts()

Y
0    286212
1     13428
dtype: int64

In [18]:
# Fix Imbalance problem
rus = RandomUnderSampler(random_state=0)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [19]:
y_train_rus.value_counts()

Y
0    53732
1    53732
dtype: int64

In [20]:
del X_train
del y_train
gc.collect()

42

## LazyPredict

In [None]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

In [None]:
models, predictions = clf.fit(X_train_rus, X_test, y_train_rus, y_test)

In [None]:
models

Notes:
- Choose LGBMClassifier, RandomForestClassifier, XGBClassifier because of the high accuracy in prediction.

## LGBMClassifier

In [21]:
import lightgbm as lgb

### Build Model

In [22]:
# Train model
model_lgbm = lgb.LGBMClassifier()
model_lgbm.fit(X_train_rus, y_train_rus)

LGBMClassifier()

In [23]:
# Save the model to disk
filename = 'CASAChurn_LGBMmodel_2lead.sav'
pickle.dump(model_lgbm, open(filename, 'wb'))

In [24]:
# load the model from disk
model_lgbm = pickle.load(open(filename, 'rb'))

### Evaluation

In [25]:
# Scores for each class and each threshold
model = model_lgbm
thresholds = [0.3,0.5,0.6,0.7,0.8,0.9]
classes = [0,1]
df_evaluation = Evaluation_Each_Class(model, X_test, y_test, thresholds, classes)
df_evaluation

Unnamed: 0,threshold,class,precision,recall,f1_score
0,0.3,0,0.99,0.81,0.89
1,0.3,1,0.17,0.84,0.28
2,0.5,0,0.99,0.96,0.97
3,0.5,1,0.47,0.75,0.58
4,0.6,0,0.99,0.98,0.98
5,0.6,1,0.62,0.73,0.67
6,0.7,0,0.99,0.99,0.99
7,0.7,1,0.72,0.71,0.72
8,0.8,0,0.99,0.99,0.99
9,0.8,1,0.77,0.71,0.74


In [26]:
# Confusion matrix and Classification report for each threshold
model = model_lgbm
df_predict = pd.DataFrame(model.predict_proba(X_test)) # predict probabilities
df_predict.columns = ['proba_0','proba_1']
# Find which threshold will perform the best recall and precision for class 1 (churn)
for i in [0.3,0.5,0.6,0.7,0.8,0.9]:
    df_predict['predict'] = 0
    df_predict.loc[(df_predict['proba_1']>=i),['predict']] = 1
    print('-'*55)
    print('threshold = ' + str(i))
    print(confusion_matrix(y_test, df_predict['predict']))
    print(classification_report(y_test, df_predict['predict']))

-------------------------------------------------------
threshold = 0.3
[[230969  55243]
 [  2120  11308]]
              precision    recall  f1-score   support

           0       0.99      0.81      0.89    286212
           1       0.17      0.84      0.28     13428

    accuracy                           0.81    299640
   macro avg       0.58      0.82      0.59    299640
weighted avg       0.95      0.81      0.86    299640

-------------------------------------------------------
threshold = 0.5
[[274831  11381]
 [  3362  10066]]
              precision    recall  f1-score   support

           0       0.99      0.96      0.97    286212
           1       0.47      0.75      0.58     13428

    accuracy                           0.95    299640
   macro avg       0.73      0.85      0.78    299640
weighted avg       0.96      0.95      0.96    299640

-------------------------------------------------------
threshold = 0.6
[[280260   5952]
 [  3650   9778]]
              precision  

## RandomForest with balanced data

### Build Model

In [27]:
# Train model
model_rf = RandomForestClassifier(n_estimators=200, max_depth=50, class_weight='balanced', n_jobs=-1)
model_rf.fit(X_train_rus, y_train_rus)

RandomForestClassifier(class_weight='balanced', max_depth=50, n_estimators=200,
                       n_jobs=-1)

In [28]:
# Save the model to disk
filename = 'CASAChurn_RFmodel_2lead.sav'
pickle.dump(model_rf, open(filename, 'wb'))

In [29]:
# load the model from disk
model_rf = pickle.load(open(filename, 'rb'))

### Evaluation

In [30]:
# Scores for each class and each threshold
model = model_rf
thresholds = [0.3,0.5,0.6,0.7,0.8,0.9]
classes = [0,1]
df_evaluation = Evaluation_Each_Class(model, X_test, y_test, thresholds, classes) # Evaluation_Each_Class is an user-defined function
df_evaluation

Unnamed: 0,threshold,class,precision,recall,f1_score
0,0.3,0,0.99,0.71,0.83
1,0.3,1,0.12,0.87,0.22
2,0.5,0,0.99,0.94,0.96
3,0.5,1,0.37,0.76,0.5
4,0.6,0,0.99,0.97,0.98
5,0.6,1,0.54,0.73,0.62
6,0.7,0,0.99,0.99,0.99
7,0.7,1,0.69,0.71,0.7
8,0.8,0,0.99,0.99,0.99
9,0.8,1,0.77,0.7,0.73


In [31]:
# Confusion matrix and Classification report for each threshold
model = model_rf
df_predict = pd.DataFrame(model.predict_proba(X_test)) # predict probabilities
df_predict.columns = ['proba_0','proba_1']
# Find which threshold will perform the best recall and precision for class 1 (churn)
for i in [0.3,0.5,0.6,0.7,0.8,0.9]:
    df_predict['predict'] = 0
    df_predict.loc[(df_predict['proba_1']>=i),['predict']] = 1
    print('-'*55)
    print('threshold = ' + str(i))
    print(confusion_matrix(y_test, df_predict['predict']))
    print(classification_report(y_test, df_predict['predict']))

-------------------------------------------------------
threshold = 0.3
[[204590  81622]
 [  1796  11632]]
              precision    recall  f1-score   support

           0       0.99      0.71      0.83    286212
           1       0.12      0.87      0.22     13428

    accuracy                           0.72    299640
   macro avg       0.56      0.79      0.52    299640
weighted avg       0.95      0.72      0.80    299640

-------------------------------------------------------
threshold = 0.5
[[268893  17319]
 [  3286  10142]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96    286212
           1       0.37      0.76      0.50     13428

    accuracy                           0.93    299640
   macro avg       0.68      0.85      0.73    299640
weighted avg       0.96      0.93      0.94    299640

-------------------------------------------------------
threshold = 0.6
[[277936   8276]
 [  3661   9767]]
              precision  

Notes:
- According to these scores, the threshold of 70% will perform the best recall and precision rates for the class 1 (churn)

### Cross Validation

In [44]:
precision_1, recall_1 = Average_Score_Class1(model, X_test, y_test, size=0.3, cv=5, threshold=0.8)

In [45]:
precision_1, recall_1

([0.7711429343420697,
  0.7783655117693732,
  0.7681992337164751,
  0.7749326145552561,
  0.7583910495471498],
 [0.698824681684623,
  0.7150145772594753,
  0.7042147516307075,
  0.7069092697319892,
  0.708385170440408])

## XGBClassifier

In [34]:
import xgboost as xgb

### Build Model

In [35]:
model_xgb = xgb.XGBClassifier(n_estimators=200, max_depth=50)
model_xgb = model_xgb.fit(X_train_rus, y_train_rus)

In [36]:
# Save the model to disk
filename = 'CASAChurn_XGBmodel_2lead.sav'
pickle.dump(model_xgb, open(filename, 'wb'))

In [37]:
# load the model from disk
model_xgb = pickle.load(open(filename, 'rb'))

### Evaluation

In [38]:
# Scores for each class and each threshold
model = model_xgb
thresholds = [0.3,0.5,0.6,0.7,0.8,0.9]
classes = [0,1]
df_evaluation = Evaluation_Each_Class(model, X_test, y_test, thresholds, classes) # Evaluation_Each_Class is an user-defined function
df_evaluation

Unnamed: 0,threshold,class,precision,recall,f1_score
0,0.3,0,0.99,0.87,0.92
1,0.3,1,0.22,0.8,0.35
2,0.5,0,0.99,0.92,0.95
3,0.5,1,0.3,0.77,0.44
4,0.6,0,0.99,0.93,0.96
5,0.6,1,0.35,0.76,0.48
6,0.7,0,0.99,0.95,0.97
7,0.7,1,0.41,0.75,0.53
8,0.8,0,0.99,0.96,0.98
9,0.8,1,0.49,0.73,0.59


In [39]:
# Confusion matrix and Classification report for each threshold
model = model_xgb
df_predict = pd.DataFrame(model.predict_proba(X_test)) # predict probabilities
df_predict.columns = ['proba_0','proba_1']
# Find which threshold will perform the best recall and precision for class 1 (churn)
for i in [0.3,0.5,0.6,0.7,0.8,0.9]:
    df_predict['predict'] = 0
    df_predict.loc[(df_predict['proba_1']>=i),['predict']] = 1
    print('-'*55)
    print('threshold = ' + str(i))
    print(confusion_matrix(y_test, df_predict['predict']))
    print(classification_report(y_test, df_predict['predict']))

-------------------------------------------------------
threshold = 0.3
[[248360  37852]
 [  2727  10701]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.92    286212
           1       0.22      0.80      0.35     13428

    accuracy                           0.86    299640
   macro avg       0.60      0.83      0.63    299640
weighted avg       0.95      0.86      0.90    299640

-------------------------------------------------------
threshold = 0.5
[[262506  23706]
 [  3083  10345]]
              precision    recall  f1-score   support

           0       0.99      0.92      0.95    286212
           1       0.30      0.77      0.44     13428

    accuracy                           0.91    299640
   macro avg       0.65      0.84      0.69    299640
weighted avg       0.96      0.91      0.93    299640

-------------------------------------------------------
threshold = 0.6
[[267511  18701]
 [  3239  10189]]
              precision  

# Important Features

In [40]:
model = model_lgbm
# Extract feature importances
feature_importances = model.feature_importances_

# # Sort feature importances in descending order
# sorted_indices = np.argsort(feature_importances)[::-1]

# # Print top N features and their importances
# top_n = 100  # Change this value based on your preference
# print("Top {} features:".format(top_n))
# for i in range(top_n):
#     feature_index = sorted_indices[i]
#     print("Feature {}: {} - Importance: {:.4f}".format(i+1, X_train_rus.columns[feature_index], feature_importances[feature_index]))

In [41]:
df_feature_importances = pd.DataFrame({'Feature name':X_train_rus.columns,
                                       'Importance':feature_importances})
df_feature_importances = df_feature_importances.sort_values(by='Importance', ascending=False)
df_feature_importances

Unnamed: 0,Feature name,Importance
131,MAX_ACCOUNT_ENDING_BALANCE_AMT1M_6M,119
274,RATIO_SUM_SUM_INCREASE_RATIO_ACCOUNT_ENDING_BA...,92
124,RATIO_SUM_SUM_ACCOUNT_ENDING_BALANCE_AMT1M_1M_...,89
132,MIN_ACCOUNT_ENDING_BALANCE_AMT1M_6M,78
103,RATIO_SUM_MIN_ACCOUNT_ENDING_BALANCE_AMT1M_1M_...,75
...,...,...
88,RATIO_SUM_MIN_ALL_ACCOUNT_TRANS_OUT_AMT1M_1M_1...,0
90,RATIO_SUM_MAX_ALL_ACCOUNT_TRANS_IN_NO1M_1M_1M_3M,0
93,RATIO_SUM_MAX_ALL_ACCOUNT_TRANS_OUT_NO1M_1M_1M_3M,0
112,MAX_ALL_ACCOUNT_TRANS_OUT_NO1M_6M,0


In [42]:
df_feature_importances.to_csv('feature_importances.csv')

In [43]:
# Importance > 0
df_feature_importances[df_feature_importances['Importance'] > 0]

Unnamed: 0,Feature name,Importance
131,MAX_ACCOUNT_ENDING_BALANCE_AMT1M_6M,119
274,RATIO_SUM_SUM_INCREASE_RATIO_ACCOUNT_ENDING_BA...,92
124,RATIO_SUM_SUM_ACCOUNT_ENDING_BALANCE_AMT1M_1M_...,89
132,MIN_ACCOUNT_ENDING_BALANCE_AMT1M_6M,78
103,RATIO_SUM_MIN_ACCOUNT_ENDING_BALANCE_AMT1M_1M_...,75
...,...,...
283,MIN_INCREASE_RATIO_ALL_ACCOUNT_TRANS_IN_AMT1M_3M,1
183,MAX_ALL_ACCOUNT_TRANS_IN_NO1M_6M,1
22,RATIO_SUM_MIN_ACCOUNT_TRANS_OUT_NO1M_1M_1M_3M,1
23,RATIO_SUM_AVG_ACCOUNT_TRANS_CASH_IN_AMT1M_1M_1...,1
