In [None]:
# Importing all the necessary libraries (Hidden Input)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, roc_auc_score,ConfusionMatrixDisplay, confusion_matrix, roc_curve
import scikitplot as skplt
import warnings
# Silence every warning category for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Apply the 'fivethirtyeight' matplotlib style sheet to all subsequent figures.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Loading the BankChurners dataset from the Kaggle input directory (Hidden Input)

csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
df = pd.read_csv(csv_path)


In [None]:
# Keep every column except the first one (client number) and the last two

kept_columns = slice(1, -2)
df = df.iloc[:, kept_columns]

In [None]:
# Percentage of null values per feature, rounded to two decimals

null_counts = df.isnull().sum()
n_rows = df.shape[0]
round((null_counts * 100) / n_rows, 2)

In [None]:
# Descriptive statistics of the numerical columns, one row per feature (Hidden Input)

# Start from the transposed summary, draw bars on the mean column, then apply
# the same colour gradient to each of the spread / upper-range columns in turn.
summary_styler = df.describe().T.style
summary_styler = summary_styler.bar(subset=['mean'], color='lightsalmon')
for stat in ('std', '75%', 'max'):
    summary_styler = summary_styler.background_gradient(subset=[stat], cmap='plasma')
summary_styler

In [None]:
# Encoding the target: existing customers become 0, attrited customers become 1 (Hidden Input/Output)

attrition_codes = {"Existing Customer": 0, "Attrited Customer": 1}
df["Attrition_Flag"] = df["Attrition_Flag"].map(attrition_codes)

In [None]:
# Collecting the names of the numeric and the object-dtype (categorical) columns (Hidden Input)

num_cols = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = df.select_dtypes(include="object").columns.tolist()

In [None]:
# # Creating the dummy variables for all the categorical features

# for col in cat_cols:
#     dummy_cols = pd.get_dummies(df[col], drop_first=True, prefix=col)
#     df = pd.concat([df,dummy_cols],axis=1)
#     df.drop(columns=col, inplace=True)

In [None]:
# Preview the first rows of the frame after the column drop and target encoding.
df.head()

In [None]:
# Separating the target, one-hot encoding the categorical features and
# holding out 20% of the rows as the test set

one_hot_features = ["Education_Level", "Marital_Status", "Income_Category", "Card_Category", "Gender"]

y = df["Attrition_Flag"]
X = pd.get_dummies(df.drop(columns="Attrition_Flag"), columns=one_hot_features)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Column names, dtypes and non-null counts of the one-hot encoded feature frame.
X.info()

In [None]:
# # Splitting the data into train and test

# y = df.pop("Attrition_Flag")
# X = df
# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=69)

In [None]:
# Preview the training features produced by train_test_split.
X_train.head()

In [None]:
# Preview the training labels (0 = existing customer, 1 = attrited customer).
y_train.head()

In [None]:
# Features that are scaled and fed to the model, in the column order the scaler and model will see
req_cols = [
    # numeric customer / account attributes
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
    'Contacts_Count_12_mon',
    'Credit_Limit',
    'Total_Revolving_Bal',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
    # one-hot: card category
    'Card_Category_Blue',
    'Card_Category_Gold',
    'Card_Category_Platinum',
    'Card_Category_Silver',
    # one-hot: income category
    'Income_Category_$120K +',
    'Income_Category_$40K - $60K',
    'Income_Category_$60K - $80K',
    'Income_Category_$80K - $120K',
    'Income_Category_Less than $40K',
    # one-hot: gender
    'Gender_F',
    'Gender_M',
    # one-hot: marital status
    'Marital_Status_Divorced',
    'Marital_Status_Married',
    'Marital_Status_Single',
    # one-hot: education level
    'Education_Level_College',
    'Education_Level_Doctorate',
    'Education_Level_Graduate',
    'Education_Level_High School',
    'Education_Level_Post-Graduate',
    'Education_Level_Uneducated',
]



In [None]:
# Check that every column in req_cols exists in X_train (a missing name raises KeyError) and inspect dtypes.
X_train[req_cols].info()


In [None]:
# Learn the min/max of each selected feature from the training split only,
# then apply that same scaling to both splits.
# Note: X_train and X_test become NumPy arrays restricted to req_cols after this cell.
scaler = MinMaxScaler()
scaler.fit(X_train[req_cols])

X_train = scaler.transform(X_train[req_cols])
X_test = scaler.transform(X_test[req_cols])

In [None]:
# Display the scaled training array returned by the scaler.
X_train

In [None]:
# scaler = MinMaxScaler()

# X_train = scaler.fit_transform(X_train)

# X_test = scaler.transform(X_test)

In [None]:
import joblib

# Persist the fitted scaler to a file in the current working directory
scaler_path = 'scaler_2.pkl'
joblib.dump(scaler, scaler_path)

In [None]:
# Oversampling the training split with SMOTE to balance the target classes
# (sampling_strategy=1.0 requests a 1:1 minority-to-majority ratio)

sm = SMOTE(sampling_strategy=1.0, random_state=42)

resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
# Fitting an XGBoost classifier on the SMOTE-resampled training data

xgb_model = xgb.XGBClassifier(
    n_jobs=-1,
    use_label_encoder=False,
    random_state=69,
)

xgb_model.fit(X_train, y_train)

In [None]:
# XGBoost Score on the test data
# (calls the classifier's score() on the held-out, non-resampled test split)

xgb_model.score(X_test, y_test)

In [None]:
# Predicted class labels for every row of the scaled test set.
y_pred=xgb_model.predict(X_test)
# Spot-check a single prediction; index 20 is an arbitrary position in the test array.
print(y_pred[20])

In [None]:
# Per-class precision / recall / F1 on the test set (Hidden Input)

test_predictions = xgb_model.predict(X_test)
report_text = classification_report(y_test, test_predictions)
print(report_text)

In [None]:
# Creating the Confusion Matrix (Hidden Input)

# scikit-learn lays the matrix out as cfm[true label][predicted label] with the
# labels in ascending order. The churn class (Attrition_Flag == 1) is the
# positive class, so the flattened 2x2 matrix reads TN, FP, FN, TP.
# labels=[0, 1] pins the matrix to 2x2 even if a class is absent from y_test
# or from the predictions.
cfm = confusion_matrix(y_true=y_test, y_pred=xgb_model.predict(X_test), labels=[0, 1])

TN, FP, FN, TP = cfm.ravel()


#fig, ax = plt.subplots(figsize=(16, 8))
#plt.title("Confusion Matrix and Corresponding Accuracy, Precision and Recall", size=20, pad=20)
disp = ConfusionMatrixDisplay(confusion_matrix=cfm)
# plt.grid(False)
disp.plot()
# plt.show()

# Precision and recall below are for the positive (attrited) class.
print("*" * 30)
print("Accuracy :", (TP+TN)/(TP+TN+FP+FN))
print("Precision :", (TP)/(TP+FP))
print("Recall :", (TP)/(TP+FN))
print("*" * 30)

In [None]:
# Plotting the ROC curves on the held-out test set (Hidden Input)

y_true = y_test  # ground truth labels
y_probas = xgb_model.predict_proba(X_test)  # predicted probabilities from the fitted XGBoost model

roc_plot_options = dict(figsize=(16, 8), title_fontsize=25, text_fontsize=16, cmap='plasma')
skplt.metrics.plot_roc(y_true, y_probas, **roc_plot_options)
plt.grid(False)
plt.show()

In [None]:
import pickle

In [None]:
# Persist the fitted model as a pickle file in the current working directory

Pkl_Filename = "Pickle_RL_Model_1.pkl"

with open(Pkl_Filename, mode='wb') as model_file:
    pickle.dump(xgb_model, model_file)

In [None]:
# Print one scaled test row (position 2); its values follow the column order of req_cols.
print(X_test[2])

In [None]:
# Print the test labels; y_test is still a pandas Series, so the original row index is shown.
print(y_test)