<a href="https://www.kaggle.com/code/muradpitafi/bank-s-customer-churn-lgbm-with-96-auc?scriptVersionId=154379510" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import numpy as np  
import pandas as pd  
 
import os
# Print the full path of every file found under the Kaggle input mount.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import warnings
# Silence all library warnings for the rest of the notebook.
warnings.filterwarnings('ignore')



<h1> Understanding the Data </h1>

In [None]:
# Load the churn CSV from the Kaggle input mount into the raw frame `banks`.
banks = pd.read_csv("/kaggle/input/bank-customer-churn-prediction/Churn_Modelling.csv")
# Preview the first rows.
banks.head()

In [None]:
# Summary of the raw frame's columns.
banks.info()

In [None]:
# Count missing values per column.
banks.isna().sum()

In [None]:
# Last five rows of the raw frame (five is the default for tail()).
banks.tail()

<h1> Data Distribution of categorical and numerical variables </h1>

In [None]:
!pip3 install seaborn
import seaborn as sns

# Split the columns by cardinality: fewer than 12 distinct values goes to
# `cat`, everything else to `num`.
distinct_counts = {column: banks[column].nunique() for column in banks.columns}
cat = [column for column, count in distinct_counts.items() if count < 12]
num = [column for column, count in distinct_counts.items() if count >= 12]

In [None]:
import matplotlib.pyplot as plt
# One count plot per column listed in `cat`.
for column in cat:
    fig, ax = plt.subplots()
    sns.countplot(data=banks, x=column, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
# One 20-bin histogram with a KDE overlay per column listed in `num`.
for column in num:
    fig, ax = plt.subplots()
    sns.histplot(data=banks, x=column, bins=20, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

In [None]:
 # Working copy of the raw frame without the row/customer identifier columns,
 # the surname and CreditScore; `banks` itself is left untouched.
 data = banks.drop(columns=['CreditScore', 'CustomerId', 'RowNumber', 'Surname'])

<h1> Transforming categorical variables to numerical </h1>

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a 1-D array of labels, so pass the column as a Series.
# A one-column DataFrame (double brackets) is 2-D and only works through an
# implicit flattening that emits a DataConversionWarning.
label_encoder = LabelEncoder()
data['Geography'] = label_encoder.fit_transform(data['Geography'])
data.head()

In [None]:
# Pass the column as a 1-D Series (not a one-column DataFrame), which is the
# input shape LabelEncoder expects. Refitting `label_encoder` here replaces
# whatever classes it had learned before.
data['Gender'] = label_encoder.fit_transform(data['Gender'])
data.head()

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the feature columns only. 'Exited' is the class label and must
# stay 0/1, so it is excluded. Fitting once on all feature columns (instead of
# refitting the same scaler per column) also leaves `scaler` holding the mean
# and scale of every feature, so it can be reused to transform new rows.
scaler = StandardScaler()
feature_columns = data.columns.drop('Exited')
data[feature_columns] = scaler.fit_transform(data[feature_columns])

In [None]:
# Target is the 'Exited' column; the feature matrix is every other column.
y = data['Exited']
x = data.drop(columns='Exited')

In [None]:
# Read the labels from the raw `banks` frame so they are exact 0/1 integers
# no matter what preprocessing was applied to `data`. Casting a transformed
# column with astype(int) truncates toward zero and only yields 0/1 for a
# particular class ratio.
y = banks['Exited'].astype(int)

<h1> Feature selection </h1>

In [None]:
# Feature-to-feature correlations (kept for inspection).
correlation_matrix = x.corr()

# Rank features by the absolute correlation with the target. Taking column 0
# of the feature matrix's own correlations would rank by similarity to the
# first feature, not by relevance to churn.
target_correlation = x.assign(Exited=y).corr()['Exited'].drop('Exited').abs()
selected_features = target_correlation.sort_values(ascending=False)[:6]

print(selected_features)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=40,
)

In [None]:
!pip3 install xgboost
!pip3 install lightgbm
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,precision_score, recall_score , f1_score , classification_report
import xgboost as xgb
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


# Model Evaluations

In [None]:
# Preview the preprocessed frame that the models are trained on.
data.head()

In [None]:
# Candidate classifiers, all with default hyperparameters.
models = {
    ' Logistic Regression ': LogisticRegression(),
    'Random Forest ': RandomForestClassifier(),
    'Support vector machine': SVC(),
    'Xgboost ': xgb.XGBClassifier(),
    'light gradient boost ': LGBMClassifier(verbose=0),
}

import numpy as np

# Shuffled 10-fold cross-validation over the full feature matrix; prints the
# per-fold scores of each model rounded to two decimals.
kf = KFold(n_splits=10, shuffle=True, random_state=43)
for label, estimator in models.items():
    scores = cross_val_score(estimator, x, y, cv=kf)
    print(f'{label} : {np.round(scores, 2)} ')
    print(" **** ")


In [None]:
# Same candidates, compared by their mean score over a shuffled 5-fold CV.
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'Xgboost': xgb.XGBClassifier(),
    'Light Gradient Boosting': LGBMClassifier(),
}

kf = KFold(n_splits=5, shuffle=True, random_state=43)

# One mean score per model, in the dict's insertion order.
mean_scores = [
    np.mean(cross_val_score(estimator, x, y, cv=kf))
    for estimator in models.values()
]

bar_colors = ['blue', 'green', 'orange', 'red', 'purple']
plt.figure(figsize=(10, 6))
plt.bar(models.keys(), mean_scores, color=bar_colors)
plt.title('Cross-validation Scores Comparison')
plt.xlabel('Models')
plt.ylabel('Average Cross-validation Score')
plt.show()


In [None]:
# Fit an SVM per kernel on the training split and print its train/test score.
kernals = ['linear', 'poly', 'rbf', 'sigmoid']
from sklearn.svm import SVC
for kernel_name in kernals:
    svm = SVC(kernel=kernel_name).fit(xtrain, ytrain)
    print(kernel_name)
    print("training score : ", svm.score(xtrain, ytrain))
    print("testing score ", svm.score(xtest, ytest))
    print(" ___ ")

In [None]:
# Full set of candidate classifiers used by the comparison cells below.
# Keys are the display names shown in tables and charts.
models = {
    ' Logistic Regression ': LogisticRegression(),
    'Random Forest ': RandomForestClassifier(),
    'Support vector machine': SVC(),
    'Xgboost ': xgb.XGBClassifier(),
    'Gradient Boost': GradientBoostingClassifier(),
    'light gradient boost ': LGBMClassifier(verbose=0),
    'KNeighbors Classifier ': KNeighborsClassifier(),
    'Decision tree': DecisionTreeClassifier(),
}

In [None]:
# Baseline LightGBM with default hyperparameters: fit on the training split
# and report per-class metrics on the held-out split.
lgbm = LGBMClassifier(verbose=0).fit(xtrain, ytrain)
ypred = lgbm.predict(xtest)
print(classification_report(y_true=ytest, y_pred=ypred))

In [None]:
# Confusion matrix followed by the headline metrics for the latest `ypred`.
print(confusion_matrix(ytest, ypred))
accuracy, precision, recall, f1 = (
    metric(ytest, ypred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print()
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


In [None]:
# Fit every candidate on the training split and collect its test-set metrics
# (rounded to two decimals) as one record per model.
results = []

for model_name, model in models.items():
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    results.append({
        'Model': model_name,
        'Accuracy': np.round(accuracy_score(ytest, ypred), 2),
        'F1': np.round(f1_score(ytest, ypred), 2),
        'Precision': np.round(precision_score(ytest, ypred), 2),
        'Recall': np.round(recall_score(ytest, ypred), 2),
    })

# One row per model, one column per metric.
model_results = pd.DataFrame(results)


In [None]:
# Show the metrics table (up to ten models).
model_results.head(10)

In [None]:
!pip3 install plotly
import plotly.express as px

# Reshape to long form (one row per model/metric pair) so the metrics can be
# drawn as grouped bars.
metrics_long = model_results.melt(id_vars='Model')

fig = px.bar(
    metrics_long,
    x='Model',
    y='value',
    color='variable',
    barmode='group',
    text='value',
    title='Model Performance Metrics Comparison',
    labels={'value': 'Score', 'variable': 'Metric'},
    height=600,
    width=1090,
)
# Put the numeric label above each bar.
fig.update_traces(textposition='outside')

fig.show()


In [None]:
# Refit each candidate and print its score on the training and test splits
# side by side.
for model_name, model in models.items():
    print(model_name)
    model.fit(xtrain, ytrain)
    train_accuracy = model.score(xtrain, ytrain)
    test_accuracy = model.score(xtest, ytest)
    print(f' Training accuracy : {train_accuracy}')
    print(f' Testing accuracy : {test_accuracy}')
    

In [None]:
pip install imbalanced-learn


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)

# Split BEFORE oversampling. Resampling the whole dataset first puts synthetic
# rows in the test set and lets points interpolated from test rows end up in
# training, which inflates every metric reported afterwards.
# `stratify=y` keeps the class ratio the same in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, train_size=0.70, random_state=43, stratify=y
)

# Balance the training portion only; the test set keeps the real class ratio.
X_res, Y_res = smote.fit_resample(xtrain, ytrain)
xtrain, ytrain = X_res, Y_res
    
    

In [None]:
# Class counts of the SMOTE-resampled target.
Y_res.value_counts()

In [None]:
print("after balancing data")

# Refit every candidate on the current training split and collect its
# test-set metrics (rounded to two decimals) as one record per model.
results = []

for model_name, model in models.items():
    model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)
    results.append({
        'Model': model_name,
        'Accuracy': np.round(accuracy_score(ytest, ypred), 2),
        'F1': np.round(f1_score(ytest, ypred), 2),
        'Precision': np.round(precision_score(ytest, ypred), 2),
        'Recall': np.round(recall_score(ytest, ypred), 2),
    })

# One row per model, one column per metric.
model_results = pd.DataFrame(results)


In [None]:
# Show the metrics table (up to ten models).
model_results.head(10)

In [None]:

# Reshape to long form (one row per model/metric pair) so the metrics can be
# drawn as grouped bars.
metrics_long = model_results.melt(id_vars='Model')

fig = px.bar(
    metrics_long,
    x='Model',
    y='value',
    color='variable',
    barmode='group',
    text='value',
    title='Model Performance Metrics Comparison',
    labels={'value': 'Score', 'variable': 'Metric'},
    height=600,
    width=1090,
)

# Put the numeric label above each bar.
fig.update_traces(textposition='outside')

fig.show()

In [None]:
from sklearn.model_selection import GridSearchCV
 
# Hyperparameter grid for LightGBM: 3**6 = 729 candidate combinations.
param = {
    'num_leaves': [30, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [5, 10, 15],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.8, 0.9, 1.0],
    # LightGBM only applies `subsample` when bagging is switched on through a
    # non-zero `subsample_freq`; without it the three subsample values train
    # identical models and just triple the search time.
    'subsample_freq': [1],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'verbose': [0]
}

# 729 candidates x 5 folds = 3645 fits, so spread them over all CPU cores.
best_hyp = GridSearchCV(
    estimator=lgbm,
    param_grid=param,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
best_hyp.fit(xtrain, ytrain)

print("best hyperparameters : ", best_hyp.best_params_)
print('Best score : ', best_hyp.best_score_)

# Accuracy of the refitted best estimator on the held-out split.
print(f'test score : {best_hyp.score(xtest,ytest)}')

In [None]:
# Build the tuned model from the grid search result instead of a hand-copied
# set of values, so it always matches what the search actually selected.
# `verbose=0` is a fallback in case the grid does not set it.
tuned_lgbm = LGBMClassifier(**{'verbose': 0, **best_hyp.best_params_})
tuned_lgbm.fit(xtrain, ytrain)
ypred = tuned_lgbm.predict(xtest)

# Headline metrics of the tuned model on the held-out split.
accuracy = accuracy_score(ytest, ypred)
precision = precision_score(ytest, ypred)
recall = recall_score(ytest, ypred)
f1 = f1_score(ytest, ypred)

print()
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


In [None]:
 import joblib

In [None]:
 # Persist the tuned classifier to a file named 'churn_predict_model'.
 # Only the estimator is saved, not the encoders or scaler used in preprocessing.
 joblib.dump(tuned_lgbm, 'churn_predict_model')

In [None]:
 # Reload the classifier from the file written by the joblib.dump cell.
 model = joblib.load('churn_predict_model')

In [None]:
# Column order of the preprocessed frame; the feature matrix `x` is these
# columns without 'Exited'.
data.columns

In [None]:
# The model was trained on label-encoded, standardized features, so a raw
# sample must be standardized with the training statistics before predicting.
# Rebuild those statistics from the raw frame: encode the two text columns the
# same way as in preprocessing, then take the per-column mean and population
# standard deviation (ddof=0, which is what StandardScaler uses).
raw_reference = banks[list(x.columns)].copy()
for column in ('Geography', 'Gender'):
    raw_reference[column] = LabelEncoder().fit_transform(raw_reference[column])
feature_means = raw_reference.mean()
feature_scales = raw_reference.std(ddof=0).replace(0, 1)  # guard constant columns

# Raw sample, in the same column order as the training features.
sample = pd.DataFrame([[1, 0, 34, 2, 343, 0, 0, 1, 23000.34]], columns=x.columns)
model.predict((sample - feature_means) / feature_scales)

In [None]:

from sklearn.preprocessing import StandardScaler

import joblib

In [None]:
import tkinter as tk
from tkinter import ttk
from sklearn.preprocessing import LabelEncoder
 
# Fit the encoders on the training columns so a typed-in label maps to the
# same integer code the model saw during training. Fitting on the single value
# entered in the form would always produce code 0.
geography_encoder = LabelEncoder().fit(banks['Geography'])
gender_encoder = LabelEncoder().fit(banks['Gender'])

# Per-feature mean and population standard deviation of the encoded raw data,
# used to standardize form input the same way the training data was.
_reference = banks[list(x.columns)].copy()
_reference['Geography'] = geography_encoder.transform(_reference['Geography'])
_reference['Gender'] = gender_encoder.transform(_reference['Gender'])
_feature_means = _reference.mean()
_feature_scales = _reference.std(ddof=0).replace(0, 1)  # guard constant columns


def get_prediction():
    """Read the form fields, predict churn for that customer and display it.

    Reads the text of every entry in `entry_vars`, encodes Geography and
    Gender with the fitted encoders, standardizes the row with the training
    statistics and writes the model's verdict into `result_label`.

    Invalid input (an empty or non-numeric number field, or a Geography or
    Gender value not present in the training data) is reported in
    `result_label` instead of raising inside the Tk callback.
    """
    try:
        geography = entry_vars['Geography'].get().strip()
        gender = entry_vars['Gender'].get().strip()
        features = {
            # transform() raises ValueError for labels it was not fitted on.
            'Geography': geography_encoder.transform([geography])[0],
            'Gender': gender_encoder.transform([gender])[0],
            'Age': int(entry_vars['Age'].get()),
            'Tenure': int(entry_vars['Tenure'].get()),
            'Balance': float(entry_vars['Balance'].get()),
            'NumOfProducts': int(entry_vars['NumOfProducts'].get()),
            'HasCrCard': int(entry_vars['HasCrCard'].get()),
            'IsActiveMember': int(entry_vars['IsActiveMember'].get()),
            'EstimatedSalary': float(entry_vars['EstimatedSalary'].get()),
        }
    except ValueError as error:
        result_label.config(text=f"Invalid input: {error}")
        return

    # Order the columns exactly like the training features, then standardize.
    row = pd.DataFrame([features])[list(x.columns)]
    scaled_row = (row - _feature_means) / _feature_scales
    prediction = model.predict(scaled_row)[0]

    result_label.config(text=f"Prediction: {'Exited' if prediction == 1 else 'Not Exited'}")

# Build the prediction form: one labelled text entry per model feature,
# a Predict button and a label that displays the result.
window = tk.Tk()
window.title("Customer Exit Prediction")

labels = ['Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
entry_vars = {}

for row_index, field_name in enumerate(labels):
    caption = ttk.Label(window, text=f"{field_name}:")
    caption.grid(row=row_index, column=0, padx=10, pady=5)

    # Keep the StringVar so the callback can read what the user typed.
    field_var = tk.StringVar()
    field_entry = ttk.Entry(window, textvariable=field_var)
    field_entry.grid(row=row_index, column=1, padx=10, pady=5)
    entry_vars[field_name] = field_var

# The button and the result label go below the last input row.
button_row = len(labels)
predict_button = ttk.Button(window, text="Predict", command=get_prediction)
predict_button.grid(row=button_row, column=0, columnspan=2, pady=10)

result_label = ttk.Label(window, text="")
result_label.grid(row=button_row + 1, column=0, columnspan=2, pady=5)

# Blocks until the window is closed.
window.mainloop()
