## **Bank churn prediction using ANN** 

Aims:

In [None]:
# Importing project dependencies 

# Data handling
import pandas as pd
import numpy as np

# Data warehousing 
from sqlalchemy import create_engine

# Exploratory Data Analysis & Visualisation 
%matplotlib inline
import pandas_profiling
import seaborn as sns
import matplotlib.pyplot as plt

# Feature Engineering and Modelling 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import keras
from keras.models import Sequential
from keras.layers import Dense

# Model improvement and Evaluation 
from sklearn.metrics import confusion_matrix,accuracy_score

# Model export for deployment 
import pickle

# Suppressing warning messages
from warnings import filterwarnings
filterwarnings("ignore")  

#### **----------------------------------  1. Data collection  -----------------------------------**

In [None]:
# Downloading the dataset with the Kaggle CLI into ..\Data and unzipping it there
# NOTE(review): the slug below is "mrmorj/big-mart-sales", while the following
# cell reads 'BankCustomers.csv' from the working directory — confirm that the
# dataset slug and the download path are the intended ones.
!kaggle datasets download -d mrmorj/big-mart-sales -p ..\Data --unzip 

In [None]:
# Loading the bank customers CSV into a dataframe
data = pd.read_csv(filepath_or_buffer='BankCustomers.csv')

In [None]:
# Previewing the first 5 rows of the dataframe
data.head(n=5)

In [None]:
# Viewing the data type pandas inferred for each column
data.dtypes

In [None]:
# Viewing the dataset shape as (number of rows, number of columns)
data.shape

In [None]:
# 1st check for null values and data types: info() lists the non-null count and dtype of every column
data.info()

In [None]:
# 2nd check: number of missing values per column
data.isna().sum()

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:
# Viewing summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()

In [None]:
# (SQL standard) Normalising column headers: lower-case them, then swap spaces for underscores
data.columns = data.columns.str.lower().str.replace(' ', '_')

#### **----------------------------------  3. Data Warehousing  -----------------------------------**

In [None]:
# Function to warehouse data in a PostgreSQL database
def store_data(data, tablename, db_url=None):
    """
    Save a dataframe as a CSV file and load it into a SQL table.

    :param data: pandas DataFrame, the dataset you'd like to warehouse
    :param tablename: str, name of the target table (also used as the CSV file prefix)
    :param db_url: str, optional SQLAlchemy connection URL; when omitted the
                   project's local PostgreSQL "projectsdb" database is used
    :return: str, confirmation message stating how many rows were loaded
    """

    # NOTE(review): the fallback URL embeds credentials in source control;
    # prefer passing db_url from configuration kept outside the repository.
    if db_url is None:
        db_url = 'postgresql+psycopg2://postgres:password@localhost:5432/projectsdb'

    # Saving cleaned data as csv
    data.to_csv(f'../Data/{tablename}_clean.csv', index=False)

    # Engine to access the database
    engine = create_engine(db_url)
    try:
        # Loads dataframe into the database and replaces the table if it exists
        data.to_sql(str(tablename), engine, if_exists='replace', index=False)
    finally:
        # Release pooled connections even when the load fails
        engine.dispose()

    # Confirmation of ETL; len(data) counts rows and also works for a frame without columns
    return f"ETL successful, {len(data)} rows loaded into table: {tablename}."
 

In [None]:
# Warehousing the cleaned dataframe (CSV copy + database table) via store_data
store_data(data=data, tablename="p6_churnprediction")

#### **----------------------------------  4. Exploratory data analysis  -----------------------------------**

In [None]:
# Printing the value distribution of each categorical field, one after another
for field in ('geography', 'gender', 'exited'):
    print(data[field].value_counts())

In [None]:
# Viewing summary statistics for the numeric columns again, now with the normalised column names
data.describe()

In [None]:
# Custom two-colour palette: green first, red second
colors = ["#32CD32", "#FF0000"]
custom_palette = sns.color_palette(colors)
sns.set_palette(custom_palette)

In [None]:
# Percentage of customers with exited == 0 (retained) and exited == 1 (lost)
totretained = data.loc[data['exited'] == 0, 'exited'].count() / len(data) * 100
totlost = data.loc[data['exited'] == 1, 'exited'].count() / len(data) * 100

In [None]:
# Showing distribution of target values
# A single figure/axes pair is created and used explicitly, so no stray empty
# figure is produced and nothing has to be closed by a hard-coded figure number.
fig, ax = plt.subplots(figsize=(8, 6), facecolor='white')
sns.countplot(x='exited', data=data, ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Retained', 'Lost'])
ax.set_xlabel('Condition', size=15, labelpad=12, color='grey')
ax.set_ylabel('Amount of customers', size=15, labelpad=12, color='grey')
ax.set_title("Proportion of customers lost and retained", size=15, pad=20)
ax.set_ylim(0, 9000)
# Percentage labels drawn inside the bars (positions are in data coordinates)
ax.text(-0.12, 7000, f"{round(totretained, 2)}%", fontsize=12, weight='bold')
ax.text(0.90, 1000, f"{round(totlost, 2)}%", fontsize=12, weight='bold')
sns.despine(ax=ax)
fig.savefig('../images/Churn_barchart_distrib.png')
plt.show()

In [None]:
# Putting all the above results in a single visualization

cat_data = ['geography', 'gender', 'tenure', 'numofproducts', 'hascrcard', 'isactivemember']

# 2 x 3 grid with one count plot per categorical variable, split by churn outcome
fig, axes = plt.subplots(2, 3, figsize=(16, 12), facecolor='white')
for ax, column in zip(axes.flat, cat_data):
    # Keyword arguments are required: recent seaborn treats the first positional
    # argument as `data`, so passing the series positionally no longer works
    sns.countplot(x=column, hue='exited', data=data, ax=ax)
    ax.set_xlabel(column)

fig.savefig('../images/independentfeatures_distrib.png')
plt.show()

In [None]:
# Box plots of each continuous attribute against the churn outcome
fig, axarr = plt.subplots(3, 2, facecolor='white', figsize=(16, 12))
continuous_features = ['creditscore', 'age', 'tenure', 'balance', 'numofproducts', 'estimatedsalary']
# axarr.flat walks the grid row by row, matching the order of the feature list
for axis, feature in zip(axarr.flat, continuous_features):
    sns.boxplot(y=feature, x='exited', hue='exited', data=data, ax=axis)
plt.savefig('../images/boxplots.png')

In [None]:
# correlation of each variable
# data.corr()

In [None]:
# Viewing correlations between the features in the dataset
# Text columns (e.g. surname, geography, gender) are still present at this stage;
# correlating only numeric/boolean columns avoids the error pandas 2.x raises
# when it meets non-numeric data.
numeric_data = data.select_dtypes(include=['number', 'bool'])
datacorr = round(numeric_data.corr(), 2)
corrmat = datacorr
top_corr_features = corrmat.index
plt.figure(figsize=(16, 12), facecolor='white')

# Plotting heat map
plot = sns.heatmap(datacorr, annot=True, cmap="RdYlGn")
plt.savefig('../images/churn_correlation.png')

In [None]:
# Profile report of each variable
# pandas_profiling.ProfileReport(data)

#### **----------------------------------  5. Feature engineering  -----------------------------------**

In [None]:
# Removing identifier-style fields that won't benefit the model
data.drop(columns=['rownumber', 'customerid', 'surname'], inplace=True)

In [None]:
# One-hot encoding the remaining categorical fields (every category keeps its own column)
data = pd.get_dummies(data=data, drop_first=False)
data.head(n=5)

In [None]:
# Dependent feature | Target variable
y = data['exited']

# Independent features: every column except the target
X = data.drop(columns=["exited"])

In [None]:
# 80/20 train/test split, stratified on y so both splits keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    shuffle=True,
    random_state=23,
)

# Viewing shape of train / test data
for split in (X_train, X_test):
    print(split.shape)

In [None]:
# Feature scaling: the scaler is fitted on the training data only and then applied to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

#### **----------------------------------  6. Modelling  -----------------------------------**

In [None]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
# The input width is taken from the training matrix instead of being hard-coded,
# so it always matches the number of columns produced by the one-hot encoding
classifier.add(Dense(activation="relu", input_dim=X_train.shape[1], units=6, kernel_initializer="uniform"))

# Adding the second hidden layer
classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))

# Adding the output layer (sigmoid gives a churn probability between 0 and 1)
classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
# `epochs` is the Keras 2+ name of the argument formerly called `nb_epoch`
classifier.fit(X_train, y_train, batch_size=10, epochs=100)


#### **----------------------------------  8. Evaluation -----------------------------------**

In [None]:
# Predicting the Test set results: probabilities above 0.5 count as churn
y_pred = classifier.predict(X_test) > 0.5

# Scoring the predictions: accuracy and confusion matrix
from sklearn.metrics import confusion_matrix,accuracy_score
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Hyperparameter tuning with 10-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV
# NOTE(review): this wrapper ships with Keras 2.x; on newer Keras releases the
# separately maintained `scikeras` package provides the equivalent class.
from keras.wrappers.scikit_learn import KerasClassifier


def build_classifier(optimizer='adam'):
    """
    Build and compile a fresh, unfitted copy of the churn ANN.

    :param optimizer: str, name of the Keras optimizer to compile the network with
    :return: compiled keras Sequential model
    """
    model = Sequential()
    model.add(Dense(activation="relu", input_dim=X_train.shape[1], units=6, kernel_initializer="uniform"))
    model.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
    model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


parameters = {'batch_size': [25, 32],
              'epochs': [100, 500],
              'optimizer': ['adam', 'rmsprop']}
# GridSearchCV needs a scikit-learn style estimator it can clone and rebuild for
# every parameter combination; an already-fitted Sequential model cannot be used
grid_search = GridSearchCV(estimator=KerasClassifier(build_fn=build_classifier),
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

---------------------------------------------------------------------------------------------------------

In [None]:
# Importing the dataset
dataset = pd.read_csv('BankCustomers.csv')
# X stays a DataFrame: the column-name lookups below do not work on the NumPy
# array that `.values` would return
X = dataset.iloc[:, 3:13]
y = dataset.iloc[:, 13].values

# Convert categorical features into dummy variables (first level dropped)
states = pd.get_dummies(X['Geography'], drop_first=True)
gender = pd.get_dummies(X['Gender'], drop_first=True)

# Concatenate the dummy columns onto the feature frame
X = pd.concat([X, states, gender], axis=1)

# Drop the original text columns as they are no longer required
X = X.drop(['Geography', 'Gender'], axis=1)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Feature Scaling (fitted on the training split only)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Part 2 - Now let's make the ANN!

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
# The input width follows the number of columns in the scaled training matrix
classifier.add(Dense(activation="relu", input_dim=X_train.shape[1], units=6, kernel_initializer="uniform"))

# Adding the second hidden layer
classifier.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))

# Adding the output layer
classifier.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))

# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
# `epochs` is the Keras 2+ name of the argument formerly called `nb_epoch`
classifier.fit(X_train, y_train, batch_size=10, epochs=100)

# Part 3 - Making the predictions and evaluating the model

# Predicting the Test set results (probabilities above 0.5 count as churn)
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [None]:
# Hyperparameter tuning with 10-fold cross-validated grid search
from sklearn.model_selection import GridSearchCV
# NOTE(review): this wrapper ships with Keras 2.x; on newer Keras releases the
# separately maintained `scikeras` package provides the equivalent class.
from keras.wrappers.scikit_learn import KerasClassifier


def build_classifier(optimizer='adam'):
    """
    Build and compile a fresh, unfitted copy of the churn ANN.

    :param optimizer: str, name of the Keras optimizer to compile the network with
    :return: compiled keras Sequential model
    """
    model = Sequential()
    model.add(Dense(activation="relu", input_dim=X_train.shape[1], units=6, kernel_initializer="uniform"))
    model.add(Dense(activation="relu", units=6, kernel_initializer="uniform"))
    model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


parameters = {'batch_size': [25, 32],
              'epochs': [100, 500],
              'optimizer': ['adam', 'rmsprop']}
# GridSearchCV needs a scikit-learn style estimator it can clone and rebuild for
# every parameter combination; an already-fitted Sequential model cannot be used
grid_search = GridSearchCV(estimator=KerasClassifier(build_fn=build_classifier),
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10)
grid_search = grid_search.fit(X_train, y_train)
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_