# Customer Churn Analytics

## 1. Installing and Importing the necessary libraries

In [None]:
!python -m pip install --user --upgrade pip

!pip3 install pandas==0.23.4 matplotlib==3.0.3 seaborn==0.9.0 scikit-learn==0.22

In [None]:
#importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import classification_report, confusion_matrix

## 2. Load your data

In [None]:
# Read the churn dataset from its hosted CSV file into a DataFrame
DATA_URL = "https://raw.githubusercontent.com/MavenCode/KubeflowTraining/master/Data/Telco/Churn_Modelling.csv"
data = pd.read_csv(DATA_URL)

## 3. Exploratory Data Analysis 

In [None]:
# Preview the first five rows of the dataset
data.head(n=5)

In [None]:
# Dimensions of the dataset as (number of rows, number of columns)
data.shape

In [None]:
# Bar chart of how many rows fall into each class of the target column
sns.countplot(data=data, x='Exited')

From the plot above, the classes are imbalanced: we have more records for customers who stayed at the bank than for customers who left.

In [None]:
# Inspect the dtype pandas assigned to every column
data.dtypes

In [None]:
# Number of missing entries in each column
data.isna().sum()

## 4. Data Preprocessing

In [None]:
# Discard the columns that are not needed for the analysis
unused_columns = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=unused_columns)

In [None]:
# Distinct values present in the Geography column
data.loc[:, 'Geography'].unique()

In [None]:
# Feature matrix: every column except the last one.
# .copy() gives X its own data, so the column assignments made on X in later cells
# do not operate on a slice of `data` (which triggers SettingWithCopyWarning).
X = data.iloc[:, :-1].copy()

In [None]:
# Target: the last column, kept as a single-column DataFrame
y = data.iloc[:, -1:]

In [None]:
# Encode the categorical columns: Gender as integer labels, Geography as one-hot columns
le = LabelEncoder()
ohe = OneHotEncoder()

# assign() returns a new frame instead of writing into X, which may still be a
# slice of `data` at this point (avoids SettingWithCopyWarning)
X = X.assign(Gender=le.fit_transform(X['Gender']))

geo_encoded = ohe.fit_transform(X[['Geography']]).toarray()

# Newer scikit-learn releases expose get_feature_names_out in place of
# get_feature_names; use whichever this installation provides
if hasattr(ohe, 'get_feature_names_out'):
    geo_columns = ohe.get_feature_names_out(['Geography'])
else:
    geo_columns = ohe.get_feature_names(['Geography'])

# Reuse X's index so the join below aligns row-for-row even when X does not
# carry a default 0..n-1 index
geo_df = pd.DataFrame(geo_encoded, index=X.index, columns=geo_columns)

# Append the one-hot columns to the feature matrix
X = X.join(geo_df)

In [None]:
# Remove the original Geography column after encoding.
# X is rebound instead of mutated with inplace=True, and the redundant `axis`
# argument is omitted because `columns=` already selects the axis.
X = X.drop(columns=['Geography'])

In [None]:
# Heatmap of the pairwise correlations between the numeric columns of `data`.
# Non-numeric columns are filtered out explicitly: older pandas drops them silently
# inside corr(), while pandas >= 2.0 raises an error on them.
plt.figure(figsize=(12,10))
cor = data.select_dtypes(include=[np.number]).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Blues)
plt.title('Correlation between numeric features')
plt.show()

In [None]:
# Plot each listed column against EstimatedSalary, one panel per column
salary_plot_features = [
    'CreditScore', 'Gender', 'Age', 'Tenure',
    'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember',
]
sns.pairplot(data=data, x_vars=salary_plot_features, y_vars=['EstimatedSalary'])

#### Split data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the dimensions of every split
data_dict = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split in data_dict.items():
    print("The shape of {} is {}".format(split_name, split.shape))

## 5. Feature Scaling

In [None]:
# Standardise the features: the scaler is fitted on the training set only and the
# same fitted scaler is then applied to both the training and the test set
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Convert the training labels to a NumPy array.
# np.asarray is used instead of `.values` so that re-running this cell is harmless:
# once y_train is already an ndarray, `.values` would raise AttributeError.
y_train = np.asarray(y_train)

## 6. Building the Tensorflow Model

In [None]:
# Feed-forward binary classifier: two ReLU hidden layers (16 and 8 units) followed
# by a single sigmoid output unit.
# The input width is read from X_train instead of being hard-coded, so the network
# stays in sync with the feature matrix if the set of encoded columns changes.
n_features = X_train.shape[1]

classifier = Sequential()
classifier.add(Dense(units=16, activation='relu', input_dim=n_features))
classifier.add(Dense(units=8, activation='relu'))
classifier.add(Dense(units=1, activation='sigmoid'))

#### Compiling the classifier model with the Adam optimizer

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

#### Fitting the classifier model

In [None]:
# Train on the training set for 50 epochs using mini-batches of 10 samples
classifier.fit(
    x=X_train,
    y=y_train,
    batch_size=10,
    epochs=50,
)

We can see that the accuracy and loss of the model on the training dataset are 86% and 0.3146, respectively.

#### Getting the model's predictions on the test data

In [None]:
# Predicted probabilities for the test set; they help determine which customers
# have a high risk of leaving the bank
y_pred = classifier.predict(x=X_test)
y_pred

From the probabilities obtained above, the bank can segment its customers, identify the ones most at risk of leaving,
and develop ideas to retain them.

In [None]:
# Apply a 0.5 threshold to the probabilities to get boolean predictions for the confusion matrix
y_pred = np.greater(y_pred, 0.5)
y_pred

In [None]:
# Tally how many thresholded predictions are True and how many are False
countTrue = int(np.count_nonzero(y_pred[:, 0]))
countFalse = len(y_pred) - countTrue
print("countF :{} and countT: {}".format(countFalse,countTrue))

## 7. Results

In [None]:
# Confusion matrix of the actual test labels against the thresholded predictions
cm = confusion_matrix(y_true=y_test.values, y_pred=y_pred)
print(cm)

From our confusion matrix we conclude that:
1. **True positive:** 178 (we predicted a positive result and it was positive) - the model correctly predicted the customers who left the bank
2. **True negative:** 1545 (we predicted a negative result and it was negative) - the model correctly predicted the customers who stayed at the bank
3. **False positive:** 62 (we predicted a positive result and it was negative) - the model predicted that these customers left when they actually stayed
4. **False negative:** 215 (we predicted a negative result and it was positive) - the model predicted that these customers stayed when they actually left

In [None]:
# Text report of the classification metrics on the test set
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)