## **Data for Dashboard deployment**

In [40]:
# Importing project dependencies 

# Data handling
import pandas as pd
import numpy as np

# Feature Engineering and Modelling 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Model export for deployment 
import pickle

# Supressing warning messages 
from warnings import filterwarnings
filterwarnings("ignore")  

In [41]:
# Reading data into dataframe 
data = pd.read_csv('../Data/Churn_Modelling.csv')

# (SQL standard) Formatting column headers by removing potential capital letters and spaces in column headers 
data.columns = data.columns.str.lower()
data.columns = data.columns.str.replace(' ','_')

In [43]:
# Dropping fields that wont benefit the deployment
data.drop(labels=['rownumber','customerid','surname','exited'], axis=1, inplace=True)

In [44]:
# One Hot encoding for remaining categorical field 
data = pd.get_dummies(data, drop_first = False)

# Viewing first 5 rows of data 
data.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,geography_France,geography_Germany,geography_Spain,gender_Female,gender_Male
0,619,42,2,0.0,1,1,1,101348.88,1,0,0,1,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,1,1,0
2,502,42,8,159660.8,3,1,0,113931.57,1,0,0,1,0
3,699,39,1,0.0,2,0,0,93826.63,1,0,0,1,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,1,1,0


In [45]:

scale_vars = ['creditscore', 'age', 'tenure', 'balance', 'numofproducts', 'estimatedsalary' ]

scaler = MinMaxScaler()


#scaling
scale_vars = ['creditscore','estimatedsalary','balance','age']
data[scale_vars] = scaler.fit_transform(data[scale_vars])


data.head()



from sklearn.preprocessing import RobustScaler


Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,geography_France,geography_Germany,geography_Spain,gender_Female,gender_Male
0,0.538,0.324324,2,0.0,1,1,1,0.506735,1,0,0,1,0
1,0.516,0.310811,1,0.334031,1,0,1,0.562709,0,0,1,1,0
2,0.304,0.324324,8,0.636357,3,1,0,0.569654,1,0,0,1,0
3,0.698,0.283784,1,0.0,2,0,0,0.46912,1,0,0,1,0
4,1.0,0.337838,2,0.500246,1,1,1,0.3954,0,0,1,1,0


In [46]:
# Loading pickle model 
model = pickle.load(open(".././rf_churn_model.pkl", "rb"))

In [47]:
pred_rf = model.predict(data.values)
pred_prob_rf = model.predict_proba(data.values)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [48]:
# probabilities
pred_prob_rf

array([[0.77, 0.23],
       [0.69, 0.31],
       [0.17, 0.83],
       ...,
       [0.73, 0.27],
       [0.25, 0.75],
       [0.8 , 0.2 ]])

In [49]:
# function to select second column for probabilities
def column(matrix, i):
    return [row[i] for row in matrix]

column(pred_prob_rf, 1)

[0.23,
 0.31,
 0.83,
 0.78,
 0.18,
 0.72,
 0.66,
 0.87,
 0.79,
 0.15,
 0.82,
 0.68,
 0.72,
 0.78,
 0.66,
 0.83,
 0.22,
 0.64,
 0.14,
 0.68,
 0.66,
 0.78,
 0.35,
 0.73,
 0.22,
 0.76,
 0.16,
 0.78,
 0.15,
 0.68,
 0.82,
 0.15,
 0.87,
 0.62,
 0.69,
 0.1,
 0.17,
 0.24,
 0.18,
 0.82,
 0.16,
 0.19,
 0.14,
 0.19,
 0.29,
 0.71,
 0.11,
 0.16,
 0.09,
 0.76,
 0.71,
 0.84,
 0.81,
 0.06,
 0.08,
 0.19,
 0.13,
 0.04,
 0.38,
 0.24,
 0.08,
 0.79,
 0.86,
 0.37,
 0.17,
 0.73,
 0.14,
 0.68,
 0.84,
 0.14,
 0.8,
 0.08,
 0.11,
 0.75,
 0.74,
 0.72,
 0.74,
 0.18,
 0.2,
 0.8,
 0.88,
 0.17,
 0.76,
 0.24,
 0.73,
 0.78,
 0.16,
 0.68,
 0.75,
 0.74,
 0.68,
 0.67,
 0.73,
 0.69,
 0.78,
 0.18,
 0.2,
 0.66,
 0.85,
 0.78,
 0.22,
 0.78,
 0.08,
 0.68,
 0.35,
 0.72,
 0.07,
 0.25,
 0.1,
 0.11,
 0.09,
 0.74,
 0.07,
 0.17,
 0.78,
 0.77,
 0.21,
 0.72,
 0.12,
 0.71,
 0.83,
 0.13,
 0.74,
 0.13,
 0.17,
 0.16,
 0.31,
 0.22,
 0.74,
 0.1,
 0.69,
 0.11,
 0.15,
 0.74,
 0.7,
 0.78,
 0.22,
 0.11,
 0.44,
 0.93,
 0.11,
 0.66,
 0.18,
 0.22,


In [50]:
# 9.4. Joining the raw data witht the predictions

output = data.copy()
output['Predictions - Churn or Not'] = pred_rf
output['Predictions - Probability to Churn'] = column(pred_prob_rf, 1)
output['Predictions - Churn or Not Desc'] = 'Empty'
output['Predictions - Churn or Not Desc'][output['Predictions - Churn or Not'] == 0] = 'Retention'
output['Predictions - Churn or Not Desc'][output['Predictions - Churn or Not'] == 1] = 'Churn'
output.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,geography_France,geography_Germany,geography_Spain,gender_Female,gender_Male,Predictions - Churn or Not,Predictions - Probability to Churn,Predictions - Churn or Not Desc
0,0.538,0.324324,2,0.0,1,1,1,0.506735,1,0,0,1,0,0,0.23,Retention
1,0.516,0.310811,1,0.334031,1,0,1,0.562709,0,0,1,1,0,0,0.31,Retention
2,0.304,0.324324,8,0.636357,3,1,0,0.569654,1,0,0,1,0,1,0.83,Churn
3,0.698,0.283784,1,0.0,2,0,0,0.46912,1,0,0,1,0,1,0.78,Churn
4,1.0,0.337838,2,0.500246,1,1,1,0.3954,0,0,1,1,0,0,0.18,Retention
