In [1]:
# TIME STAMP
import os
import time

def watermark(path):
    """Print a short provenance stamp for the file at ``path``.

    Three lines are written to stdout: the file's ctime, its mtime, and a
    fixed author line. Nothing is returned.

    Note: ``os.path.getctime`` reports the creation time on Windows but the
    time of the last metadata change on Unix, so the "Created" label is only
    literal on Windows.
    """
    # Each label is paired with the os.path function that supplies its timestamp;
    # the timestamps are read and printed in this order.
    stamps = (
        ("Created: %s", os.path.getctime),
        ("Last modified: %s", os.path.getmtime),
    )
    for template, read_timestamp in stamps:
        print(template % time.ctime(read_timestamp(path)))
    print("Author: Mattithyahu")

# Stamp this notebook file. The relative path is resolved against the current
# working directory. If a path contains backslashes (e.g. a Windows path),
# write it as a raw string or double the backslashes so they are not read as
# escape sequences.
watermark("P2 Bank churn prediction model.ipynb")

Created: Thu Oct 28 20:50:22 2021
Last modified: Sat Oct 30 13:16:33 2021
Author: Mattithyahu


In [21]:
# import libraries
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division

from chart_studio import plotly
import plotly.offline as pyoff
import plotly.graph_objs as go

In [None]:
# Exploratory data analysis
# Feature engineering
# Investigating how the features affect Retention by using Logistic Regression
# Building a classification model with XGBoost

In [22]:
# Exploratory data analysis

# Read the customer table from CSV into a DataFrame.
df = pd.read_csv("Bankchurners.csv")

# Quick-look helpers while exploring: df.head(10) for the first rows,
# df.dtypes for the column types.

# .info() is the most compact way to see the columns, non-null counts and dtypes.
df.info() # No null values found

# The data fall under two categories:
# Categorical features: Gender, Card_category, Education_Level etc.
# Numerical features: Dependent_count, Months_on_book, Credit_Limit etc.

# Attrition_Flag is loaded as the strings 'Existing Customer' / 'Attrited Customer'.
# Encode it as 0 / 1 so that the group mean of the column is directly the churn rate.
# Mapping the whole column and casting gives a real integer dtype; writing ints
# into the string column through boolean masks would leave the resulting dtype
# up to pandas' setitem rules instead.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
attrition_encoded = df['Attrition_Flag'].map(attrition_codes)

# Labels missing from the mapping come back as NaN. Stop here and name the
# offending values rather than letting the integer cast fail less helpfully.
unmapped = attrition_encoded.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected Attrition_Flag values: %r"
        % sorted(df.loc[unmapped, 'Attrition_Flag'].astype(str).unique())
    )
df['Attrition_Flag'] = attrition_encoded.astype('int64')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 23 columns):
 #   Column                                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                                              --------------  -----  
 0   CLIENTNUM                                                                                                                           10127 non-null  int64  
 1   Attrition_Flag                                                                                                                      10127 non-null  object 
 2   Customer_Age                                                                                                                        10127 non-null  int64  
 3   Gender                                                                           

# **Gender**

In [24]:
# Churn rate by gender. Attrition_Flag holds 0/1, so its mean within each Gender
# group is the share of attrited customers in that group.
# groupby(...).mean() returns a Series indexed by Gender; reset_index() moves
# that index into an ordinary column and adds a fresh sequential index, which
# turns the result into a DataFrame.
df_plot = df.groupby('Gender')['Attrition_Flag'].mean().reset_index()

# One bar per gender, each with its own width and colour entry.
plot_data = [
    go.Bar(
        x=df_plot['Gender'],
        y=df_plot['Attrition_Flag'],
        width=[0.5, 0.5],
        marker={'color': ['green', 'blue']},
    )
]

# Categorical x axis, labelled y axis, light grey plot and paper backgrounds.
plot_layout = go.Layout(
    title='Gender',
    xaxis=dict(type='category'),
    yaxis=dict(title='Churn Rate'),
    plot_bgcolor='rgb(243,243,243)',
    paper_bgcolor='rgb(243,243,243)',
)

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

# Show the underlying numbers below the chart.
df_plot



Unnamed: 0,Gender,Attrition_Flag
0,F,0.173572
1,M,0.146152


### Female customers are more likely to churn than male customers, but the difference is small (~2.74 percentage points: 17.36% vs. 14.62%).

# **Education_Level**

In [25]:
# Churn rate by education level: the mean of the 0/1 Attrition_Flag within each
# Education_Level group. reset_index() turns the grouped Series into a
# two-column DataFrame.
df_plot = df.groupby('Education_Level').Attrition_Flag.mean().reset_index()

# Alternate green/blue across the bars. The list is sized from the data because
# a fixed two-item list only covers the first two education levels.
bar_palette = ['green', 'blue']
bar_colors = [bar_palette[i % len(bar_palette)] for i in range(len(df_plot))]

plot_data = [
    go.Bar(
        x=df_plot['Education_Level'],
        y=df_plot['Attrition_Flag'],
        width=0.5,  # scalar width applies to every bar, however many levels there are
        marker=dict(
        color=bar_colors)
    )
]
plot_layout = go.Layout(
        xaxis={"type": "category"},
        yaxis={"title": "Churn Rate"},
        title='Education_Level',
        plot_bgcolor  = 'rgb(243,243,243)',
        paper_bgcolor  = 'rgb(243,243,243)',
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

# Show the underlying numbers below the chart.
df_plot

Unnamed: 0,Education_Level,Attrition_Flag
0,College,0.152024
1,Doctorate,0.210643
2,Graduate,0.155691
3,High School,0.152012
4,Post-Graduate,0.178295
5,Uneducated,0.159381
6,Unknown,0.168532
