In [None]:
# Import the required libraries
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.cluster import KMeans

Data Exploration

In [None]:
# Lift pandas' display limits so that every column and every row is rendered.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# Read the dataset into a pandas DataFrame and preview the first rows.
# Note: the path below is a blank placeholder; replace it with the location
# of your local CSV file before running this cell.
csv_path = r" "
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Remove the two trailing columns of the frame and preview the result.
# The selection is positional, so running this cell a second time would
# remove two more columns.
trailing_columns = df.columns[-2:]
df = df.drop(columns=trailing_columns)
df.head()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

In [None]:
# Per-column overview: column names, non-null counts and dtypes
df.info()

In [None]:
# Number of missing values in each column (all zeros means no nulls)
df.isna().sum()

In [None]:
# Mean of every numeric variable (age included), split by the target
# Attrition_Flag. numeric_only=True skips the text columns, which cannot be
# averaged and make a plain "mean" aggregation raise on recent pandas versions.
df.groupby("Attrition_Flag").mean(numeric_only=True)

In [None]:
# Which gender left the most: customers per Gender broken down by
# Attrition_Flag. A plain count per gender only gives the group sizes, so the
# attrition split is shown explicitly; the "All" margin keeps the per-gender
# totals visible as well.
pd.crosstab(df["Gender"], df["Attrition_Flag"], margins=True)

In [None]:
# Number of rows whose CLIENTNUM equals the placeholder string "unknown".
# sum() counts the True entries of the comparison; count() would return the
# number of rows regardless of how many of them matched.
(df["CLIENTNUM"] == "unknown").sum()

In [None]:
# Summary statistics for the numeric variables
df.describe()

In [None]:
# Share (in percent) of existing vs. attrited customers
attrition_counts = df['Attrition_Flag'].value_counts()
attrition_total = df['Attrition_Flag'].count()
attrition_counts / attrition_total * 100

In [None]:
# Share (in percent) of male and female customers in the dataset.
# normalize=True divides by the number of non-null Gender values, so the
# percentages always sum to 100 and do not depend on another column's count.
df['Gender'].value_counts(normalize=True) * 100

In [None]:
# Encode the target column as numbers: 1 = existing customer, 0 = attrited.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the selected column: that chained in-place
# call is deprecated and leaves df unchanged under pandas Copy-on-Write.
# Values without an entry in the mapping (including already-encoded 0/1) are
# passed through untouched, so re-running this cell is harmless.
attrition_codes = {'Existing Customer': 1, 'Attrited Customer': 0}
df['Attrition_Flag'] = df['Attrition_Flag'].map(
    lambda flag: attrition_codes.get(flag, flag)
)

In [None]:
# One-hot encode the categorical columns with pandas.get_dummies.
# A single call replaces each listed column by its indicator columns (named
# "<column>_<value>") appended after the remaining columns, in the order of
# categorical_columns; df itself is left unchanged.
# NOTE(review): OneHotEncoder is not used in this cell; the import is kept in
# case a later cell relies on it.
from sklearn.preprocessing import OneHotEncoder
categorical_columns = ['Gender', 'Education_Level', 'Marital_Status','Income_Category', 'Card_Category']
final_df = pd.get_dummies(df, columns=categorical_columns)

Data Visualization

In [None]:
# Overview of the target and the categorical features on a 3x2 grid:
# pie charts for Attrition_Flag, Gender and Card_Category, and count plots for
# Education_Level, Marital_Status and Income_Category.
import matplotlib.gridspec as gridspec


def _share_pie(ax, column, display_names=None):
    """Draw the value shares of df[column] as a pie chart on ax.

    display_names optionally maps raw column values to the text shown next to
    each wedge; values without an entry are shown unchanged.
    """
    counts = df[column].value_counts()
    names = display_names or {}
    wedge_labels = [names.get(value, value) for value in counts.index]
    ax.pie(counts, labels=wedge_labels, autopct='%.1f%%',
           shadow=True, wedgeprops={'edgecolor': 'black'})
    ax.set_title(f'Proportion of {column}')


fig = plt.figure(constrained_layout=False, figsize=(17, 20))
spec = gridspec.GridSpec(ncols=2, nrows=3, figure=fig)
# Axes are created row by row: ax1/ax2 on the first row, ax3/ax4 on the
# second, ax5/ax6 on the third.
ax1, ax2, ax3, ax4, ax5, ax6 = [
    fig.add_subplot(spec[row, col]) for row in range(3) for col in range(2)
]

# Attrition_Flag may already be encoded as 1/0 when this cell runs; translate
# the codes back to readable names so the wedges are not labelled "1" and "0".
# Text labels pass through unchanged.
_share_pie(ax1, 'Attrition_Flag',
           {1: 'Existing Customer', 0: 'Attrited Customer'})
_share_pie(ax2, 'Gender')
_share_pie(ax6, 'Card_Category')

# Count plots for the remaining categorical features
for ax, column in ((ax3, 'Education_Level'),
                   (ax4, 'Marital_Status'),
                   (ax5, 'Income_Category')):
    sns.countplot(ax=ax, x=df[column])
    ax.set_title(f'{column} of Customers')

In [None]:
# Correlation heatmap of the numeric variables.
# Only numeric columns are correlated: df still contains text columns, and
# calling corr() on those raises on recent pandas versions.
plt.figure(figsize=(15,10))
correlations = df.select_dtypes(include='number').corr()
sns.heatmap(correlations.round(2), cmap='RdBu', annot=True,
            annot_kws={"size": 7}, vmin=-1, vmax=1);