# ToolBox Loading

In [1]:
#Standard library
import os
import warnings

#Data Manipulation
import numpy as np
import pandas as pd

#Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

#Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#Model evaluation
from sklearn.metrics import *

#Warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'imblearn'

# Data Exploration

In [None]:
# Location of the churn CSV. Set the CHURN_DATA_PATH environment variable to run
# the notebook on another machine; the original local path remains the fallback.
DATA_PATH = os.environ.get('CHURN_DATA_PATH', r'C:\Users\USER1\Desktop\credit_card_churn.csv')
data = pd.read_csv(DATA_PATH)
# (rows, columns) of the full dataset -- no train/test split exists at this point
print('data shape :', data.shape)

In [None]:
#checking first five rows in dataset
data.head()

In [None]:
#checking last five rows in dataset
data.tail()

In [None]:
#showing dataset information
data.info()

In [None]:
#showing statistical description of dataset
data.describe()

In [None]:
data.describe(include='object')

# Data Cleaning

In [None]:
#checking for null values in train set
print(f'Any NaN values? {data.isna().values.any()}')

In [None]:
#checking for duplicates values in train set
print(f'Any duplicates? {data.duplicated().values.any()}')

In [None]:
# remove the two Naive_Bayes_Classifier_* columns, which this notebook treats as irrelevant
naive_bayes_cols = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
data = data.drop(columns = naive_bayes_cols)

# Exploratory Data Analysis 

In [None]:
#showing a distribution of numerical columns in the dataset
data.hist(figsize = (20,11), color = 'k')

In [None]:
#checking percentage of target outcomes
data['Attrition_Flag'].value_counts()/len(data)

In [None]:
#showing barplot of attrition flag
data['Attrition_Flag'].value_counts().plot.bar()

In [None]:
#showing barplot of educational level
data['Education_Level'].value_counts().plot.bar()

In [None]:
data['Income_Category'].value_counts().plot.bar()

In [None]:
#showing barplot of card category
data['Card_Category'].value_counts().plot.bar()

In [None]:
#showing barplot of marital status
data['Marital_Status'].value_counts().plot.bar()

In [None]:
#showing barplot of gender
data['Gender'].value_counts().plot.bar()

In [None]:
#boxplot of credit limit-age distribution
plt.figure(figsize=(20,7))
sns.boxplot(data=data,y='Credit_Limit',x='Customer_Age')
plt.xlabel('Customers Age')
plt.ylabel('Credit_Limit')
plt.title('Credit Limit-Age Distribution')

In [None]:
#boxplot of credit limit-gender distribution
plt.figure(figsize=(20,7))
sns.boxplot(data=data,y='Credit_Limit',x='Gender')
plt.xlabel('Gender')
plt.ylabel('Credit_Limit')
plt.title('Credit Limit-Gender Distribution')

In [None]:
#boxplot of credit limit-income distribution
plt.figure(figsize=(20,7))
sns.boxplot(data=data,y='Credit_Limit',x='Income_Category')
plt.xlabel('Income_Category')
plt.ylabel('Credit_Limit')
plt.title('Credit Limit-Income Distribution')

In [None]:
#boxplot of credit limit-marital status distribution
plt.figure(figsize=(20,7))
sns.boxplot(data=data,y='Credit_Limit',x='Marital_Status')
plt.xlabel('Marital_Status')
plt.ylabel('Credit_Limit')
plt.title('Credit Limit-Marital_Status Distribution')

In [None]:
# box plot of credit limit for each card category
plt.figure(figsize=(20,7))
sns.boxplot(data=data,y='Credit_Limit',x='Card_Category')
# the x-axis label must name the column actually plotted on x
plt.xlabel('Card_Category')
plt.ylabel('Credit_Limit')
plt.title('Credit Limit-Card_Category Distribution')

In [None]:
#crosstable of gender and attrition flag columns
pd.crosstab(data['Gender'], data['Attrition_Flag'])

In [None]:
gen_bar = pd.crosstab(data['Gender'], data['Attrition_Flag'])
gen_bar.div(gen_bar.sum(axis = 1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (7,7))
plt.xlabel('Gender')
plt.ylabel('Percentage')

In [None]:
#crosstable of education level and attrition flag columns
pd.crosstab(data['Education_Level'], data['Attrition_Flag'])

In [None]:
edu_bar = pd.crosstab(data['Education_Level'], data['Attrition_Flag'])
edu_bar.div(edu_bar.sum(axis = 1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (8,8))
plt.xlabel('Education_Level')
plt.ylabel('Percentage')

In [None]:
#crosstable of marital status and attrition flag columns
pd.crosstab(data['Marital_Status'], data['Attrition_Flag'])

In [None]:
# stacked bars of attrition outcome shares per marital status (each row scaled to sum to 1)
mar_bar = pd.crosstab(data['Marital_Status'], data['Attrition_Flag'])
mar_bar.div(mar_bar.sum(axis = 1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (8,8))
# the x-axis holds the crosstab rows, i.e. Marital_Status
plt.xlabel('Marital_Status')
plt.ylabel('Percentage')

In [None]:
# counts of each attrition outcome per card category
pd.crosstab(index=data['Card_Category'], columns=data['Attrition_Flag'])

In [None]:
# stacked bars of attrition outcome shares per card category (each row scaled to sum to 1)
car_bar = pd.crosstab(data['Card_Category'], data['Attrition_Flag'])
car_bar.div(car_bar.sum(axis = 1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (8,8))
# the x-axis holds the crosstab rows, i.e. Card_Category
plt.xlabel('Card_Category')
plt.ylabel('Percentage')

In [None]:
# counts of each attrition outcome per income category
pd.crosstab(index=data['Income_Category'], columns=data['Attrition_Flag'])

In [None]:
# stacked bars of attrition outcome shares per income category (each row scaled to sum to 1)
inc_bar = pd.crosstab(data['Income_Category'], data['Attrition_Flag'])
inc_bar.div(inc_bar.sum(axis = 1).astype(float), axis = 0).plot(kind = 'bar', stacked = True, figsize = (8,8))
# the x-axis holds the crosstab rows, i.e. Income_Category
plt.xlabel('Income_Category')
plt.ylabel('Percentage')

# Feature Engineering

In [None]:
#encoding categorical columns
le = LabelEncoder()
data['Gender'] = le.fit_transform(data['Gender'])
data['Education_Level'] = le.fit_transform(data['Education_Level'])
data['Marital_Status'] = le.fit_transform(data['Marital_Status'])
data['Income_Category'] = le.fit_transform(data['Income_Category'])
data['Card_Category'] = le.fit_transform(data['Card_Category'])

In [None]:
#encoding target outcome
data['Attrition_Flag'] = data['Attrition_Flag'].map({'Existing Customer' : 1,'Attrited Customer':0})

In [None]:
#correlations among features in dataset
correlation = data.corr()
print(correlation['Attrition_Flag'].sort_values(ascending=False))

In [None]:
#heatmap of correlations in dataset
plt.figure(figsize = (22,10))
sns.heatmap(correlation, annot=True,cmap='RdYlBu')

In [None]:
#dropping irrelevant columns
data = data.drop(columns = ['CLIENTNUM','Total_Trans_Ct','Months_on_book',])

In [None]:
data.head()

In [None]:
# Defining the target & predictor variables 
X = data.drop('Attrition_Flag', axis = 1)
y = data['Attrition_Flag']

In [None]:
# Resampling the dataframe using SMOTE
smote = SMOTE(sampling_strategy=0.6)
X, y = smote.fit_resample(X, y)
y.value_counts()

In [None]:
#splitting target and predictor variables into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Modelling

### Logistic Regression

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
prediction = logreg.predict(X_test)

In [None]:
#classification report of model
print('Report:', classification_report(y_test, prediction))

In [None]:
#confusion matrix of logistic regression
cm = confusion_matrix(y_test, prediction, labels=[1, 0])
cm_matrix = pd.DataFrame(data=cm, columns=['Positive:1', 'Negative:0'], index=['Positive:1', 'Negative:0'])
sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
plt.title("The Confusion Matrix of the Predictions", y = 1.05)
plt.show()

### Decision Tree Classifier

In [None]:
clf = DecisionTreeClassifier(random_state = 20)
clf.fit(X_train, y_train)
prediction_clf = clf.predict(X_test)

In [None]:
#classification report of model
print('Report:', classification_report(y_test, prediction_clf))

In [None]:
#confudion matrix of decision tree classifier
cm_dt = confusion_matrix(y_test, prediction_clf, labels=[1, 0])
cm_matrix_dt = pd.DataFrame(data=cm_dt, columns=['Positive:1', 'Negative:0'], index=['Positive:1', 'Negative:0'])
sns.heatmap(cm_matrix_dt, annot=True, fmt='d', cmap='YlGnBu')
plt.title("The Confusion Matrix of the Predictions", y = 1.05)
plt.show()

### Random Forest Classifier

In [None]:
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train, y_train)
prediction_rf = rf_model.predict(X_test)

In [None]:
#classification report of model
print('Report:', classification_report(y_test, prediction_rf))

In [None]:
#confusion matrix of random forest classifier
cm_rf = confusion_matrix(y_test, prediction_rf, labels=[1, 0])
cm_matrix_rf = pd.DataFrame(data=cm_rf, columns=['Positive:1', 'Negative:0'], index=['Positive:1', 'Negative:0'])
sns.heatmap(cm_matrix_rf, annot=True, fmt='d', cmap='YlGnBu')
plt.title("The Confusion Matrix of the Predictions", y = 1.05)
plt.show()

In [None]:
def feature_importance(model,data):
    """Rank a fitted model's features by importance.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    data : feature matrix the importances refer to. When it has a ``columns``
        attribute (e.g. a DataFrame), those names label the features. A plain
        array carries no names, so the notebook-level predictor frame ``X``
        supplies them instead.

    Returns
    -------
    pandas.DataFrame
        Columns 'Columns' (feature name) and 'importance', sorted by
        importance in descending order.
    """
    # Prefer the names of the data actually passed in, not a hidden global
    columns = getattr(data, 'columns', None)
    if columns is None:
        # nameless array: fall back to the notebook's predictor frame
        columns = X.columns
    ranking = pd.DataFrame({'Columns': columns, 'importance': model.feature_importances_})
    return ranking.sort_values(by='importance', ascending=False)

In [None]:
feature_importance(rf_model,X_train)

In [None]:
#plotting feature importances
feature_importance(rf_model,X_train).plot('Columns','importance','barh',
                                    figsize=(12,8),legend=False)
plt.title('Feature Importance based on Random Forest Model')