In [74]:
#importing the libraries
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 


In [75]:
# Load the customer churn records from the CSV file in the working directory
churn_data = pd.read_csv(filepath_or_buffer="churn_prediction.csv")

In [76]:
# Dataset dimensions as a (number of rows, number of columns) tuple
churn_data.shape


(28382, 21)

In [77]:
# Preview the first five rows as a sanity check on the load
churn_data.head(n=5)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [78]:
# Number of missing (NaN) entries in each column
churn_data.isna().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

In [79]:
# Frequency of each churn label (0 / 1), to see how balanced the target is
churn_data['churn'].value_counts()

0    23122
1     5260
Name: churn, dtype: int64

In [80]:
# Column dtypes: the object (text) columns are the ones that need numeric encoding later
churn_data.dtypes

customer_id                         int64
vintage                             int64
age                                 int64
gender                             object
dependents                        float64
occupation                         object
city                              float64
customer_nw_category                int64
branch_code                         int64
days_since_last_transaction       float64
current_balance                   float64
previous_month_end_balance        float64
average_monthly_balance_prevQ     float64
average_monthly_balance_prevQ2    float64
current_month_credit              float64
previous_month_credit             float64
current_month_debit               float64
previous_month_debit              float64
current_month_balance             float64
previous_month_balance            float64
churn                               int64
dtype: object

In [81]:
# Fill the missing values.
# Every column that has gaps is imputed with its most frequent value (the first mode).
#
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on churn_data[column]: that chained in-place form operates on
# an intermediate object, emits a FutureWarning on pandas 2.x, and leaves churn_data
# unchanged once Copy-on-Write is active (the default from pandas 3.0).
columns_with_missing_values = [
    'dependents',
    'gender',
    'city',
    'occupation',
    'days_since_last_transaction',
]
for column_name in columns_with_missing_values:
    # mode() returns the most frequent values in sorted order; take the first one
    most_frequent_value = churn_data[column_name].mode()[0]
    churn_data[column_name] = churn_data[column_name].fillna(most_frequent_value)


In [82]:

# Re-count the missing values per column; every count should now be zero
churn_data.isna().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
days_since_last_transaction       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
dtype: int64

In [83]:
# One-hot encode the two text columns: each category becomes its own indicator column
# and the original 'gender' / 'occupation' columns are replaced.
categorical_columns = ['gender', 'occupation']
churn_data = pd.get_dummies(data=churn_data, columns=categorical_columns)

In [84]:
# Inspect the first five rows after one-hot encoding
churn_data.head(n=5)

Unnamed: 0,customer_id,vintage,age,dependents,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,...,current_month_balance,previous_month_balance,churn,gender_Female,gender_Male,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,3135,66,0.0,187.0,2,755,224.0,1458.71,1458.71,...,1458.71,1458.71,0,0,1,0,0,0,1,0
1,2,310,35,0.0,1020.0,2,3214,60.0,5390.37,8704.66,...,6496.78,8787.61,0,0,1,0,0,0,1,0
2,4,2356,31,0.0,146.0,2,41,0.0,3913.16,5815.29,...,5006.28,5070.14,0,0,1,0,0,1,0,0
3,5,478,90,0.0,1020.0,2,582,147.0,2291.91,2291.91,...,2291.91,1669.79,1,0,1,0,0,0,1,0
4,6,2531,42,2.0,1494.0,3,388,58.0,927.72,1401.72,...,1157.15,1677.16,1,0,1,0,0,0,1,0


In [85]:
# Feature scaling: standardise selected numeric columns so they share a comparable scale
from sklearn.preprocessing import StandardScaler

# Columns rewritten in place with their standardised values.
# NOTE(review): average_monthly_balance_prevQ / prevQ2 and the four credit/debit columns are
# not in this list and keep their original scale - confirm that this is intended.
columns_for_ft_scaling = [
    'days_since_last_transaction',
    'current_balance',
    'previous_month_end_balance',
    'current_month_balance',
    'previous_month_balance',
    'branch_code',
    'city',
    'vintage',
]

# NOTE(review): the scaler is fitted on every row, before the train/test split performed
# later in the notebook, so the test rows contribute to the scaling statistics.
standardScaler = StandardScaler()
scaled_values = standardScaler.fit_transform(churn_data[columns_for_ft_scaling])
churn_data[columns_for_ft_scaling] = scaled_values

In [86]:
# Inspect the first five rows after feature scaling
churn_data.head(n=5)

Unnamed: 0,customer_id,vintage,age,dependents,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,...,current_month_balance,previous_month_balance,churn,gender_Female,gender_Male,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,0.478644,66,0.0,-1.436917,2,-0.182318,1.921844,-0.139017,-0.141953,...,-0.142564,-0.142265,0,0,1,0,0,0,1,0
1,2,-1.275909,35,0.0,0.507942,2,2.439824,-0.024315,-0.04672,0.028425,...,-0.022705,0.030459,0,0,1,0,0,0,1,0
2,4,-0.005178,31,0.0,-1.532642,2,-0.943689,-0.736325,-0.081398,-0.039514,...,-0.058165,-0.057152,0,0,1,0,0,1,0,0
3,5,-1.171568,90,0.0,0.507942,2,-0.366796,1.008098,-0.119457,-0.122361,...,-0.122742,-0.13729,1,0,1,0,0,0,1,0
4,6,0.103512,42,2.0,1.61462,3,-0.573667,-0.048049,-0.151482,-0.143293,...,-0.149738,-0.137116,1,0,1,0,0,0,1,0


In [87]:
# Build the feature frame x and the target y.
# The target column and the customer_id identifier are both excluded from the features.
x = churn_data.drop(columns=['churn', 'customer_id'])
y = churn_data['churn']

In [88]:
# Hold out 30% of the rows as the test set (70% for training); random_state fixes the shuffle
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.30, random_state=50)
X_train, X_test, y_train, y_test = split_parts

In [89]:
# Machine Learning classification model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [90]:
# Logistic regression: fit on the training split (fit returns the estimator itself)
logmodel = LogisticRegression(random_state=50).fit(X_train, y_train)

# Predicted churn labels for the held-out test rows
pred = logmodel.predict(X_test)

# Test accuracy as a percentage, rounded to two decimals.
# NOTE(review): label 0 accounts for 23122 of the 28382 rows (about 81.5%) in the full
# dataset, so accuracy on its own says little here; consider also reporting
# precision/recall for the churn class.
logmodel_accuracy = round(100 * metrics.accuracy_score(y_test, pred), 2)
print(logmodel_accuracy)

81.57


In [91]:
# K-nearest-neighbours classifier: 5 neighbours with the Minkowski metric
# (p=2 is the Euclidean distance, p=1 would be the Manhattan distance)
from sklearn.neighbors import KNeighborsClassifier

knnmodel = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)

# Predicted churn labels for the held-out test rows
knn_pred = knnmodel.predict(X_test)

# Test accuracy as a percentage, rounded to two decimals
knn_accuracy = round(100 * metrics.accuracy_score(y_test, knn_pred), 2)
print(knn_accuracy)

81.49


In [92]:
# Decision tree classifier using the Gini split criterion, seeded for reproducibility
from sklearn.tree import DecisionTreeClassifier

dtmodel = DecisionTreeClassifier(criterion="gini", random_state=50).fit(X_train, y_train)

# Predicted churn labels for the held-out test rows
dt_pred = dtmodel.predict(X_test)

# Test accuracy as a percentage, rounded to two decimals
dt_accuracy = round(100 * metrics.accuracy_score(y_test, dt_pred), 2)
print(dt_accuracy)

78.8


In [93]:
# Random forest classifier: 100 trees, entropy split criterion.
# Note that this model is seeded with random_state=0, unlike the value 50 used by the other cells.
from sklearn.ensemble import RandomForestClassifier

rfmodel = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predicted churn labels for the held-out test rows
rf_pred = rfmodel.predict(X_test)

# Test accuracy as a percentage, rounded to two decimals
rf_accuracy = round(100 * metrics.accuracy_score(y_test, rf_pred), 2)
print(rf_accuracy)

86.94


In [94]:
# Rank the four models by their test accuracy, best first
model_names = ['Logistic Regression', 'K-Nearest Neighbor', 'Decision Tree', 'Random Forest']
model_scores = [logmodel_accuracy, knn_accuracy, dt_accuracy, rf_accuracy]
Model_Comparison = pd.DataFrame({'Model': model_names, 'Score': model_scores})

# The stored ranking is indexed by Score; the final expression only displays a copy
# with a fresh 0..n-1 index and does not change Model_Comparison_df.
Model_Comparison_df = (
    Model_Comparison
    .sort_values(by='Score', ascending=False)
    .set_index('Score')
)
Model_Comparison_df.reset_index()

Unnamed: 0,Score,Model
0,86.94,Random Forest
1,81.57,Logistic Regression
2,81.49,K-Nearest Neighbor
3,78.8,Decision Tree


In [95]:
# Score every customer with the fitted random forest, using the same feature columns
# (and column order) as the test split. Column 1 of predict_proba is stored as the
# churn probability (presumably the column for label 1 - confirm via rfmodel.classes_).
# NOTE(review): churn_data also contains the rows the forest was trained on, so the
# probabilities for those customers are not out-of-sample estimates.
# NOTE(review): this adds a column to churn_data; re-running the earlier cell that builds x
# afterwards would pick 'Probability_of_Churn' up as a feature.
feature_columns = X_test.columns
churn_probabilities = rfmodel.predict_proba(churn_data[feature_columns])
churn_data['Probability_of_Churn'] = churn_probabilities[:, 1]

In [96]:
# Show the customer id next to its predicted churn probability for the first five rows
churn_data.loc[:, ['customer_id', 'Probability_of_Churn']].head(n=5)

Unnamed: 0,customer_id,Probability_of_Churn
0,1,0.01
1,2,0.46
2,4,0.14
3,5,0.66
4,6,0.78
