In [251]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from statistics import mode
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [228]:
# Load the churn dataset from a CSV located relative to the working directory.
data = pd.read_csv('churn_prediction.csv')
# Preview the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,days_since_last_transaction,...,previous_month_end_balance,average_monthly_balance_prevQ,average_monthly_balance_prevQ2,current_month_credit,previous_month_credit,current_month_debit,previous_month_debit,current_month_balance,previous_month_balance,churn
0,1,3135,66,Male,0.0,self_employed,187.0,2,755,224.0,...,1458.71,1458.71,1449.07,0.2,0.2,0.2,0.2,1458.71,1458.71,0
1,2,310,35,Male,0.0,self_employed,,2,3214,60.0,...,8704.66,7799.26,12419.41,0.56,0.56,5486.27,100.56,6496.78,8787.61,0
2,4,2356,31,Male,0.0,salaried,146.0,2,41,,...,5815.29,4910.17,2815.94,0.61,0.61,6046.73,259.23,5006.28,5070.14,0
3,5,478,90,,,self_employed,1020.0,2,582,147.0,...,2291.91,2084.54,1006.54,0.47,0.47,0.47,2143.33,2291.91,1669.79,1
4,6,2531,42,Male,2.0,self_employed,1494.0,3,388,58.0,...,1401.72,1643.31,1871.12,0.33,714.61,588.62,1538.06,1157.15,1677.16,1


In [229]:
# (rows, columns) of the loaded frame.
data.shape

(28382, 21)

In [230]:
# Number of missing values in each column.
data.isnull().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
days_since_last_transaction       3223
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
dtype: int64

In [231]:
# Per-column overview of the raw data: dtype, number of missing values and
# number of distinct values, indexed by column name.
temp = pd.DataFrame(index=data.columns).assign(
    Data_Type=data.dtypes,
    Null_Count=data.isnull().sum(),
    Unique_Count=data.nunique(),
)

In [232]:
# Display the per-column dtype / null-count / unique-count summary.
temp

Unnamed: 0,Data_Type,Null_Count,Unique_Count
customer_id,int64,0,28382
vintage,int64,0,5473
age,int64,0,90
gender,object,525,2
dependents,float64,2463,15
occupation,object,80,5
city,float64,803,1604
customer_nw_category,int64,0,3
branch_code,int64,0,3185
days_since_last_transaction,float64,3223,360


In [233]:
# Peek at four of the columns that contain missing values.
data[['gender','dependents', 'city', 'days_since_last_transaction']].head()

Unnamed: 0,gender,dependents,city,days_since_last_transaction
0,Male,0.0,187.0,224.0
1,Male,0.0,,60.0
2,Male,0.0,146.0,
3,,,1020.0,147.0
4,Male,2.0,1494.0,58.0


In [234]:
# Impute missing values: most frequent value (mode) for gender, occupation and
# city; column mean for dependents and days_since_last_transaction.
#
# The fill values are collected first and applied with a single frame-level
# fillna that returns a new DataFrame. The earlier
# `data[col].fillna(..., inplace=True)` form is chained assignment through an
# inplace method: recent pandas 2.x releases warn about it (the warning is
# hidden here because warnings are globally ignored) and with Copy-on-Write it
# leaves `data` unmodified.
#
# NOTE(review): the statistics are computed on the full dataset, i.e. before
# the train/test split, and mean imputation produces fractional `dependents`.
fill_values = {
    'gender': data['gender'].mode()[0],
    'dependents': data['dependents'].mean(),
    'occupation': data['occupation'].mode()[0],
    'city': data['city'].mode()[0],
    'days_since_last_transaction': data['days_since_last_transaction'].mean(),
}
data = data.fillna(value=fill_values)

# Confirm that no missing values remain.
data.isnull().sum()

customer_id                       0
vintage                           0
age                               0
gender                            0
dependents                        0
occupation                        0
city                              0
customer_nw_category              0
branch_code                       0
days_since_last_transaction       0
current_balance                   0
previous_month_end_balance        0
average_monthly_balance_prevQ     0
average_monthly_balance_prevQ2    0
current_month_credit              0
previous_month_credit             0
current_month_debit               0
previous_month_debit              0
current_month_balance             0
previous_month_balance            0
churn                             0
dtype: int64

In [235]:
# One-hot encode the non-numeric columns (gender, occupation); numeric columns
# are passed through unchanged.
data = pd.get_dummies(data)

In [236]:
# Separate the independent variables (features) from the dependent variable
# (the `churn` target).
# NOTE(review): only `churn` is dropped, so `customer_id` remains among the
# features -- confirm an identifier column is meant to be a model input.
y = data['churn']
x = data.drop(columns=['churn'])
(x.shape, y.shape)

((28382, 25), (28382,))

In [237]:
# Hold out a test set (default split size), stratified on the target so both
# partitions keep the same churn ratio; the seed makes the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=56,
)

In [238]:
# Preview the training features after encoding and splitting.
train_x.head()

Unnamed: 0,customer_id,vintage,age,dependents,city,customer_nw_category,branch_code,days_since_last_transaction,current_balance,previous_month_end_balance,...,previous_month_debit,current_month_balance,previous_month_balance,gender_Female,gender_Male,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
26129,27897,2443,71,0.0,956.0,3,3699,18.0,12832.61,3082.16,...,13014.84,22863.56,3532.39,1,0,0,0,0,1,0
13757,14685,1700,62,0.0,1096.0,3,1360,91.0,5648.84,5645.44,...,0.09,5648.49,5645.44,1,0,0,0,0,1,0
1085,1159,5679,36,0.0,146.0,2,4101,7.0,1415.11,2430.07,...,1153.55,1386.71,2286.96,0,1,0,0,0,1,0
2418,2586,1907,57,3.0,1181.0,2,235,69.997814,4941.59,4941.59,...,0.31,4941.59,4941.59,1,0,0,0,0,1,0
18577,19823,907,39,0.347236,146.0,3,2815,47.0,463.32,9034.75,...,0.36,3714.55,8398.75,0,1,0,0,0,1,0


In [239]:
# Base learner 1: logistic regression with default settings.
# NOTE(review): warnings are globally silenced in the import cell, so a
# solver convergence warning would not be visible here -- confirm convergence.
model1 = LogisticRegression().fit(train_x, train_y)
pred1 = model1.predict(test_x)

# First ten test predictions and the test-set accuracy.
(pred1[:10], model1.score(test_x, test_y))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64), 0.8248308906426156)

In [240]:
# Base learner 2: k-nearest neighbours using the 5 closest training points.
model2 = KNeighborsClassifier(n_neighbors=5).fit(train_x, train_y)
pred2 = model2.predict(test_x)

# First ten test predictions and the test-set accuracy.
(pred2[:10], model2.score(test_x, test_y))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64), 0.8407553551296505)

In [241]:
# Base learner 3: decision tree limited to depth 7.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split (even with the default "best" splitter), so an
# unseeded tree can differ between runs and make the reported accuracy and the
# ensemble vote non-reproducible. The seed matches the one used by the other
# decision tree in this notebook.
model3 = DecisionTreeClassifier(max_depth=7, random_state=10)
model3.fit(train_x,train_y)
pred3=model3.predict(test_x)

# First ten test predictions and the test-set accuracy.
pred3[:10], model3.score(test_x, test_y)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64), 0.8470969560315671)

In [248]:
# Base learner 4: random forest of 100 shallow (depth-4) trees, seeded for
# repeatable results.
model4 = RandomForestClassifier(
    n_estimators=100,
    max_depth=4,
    random_state=3,
).fit(train_x, train_y)
pred4 = model4.predict(test_x)

# First ten test predictions and the test-set accuracy.
(pred4[:10], model4.score(test_x, test_y))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64), 0.8540022547914318)

In [256]:
# Base learner 5: a second, shallower decision tree (depth 3), seeded for
# repeatable results.
model5 = DecisionTreeClassifier(random_state=10, max_depth=3).fit(train_x, train_y)
pred5 = model5.predict(test_x)

# First ten test predictions and the test-set accuracy.
(pred5[:10], model5.score(test_x, test_y))

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64), 0.8520293122886133)

In [258]:
# Test-set accuracy of each base learner, collected in model order so they can
# be compared side by side with the ensemble result.
m1_score, m2_score, m3_score, m4_score, m5_score = (
    clf.score(test_x, test_y)
    for clf in (model1, model2, model3, model4, model5)
)
(m1_score, m2_score, m3_score, m4_score, m5_score)

(0.8248308906426156,
 0.8407553551296505,
 0.8470969560315671,
 0.8540022547914318,
 0.8520293122886133)

In [259]:
# Hard-voting ensemble: for every test row take the most common label among
# the five base-model predictions.
# The votes are gathered in a single pass and converted to an array once;
# growing the array with np.append inside a loop copies it on every iteration
# (quadratic in the number of rows), and seeding it with an empty float array
# turned the class labels into floats. The result now keeps the integer label
# dtype of the base predictions.
final_pred = np.array([
    mode(votes)
    for votes in zip(pred1, pred2, pred3, pred4, pred5)
])

In [260]:
# Accuracy of the majority-vote ensemble on the held-out test set.
accuracy_score(y_true=test_y, y_pred=final_pred)

0.854847801578354