In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the Telco churn CSV from the working directory and preview the first rows.
df = pd.read_csv("Telco-Customer-Churn.csv")
df.head(n=5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
# Remove rows with missing values.
# TotalCharges has to be coerced to numeric *before* dropping: this cell sits
# above the coercion cell in the notebook, so on a top-to-bottom run dropna()
# would otherwise execute while unparseable TotalCharges entries are still
# strings, and the NaNs created later by the coercion would never be removed.
# The recorded execution counts (coercion In [4], this cell In [5]) show that
# coerce-then-drop was the order actually used to produce the saved outputs.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(how='any', inplace=True)

In [4]:
# Convert TotalCharges to a numeric dtype; errors='coerce' turns any value
# that cannot be parsed as a number into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')

In [8]:

# Sanity check: show any rows whose TotalCharges is still missing
# (an empty frame means none remain).
df[df['TotalCharges'].isna()]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [9]:
# Bucket tenure into six 12-wide bands labelled "1 - 12" ... "61 - 72".
# Bins are left-closed / right-open ([1, 13), [13, 25), ..., [61, 73)), so any
# tenure outside 1..72 ends up as NaN in tenure_group.
labels = [
    "{lo} - {hi}".format(lo=start, hi=start + 11)
    for start in range(1, 72, 12)
]

df['tenure_group'] = pd.cut(
    df['tenure'],
    bins=range(1, 74, 12),
    right=False,
    labels=labels,
)

In [10]:
# Number of customers in each tenure band, most frequent first.
df.tenure_group.value_counts()

1 - 12     2175
61 - 72    1407
13 - 24    1024
25 - 36     832
49 - 60     832
37 - 48     762
Name: tenure_group, dtype: int64

In [11]:
# Drop customerID and the raw tenure column (tenure_group, derived from it,
# stays). This mutates df in place, so re-running the cell raises KeyError
# because the columns are already gone.
df.drop(['customerID', 'tenure'], axis='columns', inplace=True)
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1 - 12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25 - 36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1 - 12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37 - 48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1 - 12


In [12]:
# Binarise the target: 'Yes' -> 1, anything else -> 0.
# Not idempotent: re-running on the already-encoded column gives all zeros.
df['Churn'] = np.where(df['Churn'].eq('Yes'), 1, 0)
df.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0,1 - 12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0,25 - 36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1,1 - 12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,0,37 - 48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1,1 - 12


In [21]:
# Feature frame (everything except the Churn target) that the label-encoding
# cells below add their "<column>_n" columns to.
telco_data1 = df.drop(columns='Churn')

In [13]:

# Separate the Churn target from the remaining feature columns.
target = df['Churn']
telco_data = df.drop(columns=['Churn'])

In [14]:
# Display the 0/1 churn target series.
target

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7032, dtype: int32

In [15]:
# Preview the feature frame (Churn column removed).
telco_data.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1 - 12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,25 - 36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1 - 12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,37 - 48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,1 - 12


In [16]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# One independent LabelEncoder per categorical column, so each keeps its own
# fitted classes_ mapping after fit_transform.
(
    gender_lb,
    Partner_lb,
    Dependents_lb,
    PhoneService_lb,
    MultipleLines_lb,
    InternetService_lb,
    OnlineSecurity_lb,
    OnlineBackup_lb,
    DeviceProtection_lb,
    TechSupport_lb,
    StreamingTV_lb,
    StreamingMovies_lb,
    Contract_lb,
    PaperlessBilling_lb,
    PaymentMethod_lb,
    tenure_group_lb,
) = (LabelEncoder() for _ in range(16))

In [23]:
# Fit each column's encoder and store the resulting integer codes in a new
# "<column>_n" column; the original string columns are left in place here.
for _column, _encoder in (
    ('gender', gender_lb),
    ('Partner', Partner_lb),
    ('Dependents', Dependents_lb),
    ('PhoneService', PhoneService_lb),
    ('MultipleLines', MultipleLines_lb),
    ('InternetService', InternetService_lb),
    ('OnlineSecurity', OnlineSecurity_lb),
    ('OnlineBackup', OnlineBackup_lb),
    ('DeviceProtection', DeviceProtection_lb),
    ('TechSupport', TechSupport_lb),
    ('StreamingTV', StreamingTV_lb),
    ('StreamingMovies', StreamingMovies_lb),
    ('Contract', Contract_lb),
    ('PaperlessBilling', PaperlessBilling_lb),
    ('PaymentMethod', PaymentMethod_lb),
    ('tenure_group', tenure_group_lb),
):
    telco_data1[_column + '_n'] = _encoder.fit_transform(telco_data1[_column])

In [27]:
# Preview the first 12 rows: original string columns alongside their new
# integer-coded "<column>_n" counterparts.
telco_data1.head(12)
# Alternative inspection (disabled): list every column name instead.
#telco_data1.columns

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,...,OnlineSecurity_n,OnlineBackup_n,DeviceProtection_n,TechSupport_n,StreamingTV_n,StreamingMovies_n,Contract_n,PaperlessBilling_n,PaymentMethod_n,tenure_group_n
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,...,0,2,0,0,0,0,0,1,2,0
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,...,2,0,2,0,0,0,1,0,3,2
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,...,2,2,0,0,0,0,0,1,3,0
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,...,2,0,2,2,0,0,1,0,0,3
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,...,0,0,0,0,0,0,0,1,2,0
5,Female,0,No,No,Yes,Yes,Fiber optic,No,No,Yes,...,0,0,2,0,2,2,0,1,2,0
6,Male,0,No,Yes,Yes,Yes,Fiber optic,No,Yes,No,...,0,2,0,0,2,0,0,1,1,1
7,Female,0,No,No,No,No phone service,DSL,Yes,No,No,...,2,0,0,0,0,0,0,0,3,0
8,Female,0,Yes,No,Yes,Yes,Fiber optic,No,No,Yes,...,0,0,2,2,2,2,0,1,2,2
9,Male,0,No,Yes,Yes,No,DSL,Yes,Yes,No,...,2,2,0,0,0,0,1,0,0,5


In [26]:
# Distribution of the SeniorCitizen flag.
telco_data1['SeniorCitizen'].value_counts()

0    5890
1    1142
Name: SeniorCitizen, dtype: int64

In [28]:
# Build the model input: drop the original string columns so that only the
# numeric columns and the "<column>_n" encodings remain.
telco_data_copy = telco_data1.drop(
    columns=[
        'gender',
        'Partner',
        'Dependents',
        'PhoneService',
        'MultipleLines',
        'InternetService',
        'OnlineSecurity',
        'OnlineBackup',
        'DeviceProtection',
        'TechSupport',
        'StreamingTV',
        'StreamingMovies',
        'Contract',
        'PaperlessBilling',
        'PaymentMethod',
        'tenure_group',
    ]
)
telco_data_copy.head(n=5)

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_n,Partner_n,Dependents_n,PhoneService_n,MultipleLines_n,InternetService_n,OnlineSecurity_n,OnlineBackup_n,DeviceProtection_n,TechSupport_n,StreamingTV_n,StreamingMovies_n,Contract_n,PaperlessBilling_n,PaymentMethod_n,tenure_group_n
0,0,29.85,29.85,0,1,0,0,1,0,0,2,0,0,0,0,0,1,2,0
1,0,56.95,1889.5,1,0,0,1,0,0,2,0,2,0,0,0,1,0,3,2
2,0,53.85,108.15,1,0,0,1,0,0,2,2,0,0,0,0,0,1,3,0
3,0,42.3,1840.75,1,0,0,0,1,0,2,0,2,2,0,0,1,0,0,3
4,0,70.7,151.65,0,0,0,1,0,1,0,0,0,0,0,0,0,1,2,0


In [29]:
# Confirm that every remaining feature column has a numeric dtype.
telco_data_copy.dtypes

SeniorCitizen           int64
MonthlyCharges        float64
TotalCharges          float64
gender_n                int32
Partner_n               int32
Dependents_n            int32
PhoneService_n          int32
MultipleLines_n         int32
InternetService_n       int32
OnlineSecurity_n        int32
OnlineBackup_n          int32
DeviceProtection_n      int32
TechSupport_n           int32
StreamingTV_n           int32
StreamingMovies_n       int32
Contract_n              int32
PaperlessBilling_n      int32
PaymentMethod_n         int32
tenure_group_n          int32
dtype: object

In [33]:
# Gini-based decision tree, limited to depth 6 with at least 8 samples per
# leaf; random_state is fixed so tree construction is repeatable.
model_dt = DecisionTreeClassifier(
    criterion="gini",
    max_depth=6,
    min_samples_leaf=8,
    random_state=100,
)

In [34]:
# (rows, columns) of the feature matrix; the row count should match target.
telco_data_copy.shape

(7032, 19)

In [35]:
# Length of the target; should equal the feature matrix's row count.
target.shape

(7032,)

In [67]:
# Hold out 20% of the rows for evaluation.
# random_state is fixed (same seed as the model) so the split -- and therefore
# the fitted tree, the reported score and every exported CSV below -- is
# identical on each Restart & Run All. Without it the shuffle differs per run.
x_train, x_test, y_train, y_test = train_test_split(
    telco_data_copy,
    target,
    test_size=0.2,
    random_state=100,
)

In [68]:
# Train the decision tree on the training split.
model_dt.fit(x_train,y_train)

In [69]:
# Hard class predictions for the *training* rows (not the held-out test rows),
# so these reflect in-sample fit rather than generalisation.
y_pred=model_dt.predict(x_train)
y_pred

array([0, 1, 0, ..., 0, 1, 0])

In [70]:
# Persist the training features (index included) to the working directory.
x_train.to_csv("df1traindata.csv")

In [71]:
# Persist the held-out test features (index included).
# NOTE(review): the file name says "traindata" but this writes x_test --
# confirm whether downstream readers rely on this name before renaming.
x_test.to_csv("df2traindata.csv")

In [73]:
# Class probabilities for the training rows. predict_proba returns one column
# per entry of model_dt.classes_; with the 0/1 target, column 0 is the
# probability of class 0 (no churn).
# The stray bare `y_proba[:,0]` expression was removed: it was neither
# displayed (not the last line of the cell) nor stored.
y_proba = model_dt.predict_proba(x_train)
y_predict_xtrain = pd.DataFrame(y_proba[:, 0])


In [74]:
# Write the first-column training probabilities to CSV. The frame has a fresh
# 0..n-1 index, so rows line up with x_train by position, not by index label.
y_predict_xtrain.to_csv("y_predict_xtrain.csv")

In [75]:
# Second predict_proba column for the training rows, i.e. the probability of
# model_dt.classes_[1] (class 1 = churn for the 0/1 target), written to CSV.
# The stray bare `y_proba[:,1]` expression was removed: it was neither
# displayed nor stored.
y_predict_xtrain1 = pd.DataFrame(y_proba[:, 1])
y_predict_xtrain1.to_csv("y_predict_xtrain1.csv")

In [76]:
# Class probabilities for the held-out test rows; column 0 is the probability
# of model_dt.classes_[0] (class 0 = no churn for the 0/1 target).
# The stray bare `y_proba1[:,0]` expression was removed: it was neither
# displayed nor stored.
y_proba1 = model_dt.predict_proba(x_test)
y_predict_xtest = pd.DataFrame(y_proba1[:, 0])
y_predict_xtest.to_csv("y_predict_xtest0.csv")

In [77]:
# Second predict_proba column for the test rows, i.e. the probability of
# model_dt.classes_[1] (class 1 = churn for the 0/1 target), written to CSV.
# The stray bare `y_proba1[:,1]` expression was removed: it was neither
# displayed nor stored.
y_predict_xtest1 = pd.DataFrame(y_proba1[:, 1])
y_predict_xtest1.to_csv("y_predict_xtest1.csv")

In [39]:
# Mean accuracy on the held-out test split.
model_dt.score(x_test,y_test)

0.7910447761194029

In [40]:
# Persist the fully numeric, label-encoded feature matrix (index included).
telco_data_copy.to_csv("labelEncoded.csv")