In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
# Load the Telco customer-churn CSV (path is relative to the working directory).
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [3]:
# Fill in the blank TotalCharges values.
# Strip before comparing so an entry made of any amount of whitespace counts as
# blank; the previous exact match on ' ' let other blanks through and made the
# float conversion fail. astype(str) keeps the cell re-runnable after the
# column has already been converted to float.
total_charges_text = df['TotalCharges'].astype(str).str.strip()

# Convert TotalCharges to float. Blanks become 0; any other non-numeric text
# still raises, so unexpected values are not silently hidden.
df['TotalCharges'] = total_charges_text.replace('', '0').astype(float)

In [4]:
# Check: display the dtype of TotalCharges after the conversion above.
df['TotalCharges'].dtype

dtype('float64')

In [5]:
# Merge the 'No internet service' value into 'No' for the columns listed below.
gabung = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

# A single replace over the selected columns does the same work as replacing
# column by column.
df[gabung] = df[gabung].replace('No internet service', 'No')
    

In [6]:
# Merge the 'No phone service' value into 'No'.
multiple_lines = df['MultipleLines']
df['MultipleLines'] = multiple_lines.replace('No phone service', 'No')

In [7]:
def _senior_label(flag):
    """Map 0 to 'No' and 1 to 'Yes'; return any other value unchanged."""
    if flag == 0:
        return 'No'
    if flag == 1:
        return 'Yes'
    return flag


# Relabel the SeniorCitizen flag so it is one-hot encoded like the other
# Yes/No columns.
df['SeniorCitizen'] = df['SeniorCitizen'].apply(_senior_label)

In [8]:
# Drop the customer ID column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: the in-place drop raised KeyError on a second run because
# the column was already gone.
df = df.drop(columns='customerID', errors='ignore')

## DATA, TARGET

In [9]:
# Feature matrix: every column except the label, one-hot encoded with the first
# level of each categorical column dropped.
data = df.drop('Churn',axis=1)
# The dummy dtype is pinned to uint8 (the pre-2.0 pandas default). pandas 2.0
# switched the default to bool, which changes the 0/1 table shown below and
# hands boolean columns to the model-fitting cells further down.
data = pd.get_dummies(data=data, drop_first=True, dtype='uint8')

# Encode the label as 0 ('No') / 1 ('Yes'); any other value is left unchanged.
df['Churn'] = df['Churn'].apply(lambda x: 0 if x=='No' else
                                          1 if x=='Yes' else x)
target = df['Churn']
data.head()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,...,DeviceProtection_Yes,TechSupport_Yes,StreamingTV_Yes,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,29.85,29.85,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,34,56.95,1889.5,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,0,1
2,2,53.85,108.15,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,45,42.3,1840.75,1,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
4,2,70.7,151.65,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,1,0


In [10]:
from sklearn.metrics import f1_score, classification_report, matthews_corrcoef, roc_auc_score, accuracy_score, confusion_matrix, log_loss
def _calc_error(X, y, model):
    """Score an already-fitted classifier on one data split.

    Shared implementation behind calc_train_error and calc_validation_error,
    which previously carried two identical copies of this body.

    Parameters
    ----------
    X : features of the split, passed to model.predict / model.predict_proba.
    y : true labels of the split.
    model : fitted estimator exposing predict and predict_proba.

    Returns
    -------
    dict with keys 'report', 'matthew', 'f1', 'roc', 'accuracy', 'confusion'
    and 'logloss'.
    """
    predictions = model.predict(X)
    predictProba = model.predict_proba(X)
    return {
        'report': classification_report(y, predictions),
        'matthew': matthews_corrcoef(y, predictions),
        'f1': f1_score(y, predictions, average='macro'),
        # The second probability column is used as the score for ROC AUC;
        # assumes a binary target whose positive class is listed second.
        'roc': roc_auc_score(y, predictProba[:, 1]),
        'accuracy': accuracy_score(y, predictions),
        'confusion': confusion_matrix(y, predictions),
        'logloss': log_loss(y, predictProba),
    }


def calc_train_error(X_train, y_train, model):
    """Return the metric dict of a fitted model on the training split."""
    return _calc_error(X_train, y_train, model)


def calc_validation_error(X_test, y_test, model):
    """Return the metric dict of a fitted model on the validation split."""
    return _calc_error(X_test, y_test, model)


def calc_metrics(X_train, y_train, X_test, y_test, model):
    """Fit model on the training split, then score it on both splits.

    The model is fitted in place. Returns a (train_error, validation_error)
    pair of metric dicts as produced by calc_train_error and
    calc_validation_error.
    """
    model.fit(X_train,y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [11]:
from sklearn.model_selection import KFold
# Number of cross-validation folds.
K= 5
# Shuffled K-fold splitter; random_state is fixed so the fold assignment is
# the same on every run.
kf = KFold(n_splits = K, shuffle = True, random_state = 42)

In [12]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

# Oversampler for the minority class. random_state is fixed so the resampled
# training folds, and therefore the reported metrics, are repeatable.
# NOTE(review): the name `random` shadows the stdlib module of the same name;
# it is kept because other cells may refer to it.
random=RandomOverSampler(sampling_strategy='minority', random_state=42)
train_errors = []
validation_errors = []

for train_index, val_index in kf.split(data, target):

    # Split this fold into its training and validation parts.
    X_train, X_val = data.iloc[train_index], data.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Oversample the training part only; the validation part keeps its
    # original class balance. fit_resample is the current name of the
    # deprecated (and since removed) fit_sample.
    oversampled_trainX,oversampled_trainY=random.fit_resample(X_train,y_train)

    # Fresh model for every fold.
    logmodel = LogisticRegression()

    # Fit on the oversampled training part, then score both parts.
    train_error, val_error = calc_metrics(oversampled_trainX, oversampled_trainY, X_val, y_val, logmodel)

    # Keep the per-fold results for the summary tables.
    train_errors.append(train_error)
    validation_errors.append(val_error)
   



In [13]:
# One row per fold: the train and validation value of each metric, side by side.
metric_keys = ['accuracy', 'roc', 'f1', 'matthew', 'logloss']
matrix=[]
for i,j in zip(train_errors, validation_errors):
    fold_row = []
    for key in metric_keys:
        fold_row += [i[key], j[key]]
    matrix.append(fold_row)

# Column labels follow the same train/validation pairing as metric_keys.
score_columns = [
    'Train Accuracy', 'Test Accuracy',
    'Train ROC AUC', 'Test ROC AUC',
    'Train F1 Score', 'Test F1 Score',
    'Train Matthews Corr Coef', 'Test Matthews Corr Coef',
    'Train Log Loss', 'Test Log Loss',
]
calc_matrix_Logistik = pd.DataFrame(matrix, columns=score_columns)

calc_matrix_Logistik

Unnamed: 0,Train Accuracy,Test Accuracy,Train ROC AUC,Test ROC AUC,Train F1 Score,Test F1 Score,Train Matthews Corr Coef,Test Matthews Corr Coef,Train Log Loss,Test Log Loss
0,0.762083,0.755855,0.844358,0.86291,0.761812,0.728804,0.525362,0.500549,0.488936,0.473901
1,0.762859,0.760114,0.844269,0.861228,0.762505,0.735096,0.527295,0.515589,0.486732,0.484583
2,0.773176,0.735273,0.851701,0.829357,0.77286,0.707806,0.547881,0.450342,0.476537,0.507857
3,0.775109,0.747159,0.855613,0.83163,0.774795,0.715148,0.551755,0.460694,0.471597,0.501015
4,0.768923,0.744318,0.849085,0.839816,0.768728,0.705005,0.538752,0.44045,0.481943,0.485761


In [14]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, accuracy_score,matthews_corrcoef,auc,log_loss
# Print the classification report of every validation fold, in fold order.
for a in validation_errors:
    print(a['report'])

              precision    recall  f1-score   support

           0       0.92      0.73      0.81      1036
           1       0.52      0.83      0.64       373

    accuracy                           0.76      1409
   macro avg       0.72      0.78      0.73      1409
weighted avg       0.82      0.76      0.77      1409

              precision    recall  f1-score   support

           0       0.93      0.73      0.82      1033
           1       0.53      0.85      0.65       376

    accuracy                           0.76      1409
   macro avg       0.73      0.79      0.74      1409
weighted avg       0.82      0.76      0.77      1409

              precision    recall  f1-score   support

           0       0.90      0.72      0.80      1021
           1       0.51      0.78      0.62       388

    accuracy                           0.74      1409
   macro avg       0.70      0.75      0.71      1409
weighted avg       0.79      0.74      0.75      1409

              preci

In [15]:
# Average each score over the folds first, then report train vs. validation.
avg_train_accuracy = calc_matrix_Logistik['Train Accuracy'].mean()
avg_validation_accuracy = calc_matrix_Logistik['Test Accuracy'].mean()
avg_train_roc_auc = calc_matrix_Logistik['Train ROC AUC'].mean()
avg_validation_roc_auc = calc_matrix_Logistik['Test ROC AUC'].mean()

print(f"Avg Accuracy Train : {avg_train_accuracy}")
print(f"Avg Accuracy Validation : {avg_validation_accuracy}")
print(f"Avg ROC AUC Train : {avg_train_roc_auc}")
print(f"Avg ROC AUC Test : {avg_validation_roc_auc}")

Avg Accuracy Train : 0.7684299716373499
Avg Accuracy Validation : 0.7485438576682366
Avg ROC AUC Train : 0.8490053442272408
Avg ROC AUC Test : 0.8449880590153752


In [16]:
import statsmodels.api as sm
# Fit a statsmodels logit on X_train / y_train and print its summary table.
# NOTE(review): X_train and y_train are whatever the last iteration of the
# cross-validation loop left behind (a single training fold, before
# oversampling) — confirm that this is the intended fitting set.
# NOTE(review): no constant column is added to X_train here, so the model is
# presumably fitted without an intercept — confirm that is intended.
logit_model=sm.Logit(y_train,X_train)
result=logit_model.fit(method='bfgs')
print(result.summary2())

         Current function value: 0.415563
         Iterations: 35
         Function evaluations: 48
         Gradient evaluations: 40
                                   Results: Logit
Model:                     Logit                   Pseudo R-squared:        0.286    
Dependent Variable:        Churn                   AIC:                     4729.3942
Date:                      2019-12-11 10:19        BIC:                     4882.0395
No. Observations:          5635                    Log-Likelihood:          -2341.7  
Df Model:                  22                      LL-Null:                 -3278.2  
Df Residuals:              5612                    LLR p-value:             0.0000   
Converged:                 0.0000                  Scale:                   1.0000   
-------------------------------------------------------------------------------------
                                       Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
------------------------------------------

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


In [17]:
# Keep only the selected feature columns for the reduced model.
# NOTE(review): presumably chosen from the logit summary above — confirm.
data = data[['tenure','TotalCharges','SeniorCitizen_Yes','PhoneService_Yes','MultipleLines_Yes','InternetService_Fiber optic','InternetService_No','OnlineSecurity_Yes','TechSupport_Yes','Contract_One year','Contract_Two year','PaperlessBilling_Yes','PaymentMethod_Electronic check']]

from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler

# Oversampler for the minority class. random_state is fixed so the resampled
# training folds, and therefore the reported metrics, are repeatable.
random=RandomOverSampler(sampling_strategy='minority', random_state=42)
train_errors = []
validation_errors = []

for train_index, val_index in kf.split(data, target):

    # Split this fold into its training and validation parts.
    X_train, X_val = data.iloc[train_index], data.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # Oversample the training part only; the validation part keeps its
    # original class balance. fit_resample is the current name of the
    # deprecated (and since removed) fit_sample.
    oversampled_trainX,oversampled_trainY=random.fit_resample(X_train,y_train)

    # Fresh model for every fold.
    logmodel = LogisticRegression()

    # Fit on the oversampled training part, then score both parts.
    train_error, val_error = calc_metrics(oversampled_trainX, oversampled_trainY, X_val, y_val, logmodel)

    # Keep the per-fold results for the summary tables.
    train_errors.append(train_error)
    validation_errors.append(val_error)
   



In [18]:
# One row per fold: the train and validation value of each metric, side by side.
metric_keys = ['accuracy', 'roc', 'f1', 'matthew', 'logloss']
matrix=[]
for i,j in zip(train_errors, validation_errors):
    fold_row = []
    for key in metric_keys:
        fold_row += [i[key], j[key]]
    matrix.append(fold_row)

# Column labels follow the same train/validation pairing as metric_keys.
score_columns = [
    'Train Accuracy', 'Test Accuracy',
    'Train ROC AUC', 'Test ROC AUC',
    'Train F1 Score', 'Test F1 Score',
    'Train Matthews Corr Coef', 'Test Matthews Corr Coef',
    'Train Log Loss', 'Test Log Loss',
]
calc_matrix_Logistik = pd.DataFrame(matrix, columns=score_columns)

calc_matrix_Logistik

Unnamed: 0,Train Accuracy,Test Accuracy,Train ROC AUC,Test ROC AUC,Train F1 Score,Test F1 Score,Train Matthews Corr Coef,Test Matthews Corr Coef,Train Log Loss,Test Log Loss
0,0.756887,0.753726,0.84196,0.862871,0.756437,0.727911,0.515687,0.502861,0.491386,0.47704
1,0.758633,0.745919,0.841496,0.862057,0.758014,0.72229,0.519935,0.497679,0.492285,0.489229
2,0.77426,0.732434,0.851863,0.826296,0.774019,0.70323,0.549691,0.437757,0.477254,0.504267
3,0.762071,0.742188,0.848449,0.832037,0.761691,0.713291,0.525825,0.465108,0.48312,0.504874
4,0.768802,0.741477,0.847916,0.842464,0.768394,0.703137,0.539507,0.439159,0.483332,0.484728


In [19]:
# Print the classification report of every validation fold, in fold order.
for a in validation_errors:
    print(a['report'])

              precision    recall  f1-score   support

           0       0.93      0.72      0.81      1036
           1       0.52      0.84      0.64       373

    accuracy                           0.75      1409
   macro avg       0.72      0.78      0.73      1409
weighted avg       0.82      0.75      0.77      1409

              precision    recall  f1-score   support

           0       0.93      0.71      0.80      1033
           1       0.51      0.85      0.64       376

    accuracy                           0.75      1409
   macro avg       0.72      0.78      0.72      1409
weighted avg       0.82      0.75      0.76      1409

              precision    recall  f1-score   support

           0       0.89      0.72      0.80      1021
           1       0.51      0.76      0.61       388

    accuracy                           0.73      1409
   macro avg       0.70      0.74      0.70      1409
weighted avg       0.78      0.73      0.75      1409

              preci

In [23]:
import pickle

# Persist the fitted model to 'logistik.sav'.
# NOTE(review): logmodel is the model left over from the last cross-validation
# fold, not one refitted on the full data — confirm that is the model to ship.
# The with-block closes the file as soon as the dump finishes, so the pickle is
# completely written before any later cell reads it back.
with open('logistik.sav','wb') as model_file:
    pickle.dump(logmodel, model_file)

In [24]:
# Load the saved model back from 'logistik.sav'.
# NOTE: unpickling can execute arbitrary code; only load files you created.
#
# The module is imported under an alias because this cell rebinds the name
# `pickle` to the loaded model. Calling pickle.load directly made the cell fail
# on a second run, since `pickle` was by then the model and not the module.
import pickle as pickle_module

with open('logistik.sav','rb') as model_file:
    loaded_model = pickle_module.load(model_file)

# Kept for backward compatibility: a later cell calls pickle.predict_proba.
# Prefer `loaded_model` in new code.
pickle = loaded_model

In [26]:
# Display the validation features left behind by the last cross-validation fold.
X_val

Unnamed: 0,tenure,TotalCharges,SeniorCitizen_Yes,PhoneService_Yes,MultipleLines_Yes,InternetService_Fiber optic,InternetService_No,OnlineSecurity_Yes,TechSupport_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Electronic check
3,45,1840.75,0,0,0,0,0,1,1,1,0,0,0
4,2,151.65,0,1,0,1,0,0,0,0,0,1,1
5,8,820.50,0,1,1,1,0,0,0,0,0,1,1
9,62,3487.95,0,1,0,0,0,1,0,1,0,0,0
16,52,1022.95,0,1,0,0,1,0,0,1,0,0,0
34,1,45.25,1,1,0,0,0,0,0,0,0,0,0
54,60,4456.35,1,1,0,0,0,1,1,1,0,1,0
55,18,1752.55,1,1,1,1,0,0,0,0,0,1,1
64,9,857.25,0,1,1,1,0,0,0,0,0,0,1
66,47,3650.35,0,1,1,1,0,0,0,1,0,1,1


In [25]:
# Class probabilities for X_val from the model reloaded from 'logistik.sav'.
# NOTE(review): `pickle` here is the loaded model, not the pickle module — the
# load cell rebinds the name.
pickle.predict_proba(X_val)

array([[0.92919027, 0.07080973],
       [0.13872516, 0.86127484],
       [0.12347262, 0.87652738],
       ...,
       [0.58732159, 0.41267841],
       [0.14811051, 0.85188949],
       [0.47151197, 0.52848803]])