In [85]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [50]:
# Raw Telco customer-churn CSV, fetched straight from GitHub.
churn_csv_url = (
    'https://raw.githubusercontent.com/treselle-systems/'
    'customer_churn_analysis/master/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)
df = pd.read_csv(churn_csv_url)
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [103]:
def customer_cleaning(df):
    """Encode the raw Telco churn frame as an all-numeric modelling table.

    The caller's frame is never modified; a new frame is returned.

    Steps
    -----
    1. Drop the ``customerID`` identifier column.
    2. Translate every categorical column to integer codes through explicit
       lookup tables (``Series.map``). A value missing from its table becomes
       NaN, so the row is removed in step 4.
    3. Parse ``TotalCharges`` to float (blank / unparseable entries become
       NaN) and downcast both charge columns to the smallest float dtype.
    4. Drop every row that still contains a NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame holding at least ``customerID``, ``gender``, ``Partner``,
        ``Dependents``, ``PhoneService``, ``MultipleLines``,
        ``InternetService``, the six internet add-on columns, ``Contract``,
        ``PaperlessBilling``, ``PaymentMethod``, ``MonthlyCharges``,
        ``TotalCharges`` and ``Churn``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``customerID``, with the columns above encoded
        numerically and without any NaN-containing rows. Column order is
        preserved.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    yes_no = {'Yes': 1, 'No': 0}
    # Add-on services: having no internet service counts as "No".
    addon_codes = {'Yes': 1, 'No': 0, 'No internet service': 0}

    # One explicit lookup table per categorical column. Using ``map`` for all
    # of them (instead of a mix of ``replace`` and ``map``) avoids relying on
    # ``replace`` silently downcasting object columns to integers.
    encodings = {
        'gender': {'Male': 0, 'Female': 1},
        'Churn': yes_no,
        'Partner': yes_no,
        'Dependents': yes_no,
        'PhoneService': yes_no,
        # Having no phone service counts as not having multiple lines.
        'MultipleLines': {'Yes': 1, 'No': 0, 'No phone service': 0},
        'PaperlessBilling': yes_no,
        # 1 = electronic/automatic payment, 0 = mailed check.
        'PaymentMethod': {
            'Electronic check': 1,
            'Bank transfer (automatic)': 1,
            'Credit card (automatic)': 1,
            'Mailed check': 0,
        },
        # NOTE(review): codes kept exactly as before (Fiber optic=1, DSL=2);
        # confirm this labelling is intended.
        'InternetService': {'Fiber optic': 1, 'DSL': 2, 'No': 0},
        'Contract': {'Month-to-month': 0, 'One year': 1, 'Two year': 2},
        'OnlineBackup': addon_codes,
        'DeviceProtection': addon_codes,
        'TechSupport': addon_codes,
        'StreamingTV': addon_codes,
        'StreamingMovies': addon_codes,
        'OnlineSecurity': addon_codes,
    }

    # ``drop`` returns a new frame, so the input is left untouched.
    cleaned = df.drop(columns='customerID')

    for column, codes in encodings.items():
        cleaned[column] = cleaned[column].map(codes)

    # ``errors='coerce'`` already turns blank strings into NaN, so no
    # separate whitespace replacement is needed beforehand.
    cleaned['TotalCharges'] = pd.to_numeric(
        cleaned['TotalCharges'], errors='coerce', downcast='float')
    cleaned['MonthlyCharges'] = pd.to_numeric(
        cleaned['MonthlyCharges'], downcast='float')

    # Removes rows whose TotalCharges could not be parsed, plus any row with
    # a category value that was not in its lookup table.
    return cleaned.dropna()
    


In [104]:
# Build the encoded modelling table from the raw frame (raw `df` is left as loaded).
data = customer_cleaning(df)

In [105]:
# Sanity check: count the missing values left in each column after cleaning.
data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [106]:
# Inspect the column dtypes produced by the cleaning step.
data.dtypes

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
MultipleLines         int64
InternetService       int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
PaperlessBilling      int64
PaymentMethod         int64
MonthlyCharges      float32
TotalCharges        float32
Churn                 int64
dtype: object

In [107]:
# Seed used for the train/test split.
randseed = 42

# Separate the churn label from the predictors.
y = data['Churn']
X = data.drop(columns='Churn')

# 80/20 hold-out split, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=randseed,
)



In [110]:
# Search space for the random forest. scipy's `randint(low, high)` excludes
# `high`, so this samples 50-499 trees and a maximum depth of 1-4.
param_distributions = dict(
    n_estimators=randint(50, 500),
    max_depth=randint(1, 5),
)

base_forest = RandomForestClassifier(n_jobs=-1, random_state=42)

# 50 random candidates, each scored by 3-fold cross-validated ROC AUC.
search = RandomizedSearchCV(
    estimator=base_forest,
    param_distributions=param_distributions,
    n_iter=50,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=10,
    return_train_score=True,
    random_state=42,
)

search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:   25.9s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdf05442a58>, 'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fdf05442908>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=10)

In [112]:
# Five best-ranked candidates from the search.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.sort_values(by='rank_test_score').head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
20,0.369445,0.000397,0.117976,0.013673,4,237,"{'max_depth': 4, 'n_estimators': 237}",0.846196,0.85143,0.833396,0.843676,0.007574,1,0.857396,0.852874,0.862002,0.857424,0.003727
11,0.796794,0.052911,0.205444,0.002116,4,463,"{'max_depth': 4, 'n_estimators': 463}",0.846233,0.851064,0.833633,0.843646,0.007347,2,0.85707,0.852658,0.861904,0.857211,0.003776
42,0.712644,0.060942,0.142822,0.053147,4,437,"{'max_depth': 4, 'n_estimators': 437}",0.846193,0.851021,0.833713,0.843645,0.007291,3,0.857073,0.852748,0.861875,0.857232,0.003728
21,0.493993,0.01359,0.150728,0.056113,4,320,"{'max_depth': 4, 'n_estimators': 320}",0.84633,0.851163,0.833432,0.843644,0.007483,4,0.857129,0.852735,0.861855,0.85724,0.003724
36,0.65348,0.135428,0.14993,0.055279,4,416,"{'max_depth': 4, 'n_estimators': 416}",0.846165,0.85118,0.833565,0.843639,0.007409,5,0.857065,0.852699,0.861906,0.857223,0.00376


In [113]:
from sklearn.metrics import roc_auc_score

# Best estimator found by the randomized search.
best = search.best_estimator_

# Score the hold-out split. The DataFrame is passed as-is (not `.values`) so
# the input carries the same column names the model was fitted with.
y_pred_proba = best.predict_proba(X_test)[:, 1]
print('Test ROC AUC:', roc_auc_score(y_test, y_pred_proba))

Test ROC AUC: 0.8339372887234626


In [114]:
# Display the selected estimator and its parameters.
best

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=237, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Test ROC AUC for RF 0.8339372887234626 

Model params:

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=237, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
# Cox proportional-hazards variant: `tenure` is the duration column and
# `Churn` the event column passed to the fitter.
# NOTE(review): `CoxPHFitter` is not imported anywhere in the visible notebook
# (presumably `from lifelines import CoxPHFitter`) - add it to the imports cell.
data = customer_cleaning(df)

# Label and numeric predictors.
y = data['Churn']
X = data.select_dtypes(include='number').drop(columns='Churn')

# NOTE(review): `time` is not used in this cell and shares its name with the
# stdlib `time` module - confirm whether it is still needed.
time = data['tenure']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    stratify=y,
    random_state=randseed,
)

# Training frame: predictors (including `tenure`) plus the `Churn` column,
# joined on the row index.
train = pd.merge(
    X_train,
    y_train.to_frame(),
    left_index=True,
    right_index=True,
)

# Hold-out predictors without the duration column.
test = X_test.drop(columns='tenure')

chp = CoxPHFitter()
chp.fit(train, 'tenure', 'Churn')

y_pred = chp.predict_partial_hazard(test)

# NOTE(review): `score_` may have been renamed in newer releases of the
# fitter's library - verify against the installed version.
print('The training score (Generalized ROC-AUC) is: ', chp.score_)
print('The test score (ROC-AUC) is: ', roc_auc_score(y_test, y_pred))
