In [1]:
import pandas as pd

DATA_PATH = 'data/churn_data.xlsx'

df = pd.read_excel(DATA_PATH)
df.head(4)

Unnamed: 0,customerID,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,45,No,One year,No,Bank transfer,42.3,1840.75,No


In [2]:
df.shape

(7042, 9)

In [3]:
df.describe()

Unnamed: 0,tenure,MonthlyCharges
count,7042.0,7042.0
mean,32.366373,64.755886
std,24.557955,30.088238
min,0.0,18.25
25%,9.0,35.5
50%,29.0,70.35
75%,55.0,89.85
max,72.0,118.75


In [4]:
df.dtypes

customerID           object
tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [5]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [6]:
df.dropna(inplace=True)

In [7]:
Xcat_vars = ['PhoneService', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Xnum_vars = ['tenure', 'MonthlyCharges', 'TotalCharges']
Xall_vars = Xcat_vars + Xnum_vars

In [8]:
X = df[Xall_vars].copy()
y = df['Churn'].copy()
y = pd.DataFrame(y)

In [9]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

y['Churn'] = le.fit_transform(y['Churn'])

In [10]:
X = pd.get_dummies(X)

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

tree  = DecisionTreeClassifier()
param = {
  'max_depth': [i for i in range (1, 11)]
}

gs = GridSearchCV(tree, param, scoring='roc_auc', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

In [25]:
results = pd.DataFrame(gs.cv_results_)
results.sort_values(by='rank_test_score').head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,6.016471,0.303212,0.244727,0.029651,7,500,"{'max_depth': 7, 'n_estimators': 500}",0.845972,0.845808,0.834098,0.816193,0.848769,0.838168,0.012092,1
17,5.518403,0.217859,0.254884,0.046357,6,500,"{'max_depth': 6, 'n_estimators': 500}",0.846956,0.84582,0.832608,0.814404,0.849401,0.837838,0.013093,2
16,3.69483,0.226044,0.191769,0.037784,6,300,"{'max_depth': 6, 'n_estimators': 300}",0.846356,0.846279,0.833142,0.813523,0.849006,0.837661,0.013279,3
18,1.281226,0.142415,0.063555,0.013119,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.843903,0.845832,0.834319,0.81525,0.848817,0.837624,0.012195,4


In [21]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()

In [22]:
param = {
  'max_depth': [i for i in range(1, 11)],
  'n_estimators': [100, 300, 500]
}

In [24]:
gs = GridSearchCV(rfc, param, scoring='roc_auc', cv=5, n_jobs=-1)
gs.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


In [26]:
results = pd.DataFrame(gs.cv_results_)
results.sort_values(by='rank_test_score').head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
20,6.016471,0.303212,0.244727,0.029651,7,500,"{'max_depth': 7, 'n_estimators': 500}",0.845972,0.845808,0.834098,0.816193,0.848769,0.838168,0.012092,1
17,5.518403,0.217859,0.254884,0.046357,6,500,"{'max_depth': 6, 'n_estimators': 500}",0.846956,0.84582,0.832608,0.814404,0.849401,0.837838,0.013093,2
16,3.69483,0.226044,0.191769,0.037784,6,300,"{'max_depth': 6, 'n_estimators': 300}",0.846356,0.846279,0.833142,0.813523,0.849006,0.837661,0.013279,3
18,1.281226,0.142415,0.063555,0.013119,7,100,"{'max_depth': 7, 'n_estimators': 100}",0.843903,0.845832,0.834319,0.81525,0.848817,0.837624,0.012195,4


In [33]:
gs.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 7,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [34]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1)

In [35]:
from sklearn.model_selection import RandomizedSearchCV

parameters = {'learning_rate': [0.001, 0.01],
              'num_leaves': [2, 128],
              'min_child_samples': [1, 100],
              'subsample': [0.05, 1],
              'colsample_bytree': [0.1, 1.0]}

rs = RandomizedSearchCV(lgbm, parameters, scoring='roc_auc', cv=5, n_iter=5, n_jobs=-1)
rs.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1495, number of negative: 4129
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 605
[LightGBM] [Info] Number of data points in the train set: 5624, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.265825 -> initscore=-1.015909
[LightGBM] [Info] Start training from score -1.015909


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [38]:
pd.DataFrame(rs.cv_results_).sort_values(by='rank_test_score').head(4)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_num_leaves,param_min_child_samples,param_learning_rate,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,1.149919,0.101644,0.013993,0.002682,0.05,128,1,0.001,0.1,"{'subsample': 0.05, 'num_leaves': 128, 'min_ch...",0.826872,0.823601,0.819621,0.797412,0.837511,0.821003,0.013207,1
1,0.279291,0.078837,0.018917,0.004361,0.05,2,100,0.01,0.1,"{'subsample': 0.05, 'num_leaves': 2, 'min_chil...",0.822562,0.832001,0.817157,0.795339,0.830297,0.819471,0.013202,2
0,5.883844,0.479378,0.021096,0.0018,0.05,128,1,0.001,1.0,"{'subsample': 0.05, 'num_leaves': 128, 'min_ch...",0.827593,0.803955,0.799912,0.780864,0.797671,0.801999,0.015025,3
4,0.102759,0.024706,0.01779,0.012148,1.0,2,100,0.01,1.0,"{'subsample': 1, 'num_leaves': 2, 'min_child_s...",0.797299,0.79889,0.808779,0.779817,0.80829,0.798615,0.010508,4
