In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('churn_ibm.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [2]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [6]:
y = df['Churn']
X = df.drop(['Churn', 'customerID'], axis=1)

for column in X.columns:
    if X[column].dtype == np.object:
        X = pd.concat([X, pd.get_dummies(X[column], prefix=column, drop_first=True)], axis=1).drop([column],axis=1)
y = pd.get_dummies(y, prefix='churn', drop_first=True)

In [8]:
X.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1,0,34,56.95,1889.5,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0,45,42.3,1840.75,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
4,0,2,70.7,151.65,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0


In [9]:
y.head()

Unnamed: 0,churn_Yes
0,0
1,0
2,1
3,0
4,1


## Random Forest

In [13]:
y.values.ravel()

array([0, 0, 1, ..., 0, 1, 0], dtype=uint8)

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train.values.ravel())
y_train_predicted = rf.predict(X_train)
y_test_predicted = rf.predict(X_test)
prediction_proba = rf.predict_proba(X_test)

print('R2 on training set: ',rf.score(X_train, y_train))
print('R2 on test set: ',rf.score(X_test, y_test))
print('Accuracy: ', accuracy_score(y_test, y_test_predicted))
print('AUC: ', roc_auc_score(y_test, prediction_proba[:,1]))

R2 on training set:  0.9908573750507924
R2 on test set:  0.7767772511848341
Accuracy:  0.7767772511848341
AUC:  0.8008990533177808


In [27]:
rf2 = RandomForestClassifier(n_estimators=100)
rf2.fit(X_train, y_train.values.ravel())
prediction = rf2.predict(X_test)
prediction_proba = rf2.predict_proba(X_test)
y_train_predicted = rf.predict(X_train)
y_test_predicted = rf.predict(X_test)

print('R2 on training set: ',rf.score(X_train, y_train))
print('R2 on test set: ',rf.score(X_test, y_test))
print('Accuracy: ', accuracy_score(y_test, prediction))
print('AUC: ', roc_auc_score(y_test, prediction_proba[:,1]))

R2 on training set:  0.9908573750507924
R2 on test set:  0.7767772511848341
Accuracy:  0.7895734597156399
AUC:  0.8181321609533179


In [30]:
rf2.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Adaboost

In [28]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(X_train, y_train.values.ravel())
y_train_predicted = ada.predict(X_train)
y_test_predicted = ada.predict(X_test)
prediction_proba = ada.predict_proba(X_test)

print('R2 on training set: ', ada.score(X_train, y_train))
print('R2 on test set: ', ada.score(X_test, y_test))
print('Accuracy: ', accuracy_score(y_test, y_test_predicted))
print('AUC: ', roc_auc_score(y_test, prediction_proba[:,1]))

R2 on training set:  0.8096302316131654
R2 on test set:  0.8104265402843602
Accuracy:  0.8104265402843602
AUC:  0.8414535600869805


In [29]:
ada.get_params(deep=True)

{'algorithm': 'SAMME.R',
 'base_estimator': None,
 'learning_rate': 1.0,
 'n_estimators': 50,
 'random_state': None}

In [31]:
ada2 = AdaBoostClassifier(n_estimators=100)
ada2.fit(X_train, y_train.values.ravel())
prediction =ada2.predict(X_test)
prediction_proba = ada2.predict_proba(X_test)

print('Accuracy:', accuracy_score(y_test,prediction))
print('AUC:',roc_auc_score(y_test,prediction_proba[:,1]))

Accuracy: 0.8109004739336493
AUC: 0.8387877288528255


## Grid search

In [33]:
from sklearn.model_selection import cross_validate, cross_val_predict, GridSearchCV

parameters = {'min_samples_leaf':[1,5], 'max_depth':[None, 10]}

grid_search = GridSearchCV(RandomForestClassifier(n_estimators=20), parameters, cv=10)
grid_search.fit(X_train, y_train.values.ravel())

prediction = grid_search.predict(X_test)
prediction_proba = grid_search.predict_proba(X_test)

best_classifier = grid_search.best_estimator_

print("Best classifier: ", best_classifier)
print('Accuracy: ', accuracy_score(y_test, prediction))
print('AUC: ', roc_auc_score(y_test, prediction_proba[:,1]))

Best classifier:  RandomForestClassifier(max_depth=10, min_samples_leaf=5, n_estimators=20)
Accuracy:  0.804739336492891
AUC:  0.8392773123427062


In [35]:
for c, column in enumerate(X_test.columns):
    if rf.feature_importances_[c] in sorted(rf.feature_importances_)[-5:]:
        print('Variables: ', column, rf.feature_importances_[c])

Variables:  tenure 0.1614901439673625
Variables:  MonthlyCharges 0.16702391491624102
Variables:  TotalCharges 0.19211976411118245
Variables:  InternetService_Fiber optic 0.040647492870479626
Variables:  Contract_Two year 0.040573404439206866


In [36]:
for c, column in enumerate(X_test.columns):
    if ada.feature_importances_[c] in sorted(ada.feature_importances_)[-5:]:
        print('Variable: ', column, ada.feature_importances_[c])

Variable:  tenure 0.16
Variable:  MonthlyCharges 0.2
Variable:  TotalCharges 0.26
Variable:  InternetService_Fiber optic 0.06
Variable:  Contract_Two year 0.06
