In [17]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")

## Introduce data

In [18]:
df = pd.read_csv('../data/churn.csv')
df.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


In [19]:
df.shape

(3333, 21)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   State           3333 non-null   object 
 1   Account Length  3333 non-null   int64  
 2   Area Code       3333 non-null   int64  
 3   Phone           3333 non-null   object 
 4   Int'l Plan      3333 non-null   object 
 5   VMail Plan      3333 non-null   object 
 6   VMail Message   3333 non-null   int64  
 7   Day Mins        3333 non-null   float64
 8   Day Calls       3333 non-null   int64  
 9   Day Charge      3333 non-null   float64
 10  Eve Mins        3333 non-null   float64
 11  Eve Calls       3333 non-null   int64  
 12  Eve Charge      3333 non-null   float64
 13  Night Mins      3333 non-null   float64
 14  Night Calls     3333 non-null   int64  
 15  Night Charge    3333 non-null   float64
 16  Intl Mins       3333 non-null   float64
 17  Intl Calls      3333 non-null   i

In [21]:
df.describe()

Unnamed: 0,Account Length,Area Code,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


In [22]:
df['Churn?'].value_counts()

False.    2850
True.      483
Name: Churn?, dtype: int64

## Data cleaning

In [23]:
## drop useless features
df = df.drop(['State','Area Code','Phone'],axis=1)

In [24]:
df.shape # drop confirmed

(3333, 18)

In [25]:
df.isnull().sum().sort_values(ascending=False)
# no missing values

Churn?            0
CustServ Calls    0
Int'l Plan        0
VMail Plan        0
VMail Message     0
Day Mins          0
Day Calls         0
Day Charge        0
Eve Mins          0
Eve Calls         0
Eve Charge        0
Night Mins        0
Night Calls       0
Night Charge      0
Intl Mins         0
Intl Calls        0
Intl Charge       0
Account Length    0
dtype: int64

In [26]:
# convert yes/no to 1/0
df["Int'l Plan"] = [1 if x == 'yes' else 0 for x in df["Int'l Plan"] ]
df["VMail Plan"] = [1 if x == 'yes' else 0 for x in df["VMail Plan"] ]
df["Churn?"] = [1 if x == 'False.' else 0 for x in df["Churn?"] ]

In [27]:
df.head() # convert confirmed!

Unnamed: 0,Account Length,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,1
1,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,1
2,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,1
3,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,1
4,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,1


In [28]:
X = df.drop(['Churn?'],axis=1)
y = df['Churn?']

In [29]:
# Normalization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [30]:
from sklearn.model_selection import train_test_split
# for large dataset, use 30% as test set
# for small dataset, use 20% as test set to ensure enough data to train the model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.20)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((2666, 17), (667, 17), (2666,), (667,))

## Model selection

In [42]:
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostClassifier as ABC
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [43]:
knn = KNN(n_neighbors=5)
knn_acc = cross_val_score(knn,X_train,y_train,cv=3,scoring='accuracy').mean()

In [44]:
svc = SVC(gamma='auto',probability=True)
svc_acc = cross_val_score(svc,X_train,y_train,cv=3,scoring='accuracy').mean()

In [45]:
rf = RF(n_estimators=10)
rf_acc = cross_val_score(rf,X_train,y_train,cv=3,scoring='accuracy').mean()

In [46]:
lr = LR(solver='lbfgs')
lr_acc = cross_val_score(lr,X_train,y_train,cv=3,scoring='accuracy').mean()

In [47]:
abc=ABC()
abc_acc = cross_val_score(abc,X_train,y_train,cv=3,scoring='accuracy').mean()

In [48]:
gbc = GBC()
gbc_acc = cross_val_score(gbc,X_train,y_train,cv=3,scoring='accuracy').mean()

In [49]:
xgbc = XGBClassifier()
xgbc_acc = cross_val_score(xgbc, X_train, y_train, cv=3, scoring='accuracy').mean()

In [50]:
# use bagging to build a Random Forest with 500 decision trees, bootstrap 100 samples for one tree, do not delete duplicated samples
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, oob_score=True, n_jobs=-1)
bag_clf.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [51]:
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('svc', svc), ('knn', knn), ('gbc', gbc), ('xgbc', xgbc)], voting='hard')
voting_acc = cross_val_score(
    voting_clf, X_train, y_train, cv=3, scoring='accuracy').mean()
# hard voting: each learner has one ticket to vote

In [52]:
voting_clf2 = VotingClassifier(
    estimators=[('lr', lr), ('rf', rf), ('svc', svc), ('knn', knn), ('gbc', gbc), ('xgbc', xgbc)], voting='soft')
voting_acc2 = cross_val_score(
    voting_clf2, X_train, y_train, cv=3, scoring='accuracy').mean()

# soft voting: the result depends on the learner with the highest prob

# If all classifiers are able to estimate class probabilities 
# (i.e., they all have a predict_proba() method), 
# then you can tell Scikit-Learn to predict the class with the highest class probability, 
# averaged over all the individual classifiers. This is called soft voting.

In [53]:
# single learner
print("Accuracy of KNN is: "+str(knn_acc))
print("Accuracy of SVC is: "+str(svc_acc))
print("Accuracy of LR is: "+str(lr_acc))
# bagging
print("Accuracy of bagging is: "+str(bag_clf.oob_score_))  # use out-of-bag samples as validation set instead of CV
print("Accuracy of RF is: "+str(rf_acc))
# boosting
print("Accuracy of ABC is: "+str(abc_acc))
print("Accuracy of GBC is: "+str(gbc_acc))
print("Accuracy of XGBC is: "+str(xgbc_acc))
# voting (bagging)
print("Accuracy of VotingHard is: "+str(voting_acc))
print("Accuracy of VotingSoft is: "+str(voting_acc2))

# model with bagging (eg: Random Forest) or boosting (eg: XGBoost, GradientBoost) always have the best result
# if using voting(include are classifier), normally, the result will be better than any one of them

Accuracy of KNN is: 0.8900973526957778
Accuracy of SVC is: 0.9144777510919244
Accuracy of LR is: 0.8540858068417124
Accuracy of bagging is: 0.8938484621155289
Accuracy of RF is: 0.9437354959008503
Accuracy of ABC is: 0.870965453642619
Accuracy of GBC is: 0.9474862939429869
Accuracy of XGBC is: 0.9489865287109382
Accuracy of VotingHard is: 0.9504859189898559
Accuracy of VotingSoft is: 0.9362326330830267


## Train the best LR model

+ Grid search + CV

In [23]:
params_LR = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # C is the inverse of lambda
             'max_iter': [1, 10, 100, 500],
             'class_weight': ['balanced', None],
             'solver': ['liblinear', 'sag', 'lbfgs', 'newton-cg']
             }
lr = LR()
clf = GridSearchCV(lr, param_grid=params_LR, cv=5)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 0.0001, 'class_weight': None, 'max_iter': 1, 'solver': 'liblinear'}

+ Randomized search + CV

In [24]:
params_LR = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],  # C is the inverse of lambda
             'max_iter': [1, 10, 100, 500],
             'class_weight': ['balanced', None],
             'solver': ['liblinear', 'sag', 'lbfgs', 'newton-cg']
             }
lr = LR()
clf2 = RandomizedSearchCV(lr, n_iter=10, cv=5, param_distributions=params_LR)
clf2.fit(X_train, y_train)
clf2.best_params_

{'solver': 'newton-cg', 'max_iter': 500, 'class_weight': None, 'C': 0.1}

In [25]:
lr_best = LR(C=0.0001, class_weight=None, max_iter=1, solver='liblinear')
lr_best.fit(X_train, y_train)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
y_pred = lr_best.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 

[[ 21  67]
 [ 22 557]]
              precision    recall  f1-score   support

           0       0.49      0.24      0.32        88
           1       0.89      0.96      0.93       579

    accuracy                           0.87       667
   macro avg       0.69      0.60      0.62       667
weighted avg       0.84      0.87      0.85       667



## Train the best SVM model

In [31]:
params_SVC = [{'kernel': ['rbf'],       # Gaussian kernel
               # punishment, inverse of regularization, bigger C -> overfitting
               'C': [0.1, 1, 10, 100, 1000],
               'gamma': [1e-3, 1e-4]  # bigger gamma -> overfitting
               },
              {'kernel': ['linear'],   # Linear kernel
               'C': [0.001, 0.01, 0.1, 1, 10]  # bigger C -> overfitting
               },
              {'kernel': ['poly'],  # Polynomial kernel
               'C': [0.01, 0.1, 1, 10, 100],  # bigger C -> overfitting
               'degree': [3, 5, 7, 9, 11]  # bigger degree -> overfitting
               }
              ]
svm = SVC()
clf = GridSearchCV(svm, param_grid=params_SVC, cv=5)
clf.fit(X_train, y_train)
clf.best_params_

{'C': 1, 'degree': 3, 'kernel': 'poly'}

In [32]:
svm_best = SVC(C=1,kernel='poly',degree=3)
svm_best.fit(X_train, y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='poly', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [33]:
y_pred = svm_best.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 

[[ 51  37]
 [  7 572]]
              precision    recall  f1-score   support

           0       0.88      0.58      0.70        88
           1       0.94      0.99      0.96       579

    accuracy                           0.93       667
   macro avg       0.91      0.78      0.83       667
weighted avg       0.93      0.93      0.93       667



## Train the best KNN model

In [34]:
params_KNN = {'n_neighbors': range(1, 31),
              'weights': ['uniform', 'distance']
             }
knn = KNN()
clf = GridSearchCV(knn, param_grid=params_KNN, cv=5)
clf.fit(X_train, y_train)
clf.best_params_

{'n_neighbors': 8, 'weights': 'uniform'}

In [35]:
knn_best = KNN(n_neighbors=8, weights='uniform')
knn_best.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=8, p=2,
                     weights='uniform')

In [36]:
y_pred = knn_best.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 

[[ 42  46]
 [ 10 569]]
              precision    recall  f1-score   support

           0       0.81      0.48      0.60        88
           1       0.93      0.98      0.95       579

    accuracy                           0.92       667
   macro avg       0.87      0.73      0.78       667
weighted avg       0.91      0.92      0.91       667



## Train the best GB model

In [49]:
params_GB = {
    "loss": ["deviance"],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 12),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse",  "mae"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators": [10]
}
gb = GBC()
clf = RandomizedSearchCV(gb, n_iter=5, param_distributions=params_GB, cv=5)
clf.fit(X_train, y_train)
clf.best_params_

{'subsample': 0.9,
 'n_estimators': 10,
 'min_samples_split': 0.5,
 'min_samples_leaf': 0.42727272727272736,
 'max_features': 'sqrt',
 'max_depth': 8,
 'loss': 'deviance',
 'learning_rate': 0.05,
 'criterion': 'mae'}

## Train the best XGB model

In [51]:
params_XGB = {
    'max_depth': [5, 10, 15, 20, 25],
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.15],
    'n_estimators': [500, 1000, 2000, 3000, 5000],
    'min_child_weight': [0, 2, 5, 10, 20], 
    'max_delta_step': [0, 0.2, 0.6, 1, 2],
    'subsample': [0.6, 0.7, 0.8, 0.85, 0.95],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.25, 0.5, 0.75, 1],   # l1 reg
    'reg_lambda': [0.2, 0.4, 0.6, 0.8, 1],  # l2 reg
    'scale_pos_weight': [0.2, 0.4, 0.6, 0.8, 1]
}
xgb = XGBClassifier()
clf = RandomizedSearchCV(xgb, n_iter=5, param_distributions=params_XGB, cv=5)
clf.fit(X_train, y_train)
clf.best_params_

{'subsample': 0.85,
 'scale_pos_weight': 0.6,
 'reg_lambda': 1,
 'reg_alpha': 0,
 'n_estimators': 1000,
 'min_child_weight': 0,
 'max_depth': 5,
 'max_delta_step': 2,
 'learning_rate': 0.1,
 'colsample_bytree': 0.6}

## Train the best RF model

In [38]:
param_RF = {
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}
rf = RF()
clf = GridSearchCV(rf, param_grid=param_RF, cv=5)
clf.fit(X_train, y_train)

clf.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'auto',
 'n_estimators': 500}

In [39]:
clf.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
# show feature importances
sorted(zip(clf.best_estimator_.feature_importances_, list(df)), reverse=True)

[(0.16347004952560484, 'Day Charge'),
 (0.16259757445456946, 'Day Mins'),
 (0.1390712782422415, 'CustServ Calls'),
 (0.0942421328505042, "Int'l Plan"),
 (0.06311794880653726, 'Eve Mins'),
 (0.06099158715392171, 'Eve Charge'),
 (0.04739173909610749, 'Intl Calls'),
 (0.03956862354223581, 'Intl Mins'),
 (0.03517586772585605, 'Intl Charge'),
 (0.03233494438386748, 'VMail Message'),
 (0.030520093320006603, 'Night Charge'),
 (0.030038292572723807, 'Night Mins'),
 (0.024353479368380605, 'VMail Plan'),
 (0.021676622691891127, 'Day Calls'),
 (0.020399653686330937, 'Account Length'),
 (0.018896980940211937, 'Night Calls'),
 (0.01615313163900935, 'Eve Calls')]

In [41]:
rf_best = RF(criterion='gini', max_depth=8,
             max_features='auto', n_estimators=200)
rf_best.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=8, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [42]:
y_pred = rf_best.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred)) 

[[ 68  20]
 [  8 571]]
              precision    recall  f1-score   support

           0       0.89      0.77      0.83        88
           1       0.97      0.99      0.98       579

    accuracy                           0.96       667
   macro avg       0.93      0.88      0.90       667
weighted avg       0.96      0.96      0.96       667



### Save the best tuned model

In [43]:
import joblib
joblib.dump(rf_best, "../model/RF_churn.pkl")

['RF_churn.pkl']

### Load model

In [44]:
loaded_model = joblib.load("../model/RF_churn.pkl")