In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import plot_confusion_matrix, recall_score,\
    roc_auc_score, precision_score, f1_score, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.pipeline import make_pipeline

In [2]:
# Load the churn dataset from the project-relative CSV and preview the first rows.
df = pd.read_csv('data/tele-churn.csv')
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


### The target classes are imbalanced, so we will need to adjust for that

In [None]:
# Share of each target class (normalize=True gives proportions instead of counts).
df['churn'].value_counts(normalize=True)

In [None]:
# Optional: cast the boolean target to 0/1 integers (currently disabled).
#df['churn'] = df['churn'].astype(int)

In [3]:
# Target vector, and the feature frame without the target, phone number and area code.
y = df['churn']
X = df.drop(columns=['churn', 'phone number', 'area code'])

# 70/30 split, stratified on the target so both splits keep similar class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [4]:
# Numeric branch: standardize each numeric column.
subpipe_num = Pipeline(steps=[('ss', StandardScaler())])

# Categorical branch: one-hot encode; handle_unknown='ignore' keeps transform()
# from raising on categories that were not present when the encoder was fit.
subpipe_cat = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, handle_unknown='ignore'))])

num_cols = ['account length', 'number vmail messages', 'total day minutes', 'total day calls',
            'total day charge', 'total eve minutes', 'total eve calls', 'total eve charge',
            'total night minutes', 'total night calls', 'total night charge', 'total intl minutes',
            'total intl calls', 'total intl charge', 'customer service calls']

cat_cols = ['international plan', 'voice mail plan', 'state']

# Route each column group through its branch; any column not listed is passed through unchanged.
CT = ColumnTransformer(transformers=[('subpipe_num', subpipe_num, num_cols),
                                     ('subpipe_cat', subpipe_cat, cat_cols)],
                       remainder='passthrough')

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_ct = CT.fit_transform(X_train)
X_test_ct = CT.transform(X_test)

# Report the shapes of the arrays computed above. Do not call fit_transform on
# X_test here: that would refit the scaler/encoder on test data and leave CT
# fitted on the test split.
X_train_ct.shape, X_test_ct.shape

((2333, 70), (1000, 70))

In [5]:
# Baseline imbalanced-learn pipeline: preprocess -> SMOTE oversampling -> decision tree.
imb_pipe = ImPipeline(
    steps=[
        ('ct', CT),
        ('sm', SMOTE(random_state=42)),
        ('dectree', DecisionTreeClassifier(random_state=42)),
    ]
)

In [6]:
class ModelWithCV():
    '''Hold an estimator together with its cross-validation scores.

    Args:
      model:
        Estimator handed to `cross_val_score`.
      model_name:
        Label used in the printed summary and the plot title.
      X:
        Default feature data used for cross-validation.
      y:
        Default target data used for cross-validation.
      cv_now:
        Optional; if True (default) run `cross_validate()` on construction.
      scoring:
        Optional; `scoring` argument forwarded to `cross_val_score`
        (default 'precision').

    The attributes `cv_results`, `cv_mean`, `cv_median` and `cv_std` are None
    until `cross_validate()` has run.
    '''

    def __init__(self, model, model_name, X, y, cv_now=True, scoring='precision'):
        self.model = model
        self.name = model_name
        self.X = X
        self.y = y
        self.scoring = scoring
        # For CV results (filled in by cross_validate)
        self.cv_results = None
        self.cv_mean = None
        self.cv_median = None
        self.cv_std = None

        if cv_now:
            self.cross_validate()

    def cross_validate(self, X=None, y=None, kfolds=10):
        '''
        Perform cross-validation and store the results on the object
        (`cv_results`, `cv_mean`, `cv_median`, `cv_std`). Returns None.

        Args:
          X:
            Optional; Training data to perform CV on. Otherwise use X from object
          y:
            Optional; Training data to perform CV on. Otherwise use y from object
          kfolds:
            Optional; Number of folds for CV (default is 10)
        '''
        # Compare against None explicitly: `X if X else ...` raises ValueError for
        # DataFrames/arrays (ambiguous truth value) and silently ignores empty inputs.
        cv_X = self.X if X is None else X
        cv_y = self.y if y is None else y

        self.cv_results = cross_val_score(self.model, cv_X, cv_y, cv=kfolds, scoring=self.scoring)
        self.cv_mean = np.mean(self.cv_results)
        self.cv_median = np.median(self.cv_results)
        self.cv_std = np.std(self.cv_results)

    def print_cv_summary(self):
        '''Print the mean and standard deviation of the stored CV scores.'''
        cv_summary = (
        f'''CV Results for `{self.name}` model:
            {self.cv_mean:.5f} ± {self.cv_std:.5f} {self.scoring}
        ''')
        print(cv_summary)

    def plot_cv(self, ax):
        '''
        Plot the cross-validation values using the array of results and given
        Axis for plotting. Returns the same Axis.
        '''
        ax.set_title(f'CV Results for `{self.name}` Model')
        # Thinner violinplot with higher bw
        sns.violinplot(y=self.cv_results, ax=ax, bw=.4)
        # Overlay the individual fold scores on top of the violin
        sns.swarmplot(
                y=self.cv_results,
                color='orange',
                size=10,
                alpha= 0.8,
                ax=ax
        )

        return ax

In [8]:
# Chance-level baseline to compare the real models against.
dummy_clf = DummyClassifier(strategy='stratified', random_state=42).fit(X_train, y_train)
y_pred = dummy_clf.predict(X_test)

# Precision first, then ROC AUC; both are computed from the hard label predictions in y_pred.
print(precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred), sep='\n')

0.14666666666666667
0.5010082677959267


In [9]:
# Same definition as the earlier `imb_pipe` cell; running it rebinds the name
# to a new pipeline object built from CT, SMOTE and a decision tree.
imb_pipe = ImPipeline(
    steps=[
        ('ct', CT),
        ('sm', SMOTE(random_state=42)),
        ('dectree', DecisionTreeClassifier(random_state=42)),
    ]
)

In [10]:
# Fit the baseline pipeline on the training split and predict the held-out test split.
y_pred = imb_pipe.fit(X_train, y_train).predict(X_test)

# Precision first, then ROC AUC; both are computed from the hard label predictions in y_pred.
print(precision_score(y_test, y_pred), roc_auc_score(y_test, y_pred), sep='\n')

0.550561797752809
0.7911474087517645


In [12]:
# Cross-validate the baseline pipeline on the training split. ModelWithCV runs the
# CV immediately because cv_now defaults to True; then print the summary.
tree_pipe = ModelWithCV(imb_pipe, model_name='tree_pipe', X=X_train, y=y_train)
tree_pipe.print_cv_summary()

In [13]:
# Re-print the stored summary; this does not re-run cross-validation.
tree_pipe.print_cv_summary()

CV Results for `tree_pipe` model:
            0.59859 ± 0.07048 precision
        


In [11]:
# List the pipeline's parameter names; the `<step>__<param>` entries are the keys
# used in the GridSearchCV parameter grids.
imb_pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'ct', 'sm', 'dectree', 'ct__n_jobs', 'ct__remainder', 'ct__sparse_threshold', 'ct__transformer_weights', 'ct__transformers', 'ct__verbose', 'ct__subpipe_num', 'ct__subpipe_cat', 'ct__subpipe_num__memory', 'ct__subpipe_num__steps', 'ct__subpipe_num__verbose', 'ct__subpipe_num__ss', 'ct__subpipe_num__ss__copy', 'ct__subpipe_num__ss__with_mean', 'ct__subpipe_num__ss__with_std', 'ct__subpipe_cat__memory', 'ct__subpipe_cat__steps', 'ct__subpipe_cat__verbose', 'ct__subpipe_cat__ohe', 'ct__subpipe_cat__ohe__categories', 'ct__subpipe_cat__ohe__drop', 'ct__subpipe_cat__ohe__dtype', 'ct__subpipe_cat__ohe__handle_unknown', 'ct__subpipe_cat__ohe__sparse', 'sm__k_neighbors', 'sm__n_jobs', 'sm__random_state', 'sm__sampling_strategy', 'dectree__ccp_alpha', 'dectree__class_weight', 'dectree__criterion', 'dectree__max_depth', 'dectree__max_features', 'dectree__max_leaf_nodes', 'dectree__min_impurity_decrease', 'dectree__min_impurity_split', 'dectree__min_samples

In [15]:
# Hyper-parameter grid for the baseline pipeline. Keys are `<step>__<param>` names
# taken from imb_pipe.get_params(). 4 * 3 * 3 * 3 * 2 = 216 candidates.
# The former 'steps' entry was removed: it referenced the undefined names
# `ct`, `sm` and `dectree`, which raises NameError on a fresh kernel.
params = {
    'dectree__max_depth': [None, 10, 20, 30],
    'dectree__min_samples_split': [2, 5, 10],
    'dectree__min_samples_leaf': [1, 2, 4],
    'sm__sampling_strategy': ['auto', 'minority', 0.3],
    'dectree__criterion': ['gini', 'entropy'],
}

# 10-fold cross-validated search scored on precision, run in parallel.
gs = GridSearchCV(estimator=imb_pipe,
                  param_grid=params,
                  cv=10, verbose=3, n_jobs=-1, scoring='precision')

In [16]:
# Run the grid search on the training split only.
gs.fit(X_train, y_train)

Fitting 10 folds for each of 216 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 864 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1440 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:   13.6s finished


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         ['account '
                                                                          'length',
                                                                          'number '
                                                                          'vmail '
                                                                          'messages',
                                                                          'total '
                                                   

In [17]:
# Parameter combination with the best cross-validated score.
gs.best_params_

{'dectree__criterion': 'gini',
 'dectree__max_depth': 10,
 'dectree__min_samples_leaf': 2,
 'dectree__min_samples_split': 5,
 'sm__sampling_strategy': 0.3}

In [18]:
# Mean cross-validated precision (the search's `scoring`) of the best combination.
gs.best_score_

0.8196184252343874

In [38]:
# Keep the best pipeline found by the most recently fitted grid search.
final_model = gs.best_estimator_

In [39]:
# Predictions of the tuned pipeline on the held-out test split.
y_hat = final_model.predict(X_test)

In [40]:
# Report recall, precision and F1 of the tuned pipeline on the test split,
# each rounded to two decimals.
print(f"""
Our final model's recall on the test set is {round(recall_score(y_test, y_hat), 2)} \n
Our final model's precision on the test set is {round(precision_score(y_test, y_hat), 2)} \n
Our final model's f1-score on the test is {round(f1_score(y_test, y_hat), 2)}.
""")


Our final model's recall on the test set is 0.71 

Our final model's precision on the test set is 0.82 

Our final model's f1-score on the test is 0.76.



In [34]:
# Wider search: the same depth/split/leaf/sampling/criterion options as the first
# grid plus class weighting, feature and leaf-node limits, an impurity threshold
# and the splitter strategy.
# 4 * 3 * 3 * 3 * 2 * 2 * 2 * 2 * 2 * 2 = 6912 candidates.
params = {
    'dectree__max_depth': [None, 10, 20, 30],
    'dectree__min_samples_split': [2, 5, 10],
    'dectree__min_samples_leaf': [1, 2, 4],
    'sm__sampling_strategy': ['auto', 'minority', 0.3],
    'dectree__criterion': ['gini', 'entropy'],
    'dectree__class_weight': [None, 'balanced'],
    'dectree__max_features': [None, 'sqrt'],
    'dectree__max_leaf_nodes': [None, 10],
    'dectree__min_impurity_decrease': [0.0, 0.1],
    'dectree__splitter': ['best', 'random'],
}

# 10-fold cross-validated search scored on precision, run in parallel.
gs = GridSearchCV(
    estimator=imb_pipe,
    param_grid=params,
    scoring='precision',
    cv=10,
    n_jobs=-1,
    verbose=3,
)

In [35]:
# Run the extended grid search on the training split only.
gs.fit(X_train, y_train)

Fitting 10 folds for each of 6912 candidates, totalling 69120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 1568 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 2720 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 4128 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 5792 tasks      | elapsed:   20.2s
[Parallel(n_jobs=-1)]: Done 7712 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 9888 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 12320 tasks      | elapsed:   39.3s
[Parallel(n_jobs=-1)]: Done 15008 tasks      | elapsed:   48.0s
[Parallel(n_jobs=-1)]: Done 17952 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 21152 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 24608 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 28320 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 32288 tas

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('subpipe_num',
                                                                         Pipeline(steps=[('ss',
                                                                                          StandardScaler())]),
                                                                         ['account '
                                                                          'length',
                                                                          'number '
                                                                          'vmail '
                                                                          'messages',
                                                                          'total '
                                                   

In [36]:
# Best parameter combination from the extended search.
gs.best_params_

{'dectree__class_weight': None,
 'dectree__criterion': 'gini',
 'dectree__max_depth': 10,
 'dectree__max_features': None,
 'dectree__max_leaf_nodes': None,
 'dectree__min_impurity_decrease': 0.0,
 'dectree__min_samples_leaf': 2,
 'dectree__min_samples_split': 5,
 'dectree__splitter': 'best',
 'sm__sampling_strategy': 0.3}

In [37]:
# Mean cross-validated precision of the best combination from the extended search.
gs.best_score_

0.8196184252343874

In [None]:
# Keep the best pipeline found by the most recently fitted grid search.
final_model = gs.best_estimator_

In [51]:
# Tuned pipeline: precision on the training split, then on the test split.
y_train_pred = final_model.predict(X_train)
y_test_pred = final_model.predict(X_test)
print(
    precision_score(y_train, y_train_pred),
    precision_score(y_test, y_test_pred),
    sep='\n',
)

0.9826388888888888
0.824


In [49]:
# Untuned baseline pipeline: precision on the training split, then on the test split.
y_train_pred = imb_pipe.predict(X_train)
y_test_pred = imb_pipe.predict(X_test)
print(
    precision_score(y_train, y_train_pred),
    precision_score(y_test, y_test_pred),
    sep='\n',
)

1.0
0.550561797752809


In [50]:
# Dummy classifier: precision on the training split, then on the test split.
y_train_pred = dummy_clf.predict(X_train)
y_test_pred = dummy_clf.predict(X_test)
print(
    precision_score(y_train, y_train_pred),
    precision_score(y_test, y_test_pred),
    sep='\n',
)

0.12571428571428572
0.14666666666666667


In [52]:
# Cross-validated precision of the untuned baseline pipeline on the training split.
tree_pipe = ModelWithCV(imb_pipe, model_name='tree_pipe', X=X_train, y=y_train)
tree_pipe.print_cv_summary()

CV Results for `tree_pipe` model:
            0.59859 ± 0.07048 precision
        


In [53]:
# Cross-validated precision of the tuned pipeline on the training split.
# Labelled 'final_model' so the printed summary names the model actually evaluated
# (it was previously labelled 'tree_pipe', the same as the baseline run).
tree_pipe = ModelWithCV(final_model, model_name='final_model', X=X_train, y=y_train)
tree_pipe.print_cv_summary()

CV Results for `tree_pipe` model:
            0.81962 ± 0.08986 precision
        


In [54]:
# Cross-validated precision of the dummy classifier on the training split.
# Labelled 'dummy_clf' so the printed summary names the model actually evaluated
# (it was previously labelled 'tree_pipe', the same as the baseline run).
tree_pipe = ModelWithCV(dummy_clf, model_name='dummy_clf', X=X_train, y=y_train)
tree_pipe.print_cv_summary()

CV Results for `tree_pipe` model:
            0.15000 ± 0.03590 precision
        
