In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../../data/telecom_churn.csv')

In [4]:
df.head()

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [5]:
df.drop(['State', 'Voice mail plan'], axis=1, inplace=True)

In [9]:
df['International plan'] = df['International plan'].map({"Yes": 1, "No": 0});

In [10]:
df.head()

Unnamed: 0,Account length,Area code,International plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,128,415,0,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,107,415,0,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,137,415,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,84,408,1,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,75,415,1,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [11]:
y = df['Churn'].astype('int')

In [16]:
X = df.drop(['Churn'], axis=1)

In [17]:
y.shape, X.shape

((3333,), (3333, 17))

In [22]:
from sklearn.model_selection import train_test_split, cross_val_score

In [18]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=17)

In [19]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((2333, 17), (1000, 17), (2333,), (1000,))

In [20]:
first_tree = DecisionTreeClassifier(random_state=17)

In [74]:
cross_val_score(first_tree, X_train, y_train, cv=5)

array([ 0.9143469 ,  0.91220557,  0.92291221,  0.90772532,  0.91416309])

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
first_knn = KNeighborsClassifier()

In [29]:
cross_val_score(first_knn, X_train, y_train, cv=5).mean()

0.86712740439845226

## настраиваем максимальную глубину для дерева max_depth

In [31]:
from sklearn.model_selection import GridSearchCV

In [33]:
tree_params = {"max_depth": np.arange(1, 11),
               'max_features': [.5, .7, 1]}

In [34]:
tree_grid = GridSearchCV(first_tree, tree_params, cv=5, n_jobs=-1)

In [35]:
%%time
tree_grid.fit(X_train, y_train)

CPU times: user 200 ms, sys: 30 ms, total: 230 ms
Wall time: 1.07 s


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=17, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'max_features': [0.5, 0.7, 1], 'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [36]:
tree_grid.best_score_

0.93913416202314615

In [37]:
tree_grid.best_params_

{'max_depth': 6, 'max_features': 0.7}

In [51]:
knn_params = {"n_neighbors": list(range(1, 102, 5))}
print(knn_params)

{'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81, 86, 91, 96, 101]}


In [52]:
knn_grid = GridSearchCV(first_knn, knn_params, cv=5)

In [53]:
%%time
knn_grid.fit(X_train, y_train)

CPU times: user 14.8 s, sys: 130 ms, total: 14.9 s
Wall time: 14.9 s


GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [1, 6, 11, 16, 21, 26, 31, 36, 41, 46, 51, 56, 61, 66, 71, 76, 81, 86, 91, 96, 101]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [54]:
knn_grid.best_params_, knn_grid.best_score_

({'n_neighbors': 11}, 0.87055293613373341)

In [58]:
tree_valid_pred = tree_grid.predict(X_valid)

In [59]:
accuracy_score(y_valid, tree_valid_pred)

0.93600000000000005

In [65]:
1 - np.mean(y)

0.85508550855085508

In [68]:
export_graphviz(tree_grid.best_estimator_,
                out_file='telecom_tree.dot',
                feature_names=X.columns,
                filled=True)

In [70]:
!dot -Tpng telecom_tree.dot -o tele_tree.png

/bin/sh: 1: dot: not found


In [71]:
second_tree = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

In [72]:
second_tree.score(X_valid, y_valid)

0.90500000000000003