In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# 1. Lectura de datos

In [None]:
# Load the churn dataset; the file is pipe-delimited and read from a path relative to the notebook.
telco_churn = pd.read_csv('../Data/telco_churn_2.csv',sep='|')

###### Eliminar columnas correlacionadas

In [None]:
# Columns the notebook treats as correlated with other features and removes before modelling.
columnas_correlacionadas = [
    'voice mail plan',
    'total day charge',
    'total eve charge',
    'total night charge',
    'total intl charge',
]
# Rebinding instead of inplace=True, combined with errors='ignore', keeps this cell
# safe to re-run: a second execution no longer raises KeyError for columns that
# were already removed.
# NOTE(review): errors='ignore' also hides a misspelled column name — confirm the
# five names against the CSV header.
telco_churn = telco_churn.drop(columns=columnas_correlacionadas, errors='ignore')

In [None]:
# Preview the first rows of the frame after the column drop.
telco_churn.head()

# 2. Árbol de decisión

## Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Names of every column whose dtype is not `object` (i.e. the non-string columns).
lista_numericas = telco_churn.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
# Columns to leave out of the feature list; 'churn' is the column later used as the target.
descartar = ['churn']

In [None]:
# Feature columns: the non-object columns minus the ones listed in `descartar`.
columnas_modelo = [
    columna
    for columna in lista_numericas
    if columna not in descartar
]

In [None]:
# Scaler created with its default settings.
minmax = MinMaxScaler()

In [None]:
# Fit the scaler on the feature columns of the whole frame and scale them in one step.
# NOTE(review): the scaler is fit before the train/test split, and the models further
# down are trained on the unscaled telco_churn columns — confirm whether
# telco_churn_minmax is meant to be used at all.
telco_churn_minmax = minmax.fit_transform(telco_churn[columnas_modelo])

In [None]:
# Preview the unscaled feature columns, for comparison with the scaled values.
telco_churn[columnas_modelo].head()

In [None]:
# Wrap the scaled values in a DataFrame. Besides restoring the column names, reuse
# the row index of telco_churn so the scaled rows stay aligned with the source frame
# (and with the target column) even if that index is not the default 0..n-1 range.
telco_churn_minmax = pd.DataFrame(
    telco_churn_minmax,
    columns=columnas_modelo,
    index=telco_churn.index,
)
telco_churn_minmax.head()

### Árbol de decisión

Utilizaremos la librería sklearn

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score
import itertools

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map and show it.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the 'True label' axis and
        columns on the 'Predicted label' axis.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        When True, every row is divided by its sum before plotting, so
        both the colours and the cell text show row fractions, and a
        short notice is printed.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the heat map and the annotations agree.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any sample stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractions are rounded to two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [None]:
def show_data(cm, print_res = 0):
    """
    Derive precision, recall and fall-out from a 2x2 confusion matrix.

    `cm` is indexed as cm[row, col], with index 1 taken as the positive
    class: cm[1, 1] are true positives, cm[1, 0] false negatives,
    cm[0, 1] false positives and cm[0, 0] true negatives.

    When `print_res` equals 1 the three metrics are also printed.

    Returns the tuple (precision, recall, fallout).
    """
    true_neg, false_pos = cm[0, 0], cm[0, 1]
    false_neg, true_pos = cm[1, 0], cm[1, 1]

    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    fallout = false_pos / (false_pos + true_neg)

    if print_res == 1:
        report_lines = (
            ('Precision =     {:.3f}', precision),
            ('Recall (TPR) =  {:.3f}', recall),
            ('Fallout (FPR) = {:.3e}', fallout),
        )
        for template, value in report_lines:
            print(template.format(value))

    return precision, recall, fallout

Definimos las variables independientes y la dependiente

In [None]:
# Recompute the list of non-object columns of telco_churn.
lista_numericas = telco_churn.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
# The target column must not appear among the model features.
descartar = ['churn']

In [None]:
# Independent variables: every non-object column that is not in `descartar`.
columnas_modelo = [
    columna
    for columna in lista_numericas
    if columna not in descartar
]

In [None]:
# Feature matrix (unscaled columns of telco_churn) and target vector.
X, y = telco_churn[columnas_modelo], telco_churn['churn']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Creamos el modelo

In [None]:
# Shallow tree (at most 3 levels) that picks splits by Gini impurity; seeded for repeatability.
modelo_arbol = DecisionTreeClassifier(
    max_depth=3,
    criterion='gini',
    random_state=17,
)

Entrenar 

In [None]:
# Train the tree on the training split only.
modelo_arbol.fit(X_train,y_train)

Predicción

In [None]:
# Per-class probabilities and hard class labels for the held-out test rows.
prediccion_test_proba = modelo_arbol.predict_proba(X_test)
prediccion_test = modelo_arbol.predict(X_test)

Accuracy

In [None]:
# Fraction of test rows whose predicted label equals the true label.
print('Accuracy Score:', accuracy_score(y_test, prediccion_test), sep='\n')

AUC

In [None]:
# AUC on the test split, scored with column 1 of the predicted probabilities.
roc_auc_score(y_test, prediccion_test_proba[:,1])

Matriz de confusión

In [None]:
# Confusion matrix on the test split. Passing the fitted classes as `labels` pins
# the matrix to one row/column per class known to the model, in classes_ order,
# even when a class is absent from y_test or from the predictions; the two tick
# labels passed below then always match the matrix shape for a binary target.
matriz_confusion = confusion_matrix(y_test, prediccion_test, labels=modelo_arbol.classes_)
if modelo_arbol.classes_[0] == 1:
    # Reverse both axes so the class labelled 1 moves to the last row/column.
    matriz_confusion = matriz_confusion[::-1, ::-1]

# NOTE(review): the tick labels assume the first row/column is the non-churn class — confirm.
plot_confusion_matrix(matriz_confusion, ['no', 'yes'])

In [None]:
# Text report of per-class metrics on the test split.
print(classification_report(y_test, prediccion_test))

In [None]:
# ROC curve of the fitted tree on the test split.
# The scores in column 1 of predict_proba are computed once and reused for both
# the AUC value and the curve.
proba_positiva = modelo_arbol.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, proba_positiva)  # variable name kept for compatibility
fpr, tpr, thresholds = roc_curve(y_test, proba_positiva)
plt.figure()
# modelo_arbol is a DecisionTreeClassifier, so the legend names it as such.
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): every ROC cell in this notebook saves to 'Log_ROC', so each one
# overwrites the image written by the previous cell — confirm whether per-model
# file names are wanted.
plt.savefig('Log_ROC')
plt.show()

In [None]:
from ipywidgets import Image
from io import StringIO
import pydotplus #pip install pydotplus
from sklearn.tree import export_graphviz

In [None]:
# Render the fitted tree: write its DOT description into an in-memory buffer,
# turn that into a PNG with pydotplus, and display the image as a widget.
dot_data = StringIO()
export_graphviz(
    modelo_arbol,
    out_file=dot_data,
    feature_names=X_train.columns,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

### Entropía

In [None]:
# Rebind modelo_arbol to a depth-3 tree that picks splits by entropy instead of Gini.
modelo_arbol = DecisionTreeClassifier(
    max_depth=3,
    criterion='entropy',
    random_state=17,
)

In [None]:
# Train the entropy-based tree on the training split.
modelo_arbol.fit(X_train,y_train)

In [None]:
# Overwrite the earlier predictions with those of the current tree.
prediccion_test_proba = modelo_arbol.predict_proba(X_test)
prediccion_test = modelo_arbol.predict(X_test)

Accuracy

In [None]:
# Fraction of test rows classified correctly by the current tree.
print('Accuracy Score:', accuracy_score(y_test, prediccion_test), sep='\n')

AUC

In [None]:
# AUC on the test split, scored with column 1 of the predicted probabilities.
roc_auc_score(y_test, prediccion_test_proba[:,1])

Matriz de confusión

In [None]:
# Confusion matrix on the test split. Passing the fitted classes as `labels` pins
# the matrix to one row/column per class known to the model, in classes_ order,
# even when a class is absent from y_test or from the predictions; the two tick
# labels passed below then always match the matrix shape for a binary target.
matriz_confusion = confusion_matrix(y_test, prediccion_test, labels=modelo_arbol.classes_)
if modelo_arbol.classes_[0] == 1:
    # Reverse both axes so the class labelled 1 moves to the last row/column.
    matriz_confusion = matriz_confusion[::-1, ::-1]

# NOTE(review): the tick labels assume the first row/column is the non-churn class — confirm.
plot_confusion_matrix(matriz_confusion, ['no', 'yes'])

In [None]:
# Text report of per-class metrics on the test split.
print(classification_report(y_test, prediccion_test))

In [None]:
# ROC curve of the fitted tree on the test split.
# The scores in column 1 of predict_proba are computed once and reused for both
# the AUC value and the curve.
proba_positiva = modelo_arbol.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, proba_positiva)  # variable name kept for compatibility
fpr, tpr, thresholds = roc_curve(y_test, proba_positiva)
plt.figure()
# modelo_arbol is a DecisionTreeClassifier, so the legend names it as such.
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): every ROC cell in this notebook saves to 'Log_ROC', so each one
# overwrites the image written by the previous cell — confirm whether per-model
# file names are wanted.
plt.savefig('Log_ROC')
plt.show()

In [None]:
# Render the current tree: write its DOT description into an in-memory buffer,
# turn that into a PNG with pydotplus, and display the image as a widget.
dot_data = StringIO()
export_graphviz(
    modelo_arbol,
    out_file=dot_data,
    feature_names=X_train.columns,
    filled=True,
)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

### Entropía y max_depth=6

In [None]:
# Rebind modelo_arbol to a deeper entropy-based tree (up to 6 levels), same seed.
modelo_arbol = DecisionTreeClassifier(
    max_depth=6,
    criterion='entropy',
    random_state=17,
)

Entrenar

In [None]:
# Train the depth-6 tree on the training split.
modelo_arbol.fit(X_train,y_train)

Predicción

In [None]:
# Overwrite the earlier predictions with those of the depth-6 tree.
prediccion_test_proba = modelo_arbol.predict_proba(X_test)
prediccion_test = modelo_arbol.predict(X_test)

Accuracy

In [None]:
# Fraction of test rows classified correctly by the depth-6 tree.
print('Accuracy Score:', accuracy_score(y_test, prediccion_test), sep='\n')

AUC

In [None]:
# AUC on the test split, scored with column 1 of the predicted probabilities.
roc_auc_score(y_test, prediccion_test_proba[:,1])

Matriz de confusión

In [None]:
# Confusion matrix on the test split. Passing the fitted classes as `labels` pins
# the matrix to one row/column per class known to the model, in classes_ order,
# even when a class is absent from y_test or from the predictions; the two tick
# labels passed below then always match the matrix shape for a binary target.
matriz_confusion = confusion_matrix(y_test, prediccion_test, labels=modelo_arbol.classes_)
if modelo_arbol.classes_[0] == 1:
    # Reverse both axes so the class labelled 1 moves to the last row/column.
    matriz_confusion = matriz_confusion[::-1, ::-1]

# NOTE(review): the tick labels assume the first row/column is the non-churn class — confirm.
plot_confusion_matrix(matriz_confusion, ['no', 'yes'])

In [None]:
# Text report of per-class metrics on the test split.
print(classification_report(y_test, prediccion_test))

In [None]:
# ROC curve of the fitted tree on the test split.
# The scores in column 1 of predict_proba are computed once and reused for both
# the AUC value and the curve.
proba_positiva = modelo_arbol.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, proba_positiva)  # variable name kept for compatibility
fpr, tpr, thresholds = roc_curve(y_test, proba_positiva)
plt.figure()
# modelo_arbol is a DecisionTreeClassifier, so the legend names it as such.
plt.plot(fpr, tpr, label='Decision Tree (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): every ROC cell in this notebook saves to 'Log_ROC', so each one
# overwrites the image written by the previous cell — confirm whether per-model
# file names are wanted.
plt.savefig('Log_ROC')
plt.show()