In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
import plotly.express as px
# Default canvas of 15x5 inches for every matplotlib figure created afterwards.
plt.rcParams["figure.figsize"] = (15, 5)
# Allow pandas to render up to 500 columns before truncating wide frames.
pd.set_option("display.max_columns", 500)

# Lectura de datos

In [None]:
# Load the pipe-delimited churn file into a DataFrame.
dataset = pd.read_csv("../Data/telco_churn.csv", delimiter="|")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

### Target

In [None]:
# Rows per churn value, expressed as a percentage of all rows in the frame.
dataset["churn"].value_counts().mul(100).div(len(dataset))

Convertir el target a variable numérica

In [None]:
# Encode the target as integers: 1 where the value equals True, 0 otherwise.
dataset["churn"] = dataset["churn"].eq(True).astype("int64")

In [None]:
# Same percentage breakdown, now over the integer-encoded target.
dataset["churn"].value_counts().mul(100).div(len(dataset))

### Variables categóricas

Eliminar número de celular

In [None]:
# Remove the phone-number column (presumably an identifier with no
# predictive value — TODO confirm).
dataset = dataset.drop(columns=["phone number"])

Las variables 'international plan' y 'voice mail plan' solo tienen dos posibles valores, por lo que las convertiremos a variables binarias.

In [None]:
# Binary-encode the flag: 1 for 'yes', 0 for anything else.
# Accepting an already-encoded 1 as well makes the cell idempotent; the
# previous `1 if x=='yes' else 0` lambda silently turned the whole column
# into zeros when the cell was executed a second time.
dataset['international plan'] = (
    dataset['international plan'].isin(['yes', 1]).astype('int64')
)

In [None]:
# Binary-encode the flag: 1 for 'yes', 0 for anything else.
# Accepting an already-encoded 1 as well makes the cell idempotent; the
# previous `1 if x=='yes' else 0` lambda silently turned the whole column
# into zeros when the cell was executed a second time.
dataset['voice mail plan'] = (
    dataset['voice mail plan'].isin(['yes', 1]).astype('int64')
)

### Variables para el modelo

In [None]:
# Columns kept for modelling: the two binary plan flags, the usage counters
# and the target. Every other column is left out of the working frame.
columnas_modelo = [
    'international plan',
    'voice mail plan',
    'number vmail messages',
    'total day minutes',
    'total day calls',
    'total day charge',
    'total eve minutes',
    'total eve calls',
    'total eve charge',
    'total night minutes',
    'total night calls',
    'total night charge',
    'total intl minutes',
    'total intl calls',
    'total intl charge',
    'customer service calls',
    'churn',
]
# .copy() detaches the result from the original frame before later edits.
dataset = dataset.loc[:, columnas_modelo].copy()

### Variables correlacionadas

In [None]:
# Pairwise Pearson correlation between all columns of the modelling frame.
matriz_correlacion = dataset.corr(method="pearson")

In [None]:
# Mask the upper triangle (diagonal included): the matrix is symmetric, so
# those cells would only repeat the lower triangle.
# The builtin `bool` replaces `np.bool`, an alias that was removed in
# NumPy 1.24 and now raises AttributeError.
mask = np.triu(np.ones_like(matriz_correlacion, dtype=bool))

# Square canvas to match the square heatmap cells.
f, ax = plt.subplots(figsize=(15, 15))

# Diverging colormap; `center=0` below pins its midpoint to zero correlation.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# vmax=.3 caps the colour scale at 0.3, so stronger correlations share the
# same colour; the annotations still print the exact values.
# The axes are passed explicitly and the trailing `;` hides the Axes repr.
sns.heatmap(matriz_correlacion, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5},
            annot=True, fmt='.2f', ax=ax);

Correlación de todas las variables

In [None]:
# Absolute correlations: the sign is discarded, only the strength is kept.
corr_matrix = abs(dataset.corr())

Obtener todas las variables con una correlación mayor a 0.9

In [None]:
# (row positions, column positions) of every cell above 0.9. The diagonal is
# included at this point because each column correlates 1.0 with itself.
high_corr_var = np.nonzero(corr_matrix.to_numpy() > 0.9)

In [None]:
# Recompute the positions locally instead of reading them back from
# `high_corr_var`. The previous version consumed and overwrote the same name
# with a different kind of object, so re-running the cell failed.
filas, columnas = np.where(corr_matrix > 0.9)

# Keeping only i < j drops the diagonal (self-correlation) and the mirrored
# duplicate of each pair; the former extra `x != y` test was redundant.
high_corr_var = [
    (corr_matrix.columns[i], corr_matrix.columns[j])
    for i, j in zip(filas, columnas)
    if i < j
]

In [None]:
# Display the (column, column) pairs whose absolute correlation exceeds 0.9.
high_corr_var

Eliminaremos las columnas 'voice mail plan', 'total day charge', 'total eve charge', 'total night charge' y 'total intl charge'.

In [None]:
# Columns removed before modelling (presumably one member of each highly
# correlated pair found by the correlation check — verify against its output).
columnas_redundantes = [
    'voice mail plan',
    'total day charge',
    'total eve charge',
    'total night charge',
    'total intl charge',
]
dataset = dataset.drop(columns=columnas_redundantes)

# Partición de datos

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Feature matrix: every column except the target.
X = dataset.drop(columns=["churn"]).copy()
# Target vector: the integer-encoded churn label.
y = dataset["churn"]

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# `stratify=y` keeps the churn rate equal in both partitions. A purely random
# split can leave train and test with different class proportions, which
# distorts the metrics when the target is imbalanced (churn targets usually
# are — compare with the value_counts output earlier in the notebook).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Regresión logística

![regresion_logistica](https://miro.medium.com/max/640/1*CYAn9ACXrWX3IneHSoMVOQ.gif)

In [None]:
from sklearn.linear_model import LogisticRegression

Creamos el modelo

In [None]:
# Iteration cap raised to 1000 (scikit-learn's default is 100), presumably so
# the solver can converge on these unscaled features.
modelo_regresion_logistica = LogisticRegression(
    max_iter=1_000,
)

In [None]:
# Print the estimator's text representation to check its configuration.
print(modelo_regresion_logistica)

Entrenamos el modelo

In [None]:
# Fit the logistic regression on the training partition only.
modelo_regresion_logistica.fit(X=X_train, y=y_train)

Predicción

In [None]:
# Hard class labels for both partitions; the two calls are independent.
prediccion_train = modelo_regresion_logistica.predict(X=X_train)
prediccion_test = modelo_regresion_logistica.predict(X=X_test)

In [None]:
# Display the predicted labels for the test partition.
prediccion_test

Matriz de confusión

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, accuracy_score

In [None]:
# Confusion matrix on the test partition: rows are the true classes,
# columns the predicted classes.
matriz_confusion = confusion_matrix(y_true=y_test, y_pred=prediccion_test)
matriz_confusion

###### Accuracy

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=prediccion_test))

###### Probabilidades

In [None]:
# Class probabilities for the test partition: one row per sample, one
# column per class.
prediccion_test_proba = modelo_regresion_logistica.predict_proba(X=X_test)
prediccion_test_proba

###### AUC

In [None]:
# ROC AUC computed from the second probability column (the score of class 1).
roc_auc_score(y_true=y_test, y_score=prediccion_test_proba[:, 1])

# KNN

![knn](https://www.feedingthemachine.ai/wp-content/uploads/2019/06/0_jqxx3-dJqFjXD6FA.png)

---
![knn](https://machinelearningknowledge.ai/wp-content/uploads/2018/08/KNN-Classification.gif)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

Creamos el modelo

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN classifies by distance, so on raw features the columns with the largest
# numeric range (e.g. minutes) dominate the neighbour search. Standardising
# first puts every feature on a comparable scale.
# The pipeline learns the scaling statistics during fit() and reuses them in
# predict()/predict_proba(), so nothing leaks from the test partition. It
# exposes the same fit/predict/predict_proba interface as the bare estimator.
modelo_knn = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=5),
)

In [None]:
# Print the estimator's text representation to check its configuration.
print(modelo_knn)

Entrenamos el modelo

In [None]:
# Fit the KNN model on the training partition only.
modelo_knn.fit(X=X_train, y=y_train)

Predicción

In [None]:
# Hard class labels for both partitions; the two calls are independent.
prediccion_train = modelo_knn.predict(X=X_train)
prediccion_test = modelo_knn.predict(X=X_test)

In [None]:
# Display the predicted labels for the test partition.
prediccion_test

Matriz de confusión

In [None]:
# Confusion matrix on the test partition: rows are the true classes,
# columns the predicted classes.
matriz_confusion = confusion_matrix(y_true=y_test, y_pred=prediccion_test)
matriz_confusion

###### Accuracy

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=prediccion_test))

###### Probabilidades

In [None]:
# Class probabilities for the test partition: one row per sample, one
# column per class.
prediccion_test_proba = modelo_knn.predict_proba(X=X_test)
prediccion_test_proba

###### AUC

In [None]:
# ROC AUC computed from the second probability column (the score of class 1).
roc_auc_score(y_true=y_test, y_score=prediccion_test_proba[:, 1])

# SVM

![svm](https://miro.medium.com/max/681/1*csqbt5-K4GVi4i4Lrcx_eA.png)

---
![svm](https://miro.medium.com/max/700/0*Ojchw_Exefs4qiok.)

In [None]:
from sklearn.svm import SVC

Creamos el modelo

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# SVC's default RBF kernel works on distances between samples, so unscaled
# features with large numeric ranges dominate the kernel. Standardise first.
# The pipeline learns the scaling statistics during fit() only and exposes
# the same fit/predict/predict_proba interface as the bare estimator.
# probability=True fits an internal cross-validated calibration that shuffles
# the data; random_state pins that shuffle so predict_proba (and the AUC
# derived from it) is reproducible between runs.
modelo_svm = make_pipeline(
    StandardScaler(),
    SVC(C=0.1, probability=True, random_state=42),
)

In [None]:
# Print the estimator's text representation to check its configuration.
print(modelo_svm)

Entrenamos el modelo

In [None]:
# Fit the SVM on the training partition only.
modelo_svm.fit(X=X_train, y=y_train)

Predicción

In [None]:
# Hard class labels for both partitions; the two calls are independent.
prediccion_train = modelo_svm.predict(X=X_train)
prediccion_test = modelo_svm.predict(X=X_test)

In [None]:
# Display the predicted labels for the test partition.
prediccion_test

Matriz de confusión

In [None]:
# Confusion matrix on the test partition: rows are the true classes,
# columns the predicted classes.
matriz_confusion = confusion_matrix(y_true=y_test, y_pred=prediccion_test)
matriz_confusion

###### Accuracy

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=prediccion_test))

###### Probabilidades

In [None]:
# Class probabilities for the test partition: one row per sample, one
# column per class.
prediccion_test_proba = modelo_svm.predict_proba(X=X_test)
prediccion_test_proba

###### AUC

In [None]:
# ROC AUC computed from the second probability column (the score of class 1).
roc_auc_score(y_true=y_test, y_score=prediccion_test_proba[:, 1])

# Árbol de decisión

![arbol_decision](https://dimensionless.in/wp-content/uploads/RandomForest_blog_files/figure-html/dt_boundary.png)

In [None]:
from sklearn.tree import DecisionTreeClassifier

Creamos el modelo

In [None]:
# Shallow tree (depth capped at 3) split on Gini impurity; the fixed seed
# keeps the fitted tree reproducible.
modelo_arbol = DecisionTreeClassifier(
    random_state=17,
    max_depth=3,
    criterion="gini",
)

In [None]:
# Print the estimator's text representation to check its configuration.
print(modelo_arbol)

Entrenamos el modelo

In [None]:
# Fit the decision tree on the training partition only.
modelo_arbol.fit(X=X_train, y=y_train)

Predicción

In [None]:
# Class probabilities and hard labels for the test partition; the two calls
# are independent of each other.
prediccion_test_proba = modelo_arbol.predict_proba(X=X_test)
prediccion_test = modelo_arbol.predict(X=X_test)

In [None]:
# Display the predicted labels for the test partition.
prediccion_test

Matriz de confusión

In [None]:
# Confusion matrix on the test partition: rows are the true classes,
# columns the predicted classes.
matriz_confusion = confusion_matrix(y_true=y_test, y_pred=prediccion_test)
matriz_confusion

###### Accuracy

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=prediccion_test))

###### Probabilidades

In [None]:
# Class probabilities for the test partition: one row per sample, one
# column per class.
prediccion_test_proba = modelo_arbol.predict_proba(X=X_test)
prediccion_test_proba

###### AUC

In [None]:
# ROC AUC computed from the second probability column (the score of class 1).
roc_auc_score(y_true=y_test, y_score=prediccion_test_proba[:, 1])