# Predicción de abandono del servicio en Telecom X

## Preparación de los datos

In [3]:
import pandas as pd

In [4]:
# Relative path of the CSV file to load.
path = 'datos/datos_tratados.csv'

# Read the file into a DataFrame and preview five randomly drawn rows.
df = pd.read_csv(filepath_or_buffer=path)
df.sample(n=5)

Unnamed: 0,CustomerID,Churn,Gender,SeniorCitizen,Partner,Dependents,Tenure,PhoneService,MultipleLines,InternetService,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,ChargesDaily,ChargesMonthly,ChargesTotal
4153,5872-OEQNH,No,Female,No,No,No,60,No,No,DSL,...,Yes,No,No,Yes,One year,Yes,Electronic check,1.48,44.45,2773.9
2309,3308-JSGML,No,Male,Yes,Yes,No,59,No,No,DSL,...,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),2.13,64.05,3886.85
3555,5038-ETMLM,No,Female,No,Yes,No,72,Yes,Yes,Fiber optic,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),3.79,113.65,8182.75
3990,5626-MGTUK,No,Female,No,No,No,20,Yes,Yes,Fiber optic,...,No,No,Yes,No,Month-to-month,Yes,Bank transfer (automatic),2.97,89.1,1879.25
321,0480-BIXDE,No,Female,No,Yes,No,19,Yes,No,Fiber optic,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,3.23,96.8,1743.05


In [5]:
# Print the column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        7043 non-null   str    
 1   Churn             7043 non-null   str    
 2   Gender            7043 non-null   str    
 3   SeniorCitizen     7043 non-null   str    
 4   Partner           7043 non-null   str    
 5   Dependents        7043 non-null   str    
 6   Tenure            7043 non-null   int64  
 7   PhoneService      7043 non-null   str    
 8   MultipleLines     7043 non-null   str    
 9   InternetService   7043 non-null   str    
 10  OnlineSecurity    7043 non-null   str    
 11  OnlineBackup      7043 non-null   str    
 12  DeviceProtection  7043 non-null   str    
 13  TechSupport       7043 non-null   str    
 14  StreamingTV       7043 non-null   str    
 15  StreamingMovies   7043 non-null   str    
 16  Contract          7043 non-null   str    
 17  Paperl

In [6]:
# Remove CustomerID (presumably a per-customer identifier with no predictive
# value -- confirm) before the features and target are derived from df.
# errors='ignore' keeps the cell idempotent: df is reassigned to itself, so
# without it a second run of this cell would raise KeyError because the
# column is already gone.
df = df.drop(columns=['CustomerID'], errors='ignore')

In [7]:
# Feature matrix: every column except the target.
X = df.drop(columns=['Churn'])
# Target vector: the Churn column on its own.
y = df.loc[:, 'Churn']

In [8]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Columns to one-hot encode: everything in df except the target and the four
# numeric measures listed below.
categorical_features = df.columns.drop(
    ['Churn', 'Tenure', 'ChargesDaily', 'ChargesMonthly', 'ChargesTotal']
)

# One-hot encode the categorical columns (drop='if_binary' keeps a single
# indicator column for two-level features) and pass the remaining columns
# through unchanged. sparse_threshold=0 forces a dense array as output.
one_hot = make_column_transformer(
    (OneHotEncoder(drop='if_binary'), categorical_features),
    sparse_threshold=0,
    remainder='passthrough',
)

In [10]:
# Encode the features. The input is rebuilt from df instead of reading X,
# because this cell rebinds X to the encoded ndarray: with the original
# `one_hot.fit_transform(X)` a second run of the cell would receive that
# ndarray and fail, since the transformer selects its columns by name and
# that only works on a DataFrame. The first-run result is unchanged.
X = one_hot.fit_transform(df.drop(columns=['Churn']))
X

array([[0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 2.19000e+00,
        6.56000e+01, 5.93300e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.00000e+00,
        5.99000e+01, 5.42400e+02],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 2.46000e+00,
        7.39000e+01, 2.80850e+02],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.68000e+00,
        5.03000e+01, 9.27500e+01],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, ..., 2.26000e+00,
        6.78500e+01, 4.62765e+03],
       [1.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.97000e+00,
        5.90000e+01, 3.70760e+03]], shape=(7043, 27))

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
# Turn the string target into integer class codes. LabelEncoder numbers the
# sorted class labels from 0 (presumably 'No' -> 0 and 'Yes' -> 1 here -- confirm
# against label_encoder.classes_).
label_encoder = LabelEncoder()
y = label_encoder.fit(y).transform(y)

In [13]:
# y holds integer class codes, so its mean times 100 is the percentage of
# samples in class 1 (assumed to be the churned customers -- confirm the encoding).
print(f'Existe una proporción de {y.mean()*100:.2f}% de clientes que abandonaron el servicio.')

Existe una proporción de 26.54% de clientes que abandonaron el servicio.


In [14]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [19]:
# Two candidate pipelines that differ only in the resampling step; both feed
# the same seeded decision tree so their scores can be compared directly.
# Using an imblearn pipeline means the sampler is applied only when the
# pipeline is fitted, i.e. on the training folds during cross-validation.
#
# NOTE(review): NearMiss and SMOTE both rely on nearest-neighbour distances and
# X is not scaled at this point, so large-valued columns such as ChargesTotal
# are likely to dominate those distances -- consider adding a scaler step.

# Under-sampling candidate: NearMiss version 3 followed by the tree.
pipeline_us = imbpipeline(steps=[
    ('under', NearMiss(version=3)),
    ('model', DecisionTreeClassifier(max_depth=10, random_state=42))
])

# Over-sampling candidate: SMOTE followed by the tree. SMOTE generates its
# synthetic samples randomly, so it is seeded like every other stochastic
# component here; without random_state the reported recall changes per run.
pipeline_os = imbpipeline(steps=[
    ('over', SMOTE(random_state=42)),
    ('model', DecisionTreeClassifier(max_depth=10, random_state=42))
])

In [20]:
# Shuffled, seeded 5-fold stratified splitter shared by both evaluations so
# the two pipelines are scored on exactly the same folds.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Per-fold recall for the under-sampling and the over-sampling pipeline.
cv_results_us = cross_val_score(estimator=pipeline_us, X=X, y=y, scoring='recall', cv=skf)
cv_results_os = cross_val_score(estimator=pipeline_os, X=X, y=y, scoring='recall', cv=skf)

In [21]:
# Report the recall of each resampling strategy, averaged over the CV folds.
print(f'El recall del Under Sampling es {cv_results_us.mean():.4f}')
print(f'El recall del Over Sampling es {cv_results_os.mean():.4f}')

El recall del Under Sampling es 0.6645
El recall del Over Sampling es 0.5971


In [None]:
# Balance the classes with NearMiss (version 3), the same sampler used in the
# under-sampling pipeline.
# NOTE(review): the sampler is fit on the complete X, y. If a train/test split
# is taken from X_balanceado afterwards, the test data will also be resampled
# and the evaluation will not reflect the real class balance -- confirm.
under_sample = NearMiss(version=3)
X_balanceado, y_balanceado = under_sample.fit_resample(X=X, y=y)

In [23]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every feature column of the balanced data with a min-max scaler.
# NOTE(review): scaling is applied after NearMiss, so the resampling above
# worked on unscaled features, and the scaler statistics come from all rows
# of X_balanceado -- confirm both are intended.
scaler = MinMaxScaler()
X_normalizado = scaler.fit(X_balanceado).transform(X_balanceado)