# Pré-processamento

## Importações iniciais

In [83]:
import pandas as pd 
import numpy as np


from sklearn.preprocessing import OrdinalEncoder 
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

from category_encoders import CatBoostEncoder


In [84]:
colunas_dummies = ['Churn', 
                   'customer.Partner', 
                   'customer.Dependents',
                   'customer.gender',
                   'phone.PhoneService', 
                   'account.PaperlessBilling']

colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

colunas_dummies_mais_Um = ['phone.MultipleLines', 
                        'internet.OnlineSecurity', 
                        'internet.OnlineBackup',
                        'internet.DeviceProtection', 
                        'internet.TechSupport', 
                        'internet.StreamingTV',
                        'internet.StreamingMovies']

In [85]:
data_bruto = pd.read_csv("JsonConvertido.csv")

In [86]:
data_bruto.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


### Transformação das variáveis dummies 

o padrão costuma ser
- (yes, no) -> (1,0)
- (male, female) -> (1,0)

In [87]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,No,Yes,Yes,Female,Yes,Yes
1,No,No,No,Male,Yes,No
2,Yes,No,No,Male,Yes,Yes
3,Yes,Yes,No,Male,Yes,Yes
4,Yes,Yes,No,Female,Yes,Yes


In [88]:
ordinal_encoder = OrdinalEncoder()

In [89]:
data_bruto[colunas_dummies] = ordinal_encoder.fit_transform(data_bruto[colunas_dummies])

In [90]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,0.0,1.0,1.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0,1.0
4,1.0,1.0,0.0,0.0,1.0,1.0


### Separação em treino e teste 

Isso é feio para permitir que o modelo treine, nos dados de treino e que possamos ver sua eficiência no teste e também evitar vazamento de dados

In [91]:
id = data_bruto["customerID"]
id.sample(2)

389    0562-HKHML
340    0489-WMEMG
Name: customerID, dtype: object

In [92]:
X = data_bruto.drop(["Churn", "customerID"], axis=1)
y = data_bruto["Churn"]

In [93]:
y.head()

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: Churn, dtype: float64

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Transformação das colunas com Yes, no e mais um fator

In [95]:
for col in data_bruto[colunas_dummies_mais_Um].columns:
    print(f'{col} : {data_bruto[col].unique()}')

phone.MultipleLines : ['No' 'Yes' 'No phone service']
internet.OnlineSecurity : ['No' 'Yes' 'No internet service']
internet.OnlineBackup : ['Yes' 'No' 'No internet service']
internet.DeviceProtection : ['No' 'Yes' 'No internet service']
internet.TechSupport : ['Yes' 'No' 'No internet service']
internet.StreamingTV : ['Yes' 'No' 'No internet service']
internet.StreamingMovies : ['No' 'Yes' 'No internet service']


In [96]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
4292,No,No,Yes,No,No,Yes,Yes
4794,No,No,No,No,Yes,Yes,Yes
3172,No phone service,No,Yes,No,Yes,No,No
6358,No phone service,Yes,Yes,Yes,Yes,No,No
1956,No,No,No,No,No,Yes,No


In [112]:
for coluna in colunas_dummies_mais_Um:
    X_train[coluna] = X_train[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)
    X_test[coluna] = X_test[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)


In [113]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
4292,0,0,1,0,0,1,1
4794,0,0,0,0,1,1,1
3172,0,0,1,0,1,0,0
6358,0,1,1,1,1,0,0
1956,0,0,0,0,0,1,0


### Tratando as colunas com múltiplos valores

In [115]:
colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

In [119]:
for col in data_bruto[colunas_multiclasses].columns:
    print(f'{col} : {data_bruto[col].unique()}')
    print()

internet.InternetService : ['DSL' 'Fiber optic' 'No']

account.Contract : ['One year' 'Month-to-month' 'Two year']

account.PaymentMethod : ['Mailed check' 'Electronic check' 'Credit card (automatic)'
 'Bank transfer (automatic)']



In [120]:
catboost_encoder = CatBoostEncoder()

Primeiramente eu irei lidar con o "No", do "internet.InternetService" e depois utilizarei o algoritmo catboost encoder para poder lidar com as variáveis categóricas nominais

In [129]:
X_train["internet.InternetService"].tail()

3772             No
5191            DSL
5226    Fiber optic
5390            DSL
860     Fiber optic
Name: internet.InternetService, dtype: object

In [130]:
X_train["internet.InternetService"] = X_train["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)
X_test["internet.InternetService"] = X_test["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)

In [134]:
catboost_encoder.fit(X_train[colunas_multiclasses], y_train)

X_train[colunas_multiclasses] = catboost_encoder.transform(X_train[colunas_multiclasses])
X_test[colunas_multiclasses] = catboost_encoder.transform(X_test[colunas_multiclasses])

In [166]:
X_train[colunas_multiclasses].sample(5)

Unnamed: 0,internet.InternetService,account.Contract,account.PaymentMethod
5131,0.192939,0.425269,0.168099
4174,0.414485,0.113048,0.148522
5413,0.414485,0.425269,0.446624
5326,0.192939,0.425269,0.446624
1960,0.414485,0.425269,0.446624
