# Pré-processamento

## Importações iniciais

In [1]:
import pandas as pd 
import numpy as np


from sklearn.preprocessing import OrdinalEncoder 
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier

from category_encoders import CatBoostEncoder


In [2]:
colunas_dummies = ['Churn', 
                   'customer.Partner', 
                   'customer.Dependents',
                   'customer.gender',
                   'phone.PhoneService', 
                   'account.PaperlessBilling']

colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

colunas_dummies_mais_Um = ['phone.MultipleLines', 
                        'internet.OnlineSecurity', 
                        'internet.OnlineBackup',
                        'internet.DeviceProtection', 
                        'internet.TechSupport', 
                        'internet.StreamingTV',
                        'internet.StreamingMovies']

In [3]:
data_bruto = pd.read_csv("JsonConvertido.csv")

In [4]:
data_bruto.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,No,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,No,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,No,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,No,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,No,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


## Tratamento das variáveis

### Transformação das variáveis dummies 

o padrão costuma ser
- (yes, no) -> (1,0)
- (male, female) -> (1,0)

In [5]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,No,Yes,Yes,Female,Yes,Yes
1,No,No,No,Male,Yes,No
2,Yes,No,No,Male,Yes,Yes
3,Yes,Yes,No,Male,Yes,Yes
4,Yes,Yes,No,Female,Yes,Yes


In [6]:
ordinal_encoder = OrdinalEncoder()

In [7]:
data_bruto[colunas_dummies] = ordinal_encoder.fit_transform(data_bruto[colunas_dummies])

In [8]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,0.0,1.0,1.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0,1.0
4,1.0,1.0,0.0,0.0,1.0,1.0


### Separação em treino e teste 

Isso é feio para permitir que o modelo treine, nos dados de treino e que possamos ver sua eficiência no teste e também evitar vazamento de dados

In [9]:
id = data_bruto["customerID"]
id.sample(2)

4746    6504-VBLFL
3304    4609-KNNWG
Name: customerID, dtype: object

In [10]:
X = data_bruto.drop(["Churn", "customerID"], axis=1)
y = data_bruto["Churn"]

In [11]:
y.head()

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: Churn, dtype: float64

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Transformação das colunas com Yes, no e mais um fator

In [13]:
for col in data_bruto[colunas_dummies_mais_Um].columns:
    print(f'{col} : {data_bruto[col].unique()}')

phone.MultipleLines : ['No' 'Yes' 'No phone service']
internet.OnlineSecurity : ['No' 'Yes' 'No internet service']
internet.OnlineBackup : ['Yes' 'No' 'No internet service']
internet.DeviceProtection : ['No' 'Yes' 'No internet service']
internet.TechSupport : ['Yes' 'No' 'No internet service']
internet.StreamingTV : ['Yes' 'No' 'No internet service']
internet.StreamingMovies : ['No' 'Yes' 'No internet service']


In [14]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
4292,No,No,Yes,No,No,Yes,Yes
4794,No,No,No,No,Yes,Yes,Yes
3172,No phone service,No,Yes,No,Yes,No,No
6358,No phone service,Yes,Yes,Yes,Yes,No,No
1956,No,No,No,No,No,Yes,No


In [15]:
for coluna in colunas_dummies_mais_Um:
    X_train[coluna] = X_train[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)
    X_test[coluna] = X_test[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)


In [16]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
4292,0,0,1,0,0,1,1
4794,0,0,0,0,1,1,1
3172,0,0,1,0,1,0,0
6358,0,1,1,1,1,0,0
1956,0,0,0,0,0,1,0


### Tratando as colunas com múltiplos valores

In [17]:
colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

In [18]:
for col in data_bruto[colunas_multiclasses].columns:
    print(f'{col} : {data_bruto[col].unique()}')
    print()

internet.InternetService : ['DSL' 'Fiber optic' 'No']

account.Contract : ['One year' 'Month-to-month' 'Two year']

account.PaymentMethod : ['Mailed check' 'Electronic check' 'Credit card (automatic)'
 'Bank transfer (automatic)']



In [19]:
catboost_encoder = CatBoostEncoder()

Primeiramente eu irei lidar con o "No", do "internet.InternetService" e depois utilizarei o algoritmo catboost encoder para poder lidar com as variáveis categóricas nominais

In [20]:
X_train["internet.InternetService"].tail()

3772             No
5191            DSL
5226    Fiber optic
5390            DSL
860     Fiber optic
Name: internet.InternetService, dtype: object

In [21]:
X_train["internet.InternetService"] = X_train["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)
X_test["internet.InternetService"] = X_test["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)

In [22]:
catboost_encoder.fit(X_train[colunas_multiclasses], y_train)

X_train[colunas_multiclasses] = catboost_encoder.transform(X_train[colunas_multiclasses])
X_test[colunas_multiclasses] = catboost_encoder.transform(X_test[colunas_multiclasses])

In [23]:
X_train[colunas_multiclasses].sample(5)

Unnamed: 0,internet.InternetService,account.Contract,account.PaymentMethod
4106,0.414485,0.425269,0.446624
3991,0.414485,0.425269,0.168099
5578,0.192939,0.425269,0.201444
2768,0.414485,0.425269,0.201444
6623,0.079166,0.425269,0.446624


In [24]:
X_train.head()

Unnamed: 0,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
4292,1.0,0,0.0,0.0,31,1.0,0,0.414485,0,1,0,0,1,1,0.425269,1.0,0.446624,94.5,2979.2
4794,0.0,0,1.0,0.0,45,1.0,0,0.414485,0,0,0,1,1,1,0.028429,1.0,0.148522,95.2,4285.8
3172,1.0,0,0.0,0.0,3,0.0,0,0.192939,0,1,0,1,0,0,0.425269,1.0,0.201444,36.85,108.7
6358,1.0,0,0.0,0.0,5,0.0,0,0.192939,1,1,1,1,0,0,0.425269,1.0,0.201444,43.25,219.0
1956,0.0,0,0.0,0.0,1,1.0,0,0.414485,0,0,0,0,1,0,0.425269,0.0,0.446624,78.05,78.05


## Modedagem

Modelos a seram usados -> LighGBM, XGBoost, SVM, catboost

In [2]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier