# Pré-processamento

## Importações iniciais

In [133]:
import pandas as pd 
import numpy as np


from sklearn.preprocessing import OrdinalEncoder 
from sklearn.model_selection import train_test_split

from category_encoders import CatBoostEncoder


In [134]:
colunas_dummies = ['Churn', 
                   'customer.Partner', 
                   'customer.Dependents',
                   'customer.gender',
                   'phone.PhoneService', 
                   'account.PaperlessBilling']

colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

colunas_dummies_mais_Um = ['phone.MultipleLines', 
                        'internet.OnlineSecurity', 
                        'internet.OnlineBackup',
                        'internet.DeviceProtection', 
                        'internet.TechSupport', 
                        'internet.StreamingTV',
                        'internet.StreamingMovies']

In [135]:
data_bruto = pd.read_csv("JsonConvertido.csv")

In [136]:
data_bruto.head()

Unnamed: 0,customerID,Churn,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,...,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
0,0002-ORFBO,No,Female,0,Yes,Yes,9,Yes,No,DSL,...,Yes,No,Yes,Yes,No,One year,Yes,Mailed check,65.6,593.3
1,0003-MKNFE,No,Male,0,No,No,9,Yes,Yes,DSL,...,No,No,No,No,Yes,Month-to-month,No,Mailed check,59.9,542.4
2,0004-TLHLJ,Yes,Male,0,No,No,4,Yes,No,Fiber optic,...,No,Yes,No,No,No,Month-to-month,Yes,Electronic check,73.9,280.85
3,0011-IGKFF,Yes,Male,1,Yes,No,13,Yes,No,Fiber optic,...,Yes,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,98.0,1237.85
4,0013-EXCHZ,Yes,Female,1,Yes,No,3,Yes,No,Fiber optic,...,No,No,Yes,Yes,No,Month-to-month,Yes,Mailed check,83.9,267.4


## Tratamento das variáveis

### Tratamento igual feito na semana 1

In [137]:
# Retirando os valores nulos no churn
data_bruto = data_bruto.dropna()

In [138]:
# mudando o type do 'account.Charges.Total'
data_bruto['account.Charges.Total'] = pd.to_numeric(data_bruto['account.Charges.Total'],
                                               errors='coerce')

### Transformação das variáveis dummies 

o padrão costuma ser
- (yes, no) -> (1,0)
- (male, female) -> (1,0)

In [139]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,No,Yes,Yes,Female,Yes,Yes
1,No,No,No,Male,Yes,No
2,Yes,No,No,Male,Yes,Yes
3,Yes,Yes,No,Male,Yes,Yes
4,Yes,Yes,No,Female,Yes,Yes


In [140]:
ordinal_encoder = OrdinalEncoder()

In [141]:
data_bruto[colunas_dummies] = ordinal_encoder.fit_transform(data_bruto[colunas_dummies])

In [142]:
data_bruto[colunas_dummies].head()

Unnamed: 0,Churn,customer.Partner,customer.Dependents,customer.gender,phone.PhoneService,account.PaperlessBilling
0,0.0,1.0,1.0,0.0,1.0,1.0
1,0.0,0.0,0.0,1.0,1.0,0.0
2,1.0,0.0,0.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0,1.0,1.0
4,1.0,1.0,0.0,0.0,1.0,1.0


### Separação em treino e teste 

Isso é feio para permitir que o modelo treine, nos dados de treino e que possamos ver sua eficiência no teste e também evitar vazamento de dados

In [143]:
id = data_bruto["customerID"]
id.sample(2)

2519    3519-ZKXGG
868     1226-JZNKR
Name: customerID, dtype: object

In [144]:
X = data_bruto.drop(["Churn", "customerID"], axis=1)
y = data_bruto["Churn"]

In [145]:
y.head()

0    0.0
1    0.0
2    1.0
3    1.0
4    1.0
Name: Churn, dtype: float64

In [146]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Tratamento dos missing do account.Charges.Total

In [147]:
one_year_median_X_train = X_train.loc[X_train["account.Contract"]== "One year"]
one_year_median_X_train = one_year_median_X_train["customer.tenure"].median()

two_year_median_X_train = X_train.loc[X_train["account.Contract"]== "Two year"]
two_year_median_X_train = two_year_median_X_train["customer.tenure"].median()

In [148]:
one_year_median_X_test = X_test.loc[X_test["account.Contract"] == "One year"]
one_year_median_X_test = one_year_median_X_test["customer.tenure"].median()

two_year_median_X_test = X_test.loc[X_test["account.Contract"]== "Two year"]
two_year_median_X_test = two_year_median_X_test["customer.tenure"].median()

In [149]:

indice_dados_faltantes_X_train = X_train[X_train['account.Charges.Total'].isnull()].index
indice_dados_faltantes_X_test = X_test[X_test["account.Charges.Monthly"].isnull()].index

def altera_charges_total(row,one_year_median,two_year_median ):

    if row["account.Contract"] == 'One year':
        row["account.Charges.Total"] = row["account.Charges.Monthly"] * one_year_median
        row["customer.tenure"] = one_year_median

    else:  
        row["account.Charges.Total"] = row["account.Charges.Monthly"] * two_year_median
        row["customer.tenure"] = two_year_median
    return row

X_train.loc[indice_dados_faltantes_X_train] = X_train.loc[indice_dados_faltantes_X_train].apply(
    lambda row: altera_charges_total(row, one_year_median_X_train, two_year_median_X_train), axis=1)

X_test.loc[indice_dados_faltantes_X_test] = X_test.loc[indice_dados_faltantes_X_test].apply(
    lambda row: altera_charges_total(row, one_year_median_X_test, two_year_median_X_test), axis=1)

### Transformação das colunas com Yes, no e mais um fator

In [150]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
2197,No,Yes,Yes,No,No,Yes,No
1665,Yes,No,No,No,No,No,Yes
6255,Yes,No,No,No,No,Yes,Yes
1402,No,No,No,No,No,No,Yes
6962,No,No,No,No,No,No,No


In [151]:
for coluna in colunas_dummies_mais_Um:
    X_train[coluna] = X_train[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)
    X_test[coluna] = X_test[coluna].apply(lambda x: 1 if x == "Yes" or x==1 else 0)


In [152]:
X_train[colunas_dummies_mais_Um].head()

Unnamed: 0,phone.MultipleLines,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies
2197,0,1,1,0,0,1,0
1665,1,0,0,0,0,0,1
6255,1,0,0,0,0,1,1
1402,0,0,0,0,0,0,1
6962,0,0,0,0,0,0,0


### Tratando as colunas com múltiplos valores

In [153]:
colunas_multiclasses = ['internet.InternetService', 
                        'account.Contract',
                        'account.PaymentMethod']

In [154]:
for col in data_bruto[colunas_multiclasses].columns:
    print(f'{col} : {data_bruto[col].unique()}')
    print()

internet.InternetService : ['DSL' 'Fiber optic' 'No']

account.Contract : ['One year' 'Month-to-month' 'Two year']

account.PaymentMethod : ['Mailed check' 'Electronic check' 'Credit card (automatic)'
 'Bank transfer (automatic)']



In [155]:
catboost_encoder = CatBoostEncoder()

Primeiramente eu irei lidar con o "No", do "internet.InternetService" e depois utilizarei o algoritmo catboost encoder para poder lidar com as variáveis categóricas nominais

In [156]:
X_train["internet.InternetService"].tail()

3880    No
5348    No
5385    No
5554    No
886     No
Name: internet.InternetService, dtype: object

In [157]:
X_train["internet.InternetService"] = X_train["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)
X_test["internet.InternetService"] = X_test["internet.InternetService"].apply(lambda x: 0 if x=="No" else x)

In [158]:
catboost_encoder.fit(X_train[colunas_multiclasses], y_train)

X_train[colunas_multiclasses] = catboost_encoder.transform(X_train[colunas_multiclasses])
X_test[colunas_multiclasses] = catboost_encoder.transform(X_test[colunas_multiclasses])

In [159]:
X_train[colunas_multiclasses].sample(5)

Unnamed: 0,internet.InternetService,account.Contract,account.PaymentMethod
5978,0.192108,0.025931,0.145116
1323,0.416606,0.430318,0.460045
5542,0.416606,0.430318,0.145116
6426,0.192108,0.109454,0.166474
7088,0.192108,0.430318,0.191796


In [160]:
X_train.head()

Unnamed: 0,customer.gender,customer.SeniorCitizen,customer.Partner,customer.Dependents,customer.tenure,phone.PhoneService,phone.MultipleLines,internet.InternetService,internet.OnlineSecurity,internet.OnlineBackup,internet.DeviceProtection,internet.TechSupport,internet.StreamingTV,internet.StreamingMovies,account.Contract,account.PaperlessBilling,account.PaymentMethod,account.Charges.Monthly,account.Charges.Total
2197,1.0,0,1.0,1.0,3,1.0,0,0.416606,1,1,0,0,1,0,0.430318,0.0,0.460045,89.85,248.4
1665,1.0,0,0.0,0.0,36,1.0,1,0.416606,0,0,0,0,0,1,0.430318,1.0,0.460045,84.75,3050.15
6255,0.0,0,1.0,1.0,49,1.0,1,0.416606,0,0,0,0,1,1,0.430318,1.0,0.460045,95.6,4783.5
1402,0.0,0,0.0,0.0,7,1.0,0,0.416606,0,0,0,0,0,1,0.430318,1.0,0.460045,79.3,523.15
6962,0.0,0,1.0,0.0,2,1.0,0,0.192108,0,0,0,0,0,0,0.430318,1.0,0.460045,45.0,89.75


## Modedagem

Modelos a seram usados -> LighGBM, XGBoost, SVM, catboost

In [161]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import cross_val_score

In [162]:
random_state = 42

models = [('Regressão Logística', LogisticRegression(random_state = random_state, max_iter=1000)),
            ('SVM', SVC(random_state = random_state)),
            ('XGBoost', XGBClassifier(random_state = random_state)),
            ('LightGBM', LGBMClassifier(random_state = random_state, force_row_wise=True, verbose = 0)),
            ('CatBoost', CatBoostClassifier(random_state = random_state, verbose=0))
            ]

Fazendo a validação de diversos modelos utilizando a validação cruzada

In [163]:
def validation(model, X, y):

  f1_cv = cross_val_score(model, X, y, scoring='f1').mean()
  roc_cv = cross_val_score(model, X, y, scoring='roc_auc').mean()

  return f1_cv, roc_cv

In [164]:
random_state = 42


for model in models:
   nome_modelo = model[0]
   modelo = model[1]

   f1_cv, roc_cv = validation(modelo, X_train, y_train)

   print(nome_modelo)
   print()

   print(f'F1-score {f1_cv:.5f}')
   print(f'Curva Roc {roc_cv:.5f}')
   print()


Regressão Logística

F1-score 0.58399
Curva Roc 0.84243



KeyboardInterrupt: 

## Salvando o melhor modelo

In [None]:
model_logistic_regression = models[0][1]

In [None]:
import pickle 

salva =  True

if salva:
    with open("best_model.pkl","wb") as melhor_modelo:
        pickle.dump(model_logistic_regression, melhor_modelo )