In [4]:
import pandas as pd
data = pd.read_csv('dataset_customer_churn.csv', sep='^')
data.drop(labels=['A006_REGISTRO_ANS', 'CODIGO_BENEFICIARIO', 'CLIENTE', 'CD_USUARIO', 'CODIGO_FORMA_PGTO_MENSALIDADE', 'A006_NM_PLANO', 'DIAS_ATE_REALIZAR_ALTO_CUSTO', 'CD_ASSOCIADO', 'ESTADO_CIVIL'], axis=1, inplace=True)
is_NAN = data[data.isna().any(axis=1)]
data.drop(is_NAN.index, axis=0, inplace=True)
data.sample(10)

Unnamed: 0,NUM_BENEFICIARIOS_FAMILIA,SITUACAO,IDADE,SEXO,QTDE_DIAS_ATIVO,QTDE_ATENDIMENTOS,QTDE_ATO_COBERTO_EXECUTADO,QTDE_ATO_N_COBERTO_EXECUTADO,REALIZOU_ENDODONTIA_COBERTA,REALIZOU_EXODONTIA_COBERTA,REALIZOU_PROCEDIMEN_ALTO_CUSTO,FORMA_PGTO_MENSALIDADE,PLANO
17116,2.0,DESATIVADO,43.0,F,1085.0,0.0,0.0,0.0,NAO,NAO,NAO,debito automatico,703454997
191695,1.0,DESATIVADO,40.0,F,461.0,4.0,18.0,0.0,NAO,SIM,NAO,boleto,476697161
123950,1.0,ATIVO,74.0,F,1790.0,0.0,0.0,0.0,NAO,NAO,NAO,boleto,475840164
213056,4.0,DESATIVADO,57.0,F,232.0,1.0,5.0,0.0,NAO,NAO,NAO,boleto,OUTROS
239959,6.0,ATIVO,34.0,F,564.0,10.0,14.0,0.0,NAO,SIM,NAO,boleto digital,475441167
359840,3.0,ATIVO,45.0,F,295.0,3.0,7.0,0.0,NAO,SIM,NAO,cartao de credito,481484183
91464,3.0,DESATIVADO,41.0,F,932.0,0.0,0.0,0.0,NAO,NAO,NAO,boleto,475840164
316968,1.0,ATIVO,19.0,F,70.0,0.0,0.0,0.0,NAO,NAO,NAO,cartao de credito,475441167
303455,5.0,ATIVO,54.0,M,173.0,0.0,0.0,0.0,NAO,NAO,NAO,cartao de credito,475441167
264354,1.0,ATIVO,36.0,M,161.0,1.0,5.0,0.0,NAO,NAO,NAO,boleto digital,475441167


In [5]:
dict_replace = {
    "SIM": 1,
    "NAO": 0,
    'F': 0,
    'M': 1,
    'DESATIVADO': 1,
    'ATIVO': 0,

}
data.replace(dict_replace, inplace=True)
data.drop(labels=182212, axis=0, inplace=True)

In [7]:
data_dummified = pd.get_dummies(data)

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

x = data_dummified.drop(labels=['SITUACAO'], axis=1)
y = data_dummified['SITUACAO']
np.random.seed(1010)
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=y)

In [10]:
from sklearn.ensemble import RandomForestClassifier
random_forest_classifier = RandomForestClassifier(10)
random_forest_classifier.fit(train_x,train_y)
predicts_rfc = random_forest_classifier.predict(test_x)

In [13]:
from sklearn.metrics import f1_score,recall_score,accuracy_score,precision_score
def all_scores(predicts, test_y):
    print(f"Accuracy Score: {accuracy_score(test_y,predicts):.3f}\nRecall Score: {recall_score(test_y,predicts):.3f}\nPrecision Score: {precision_score(test_y,predicts):.3f}\nF1 Score: {f1_score(test_y,predicts):.3f}\n")
def scores(validation_results):
    media = validation_results['test_score'].mean()
    desvio_padrao = validation_results['test_score'].std()
    print("Accuracy médio {:.2f}".format(media))
    print("Intervalo [{:.2f}, {:.2f}]".format((media - 2 * desvio_padrao), (media + 2 * desvio_padrao)))


In [7]:
all_scores(predicts_rfc, test_y)

Accuracy Score: 0.905
Recall Score: 0.897
Precision Score: 0.927
F1 Score: 0.912



In [None]:
from sklearn.model_selection import cross_validate, KFold

validation_results = cross_validate(random_forest_classifier,x,y,cv=KFold(n_splits=10, shuffle=True))
scores(validation_results)
validation_results_df = pd.DataFrame(validation_results)
validation_results_df

In [11]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV, KFold
tt_x, val_x, tt_y, val_y = train_test_split(x, y, test_size=0.2, stratify=y)
### hiper-parametrização (?)
parametros_para_RSCV = {
    "max_depth" : randint(3, 100),
    "min_samples_split": randint(2, 16),
    "min_samples_leaf": randint(1, 16),
    "bootstrap" : [True, False],
    "criterion": ["gini", "entropy"]
}

RSCross_validation = RandomizedSearchCV(RandomForestClassifier(n_estimators=10), parametros_para_RSCV, n_iter= 16, cv = KFold(n_splits = 5, shuffle=True))
RSCross_validation.fit(tt_x,tt_y)
resultados = pd.DataFrame(RSCross_validation.cv_results_)

In [14]:
best_random_forest = RSCross_validation.best_estimator_
predicts_rfc = best_random_forest.predict(val_x)
all_scores(predicts_rfc, val_y)

Accuracy Score: 0.905
Recall Score: 0.907
Precision Score: 0.918
F1 Score: 0.913



In [17]:
print(best_random_forest)

RandomForestClassifier(bootstrap=False, max_depth=86, min_samples_leaf=2,
                       min_samples_split=10, n_estimators=10)


In [None]:
feature_importances = random_forest_classifier.feature_importances_
feature_importances

In [None]:
from matplotlib import pyplot as plt

plt.pie(feature_importances)

In [None]:
from sklearn.inspection import permutation_importance
permutation_importance = permutation_importance(best_random_forest, x, y)

In [None]:
permutation_importance_df = pd.Series(permutation_importance.importances_mean, index=pd.Series([col for col in x.columns]))
permutation_importance_df*100

In [None]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (10, 8)
plt.pie(permutation_importance_df)
plt.xlabel("Permutation Importance")

In [18]:
def scores(model):
    model_predicts = model.predict(test_x)
    return accuracy_score(test_y, model_predicts), recall_score(test_y, model_predicts),precision_score(test_y, model_predicts), f1_score(test_y, model_predicts)

In [23]:

model_scores = {scores(best_random_forest), scores(random_forest_classifier)}
df_model_scores = pd.DataFrame(model_scores, columns=['Acurracy Score', 'Recall Score', 'Precision Score', 'F1 Score'], index=['BRF', 'RFC'])
df_model_scores

Unnamed: 0,Acurracy Score,Recall Score,Precision Score,F1 Score
BRF,0.947605,0.947002,0.956427,0.951691
RFC,0.90234,0.893625,0.924647,0.908871
