# Objetivo

Rodar o modelo final na base de treino + validação e depois aplicação na base de teste;

Avaliar se a distribuição das probabilidades finais na base de treino + validação se mantém na base de teste;

Avaliar a distribuição das observações nas categorias das variáveis na base de treino + validação e em teste (PSI);

Avaliar a curva de Perfil de Eficiência Acumulada (CAP) na base de tese.

# Pacotes

In [65]:
from deltalake import DeltaTable, write_deltalake
import pandas as pd
import mlflow
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, average_precision_score, brier_score_loss, confusion_matrix, ConfusionMatrixDisplay, log_loss

# Leitura das bases

In [2]:
dados = DeltaTable("../1.Variaveis/tmp/dados_pp_v1").to_pandas()
dados.drop(['__index_level_0__', 'Card_Category_Gold', 'Card_Category_Platinum', 'Card_Category_Silver', 
            'Marital_Status_Married', 'Marital_Status_Single', 'Marital_Status_Unknown', 'Gender_M',
            'Dependent_count'], axis=1, inplace=True)
dados.head()

Unnamed: 0,Customer_Age,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Ct_Chng_Q4_Q1,vfm,pmcc,Income_Category_1.< 40k,Income_Category_2. >= 40k & < 60k,Income_Category_3. >= 60k & < 80k,Income_Category_4. >= 80k & < 120k,Income_Category_5. >= 120k,Education_Level_v2_1.Uneducated,Education_Level_v2_2.High School,Education_Level_v2_3.Graduate,Education_Level_v2_4.Post-Graduate,Attrition_Flag,type
0,-0.784196,1.403132,-1.337898,0.498943,0.963894,0.282975,-0.328225,-0.175537,-0.42145,1,0,0,0,0,0,0,1,0,0,Treino
1,0.72007,-0.525933,0.641818,1.408428,-0.165769,-1.527806,-0.194304,-0.208685,-1.054789,0,0,0,1,0,0,0,1,0,0,Treino
2,1.346848,-0.525933,-0.34804,0.498943,0.864865,0.894171,0.056797,-0.571459,-0.686436,0,0,0,1,0,0,0,0,1,0,Treino
3,0.218648,-0.525933,0.641818,-1.320028,-0.412731,0.369637,0.851953,0.252749,2.406712,1,0,0,0,0,0,0,1,0,0,Treino
4,-2.539173,0.117089,1.631675,1.408428,-0.858972,0.346832,-1.144306,-0.064053,-0.071911,1,0,0,0,0,1,0,0,0,0,Treino


## Separa bases

In [3]:
dados_treino_val = dados[dados.type != 'Teste'].drop(['type'], axis=1)
dados_teste = dados[dados.type == 'Teste'].drop(['type'], axis=1)

In [4]:
X_treino_val = dados_treino_val.drop(['Attrition_Flag'], axis=1)
y_treino_val = dados_treino_val['Attrition_Flag']

X_teste = dados_teste.drop(['Attrition_Flag'], axis=1)
y_teste = dados_teste['Attrition_Flag']

In [5]:
X_treino_val_new = X_treino_val.rename(columns={'Income_Category_1.< 40k': 'Income_Category_1.40k',
                                        'Income_Category_2. >= 40k & < 60k': 'Income_Category_2.40k_60k',
                                        'Income_Category_3. >= 60k & < 80k': 'Income_Category_3.60k_80k',
                                        'Income_Category_4. >= 80k & < 120k': 'Income_Category_4.80k_120k',
                                        'Income_Category_5. >= 120k': 'Income_Category_5.120k'
                                        })

X_teste_new = X_teste.rename(columns={'Income_Category_1.< 40k': 'Income_Category_1.40k',
                                        'Income_Category_2. >= 40k & < 60k': 'Income_Category_2.40k_60k',
                                        'Income_Category_3. >= 60k & < 80k': 'Income_Category_3.60k_80k',
                                        'Income_Category_4. >= 80k & < 120k': 'Income_Category_4.80k_120k',
                                        'Income_Category_5. >= 120k': 'Income_Category_5.120k'
                                        })

# RF

In [18]:
mlflow_client = mlflow.tracking.MlflowClient(tracking_uri='http://127.0.0.1:5000')
parametros = mlflow_client.get_run('c189498258a44951985fd8b97a019cf2').data.to_dictionary()['params']

In [38]:
parametros

{'verbose': '0',
 'min_weight_fraction_leaf': '0.0',
 'n_estimators': '437',
 'min_samples_leaf': '39',
 'min_impurity_decrease': '0.0',
 'max_samples': 'None',
 'ccp_alpha': '0.0',
 'max_features': 'sqrt',
 'oob_score': 'False',
 'warm_start': 'False',
 'max_leaf_nodes': 'None',
 'max_depth': '248',
 'min_samples_split': '233',
 'n_jobs': 'None',
 'criterion': 'gini',
 'monotonic_cst': 'None',
 'random_state': 'None',
 'bootstrap': 'True',
 'class_weight': 'None'}

In [53]:
clf = RandomForestClassifier(n_estimators=437,
                             min_samples_leaf=39,
                             verbose=0,
                             min_weight_fraction_leaf=0,
                             min_impurity_decrease=0,
                             max_samples=None,
                             ccp_alpha=0,
                             max_features='sqrt',
                             oob_score=False,
                             warm_start=False,
                             max_leaf_nodes=None,
                             max_depth=248,
                             min_samples_split=233,
                             n_jobs=None,
                             criterion='gini',
                             monotonic_cst=None,
                             random_state=None,
                             bootstrap=True,
                             class_weight=None) 
clf.fit(X_treino_val_new, y_treino_val)

In [54]:
experiment = mlflow.create_experiment(name = 'Modelos_Finais',
                                      artifact_location = 'Modelos_Finais',
                                      tags = {'Environment': 'Pre_Deploy', 'Version': '1.0.0'}
                                      )

In [55]:
if __name__ == '__main__':
    
    with mlflow.start_run(run_name = 'RF_Final', experiment_id = experiment) as run: 

        # Log do schema das variáveis do modelo e do modelo
        signature = infer_signature(X_teste_new, clf.predict_proba(X_treino_val_new))
        mlflow.sklearn.log_model(clf, signature=signature, artifact_path='modelo')



In [64]:
print('Auc_Pr_Teste:', average_precision_score(y_teste, clf.predict_proba(X_teste_new)[:,1]))
print('Auc_Roc_Teste:', roc_auc_score(y_teste, clf.predict_proba(X_teste_new)[:,1]))
print('BS_Teste:', brier_score_loss(y_teste, clf.predict_proba(X_teste_new)[:,1]))
print('Log_Loss_Teste:', log_loss(y_teste, clf.predict_proba(X_teste_new)[:,1]))

print('-------------------------------Sem ponto de corte ótimo-------------------------------')

# Sem ponto de corte ótimo
print('F1_Score_Teste:', f1_score(y_teste, clf.predict(X_teste_new)))
print('Precisao_Teste:', precision_score(y_teste, clf.predict(X_teste_new)))
print('Recall_Teste:', recall_score(y_teste, clf.predict(X_teste_new)))

print('-------------------------------Com ponto de corte ótimo-------------------------------')

# Sem ponto de corte ótimo
print('F1_Score_Teste:', f1_score(y_teste, clf.predict_proba(X_teste_new)[:,1] >= 0.313385883698052))
print('Precisao_Teste:', precision_score(y_teste, clf.predict_proba(X_teste_new)[:,1] >= 0.313385883698052))
print('Recall_Teste:', recall_score(y_teste, clf.predict_proba(X_teste_new)[:,1] >= 0.313385883698052))

Auc_Pr_Teste: 0.7696093805727798
Auc_Roc_Teste: 0.9287283519194204
BS_Teste: 0.07602863769574533
Log_Loss_Teste: 0.25939123756764576
-------------------------------Sem ponto de corte ótimo-------------------------------
F1_Score_Teste: 0.5397727272727273
Precisao_Teste: 0.8796296296296297
Recall_Teste: 0.38934426229508196
-------------------------------Com ponto de corte ótimo-------------------------------
F1_Score_Teste: 0.7236580516898609
Precisao_Teste: 0.7027027027027027
Recall_Teste: 0.7459016393442623


In [88]:
confusion_matrix = confusion_matrix(y_teste, clf.predict_proba(X_teste_new)[:,1] >= 0.313385883698052)
#cm_display = ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [0, 1])

#cm_display.plot()
#plt.show()

TypeError: 'numpy.ndarray' object is not callable