In [11]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
from scipy.stats import randint
import multiprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# Bayesian
# ==============================================================================
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
from sklearn.metrics import classification_report
import xgboost as xgb
from collections import Counter
import time
import optuna
from optuna.samplers import TPESampler
from sklearn.datasets import make_classification

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
tabla_final = pd.read_csv('Tabla_final_colab.csv', sep = ",")


## Se borran columnas irrelevantes ##
tabla_final = tabla_final.drop('FECHA_DATO', axis=1)
tabla_final = tabla_final.drop('FECHA_PROCESO', axis=1)
tabla_final = tabla_final.drop('ID_PROD', axis=1)
tabla_final = tabla_final.drop('FLAG_PREAP', axis=1)

## Definimos una tabla con los id de clientes
## Para luego poeder funtarla con las prob ##
## Si se queda se considera como parametro ##
tabla_con_id_cliente = tabla_final

df = tabla_final.drop('ID_CLIENTE', axis=1)
#visualizamos los datos
df

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO
0,1.0,0.0,0.0,2.0,6.0,87218.10,36.0,3.0,1.0,0.0
1,1.0,0.0,0.0,1.0,35.0,122179.11,23.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,35.0,119775.54,23.0,1.0,1.0,0.0
3,1.0,0.0,0.0,1.0,35.0,22220.04,24.0,1.0,1.0,0.0
4,1.0,0.0,0.0,1.0,35.0,295590.36,24.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
602555,0.0,0.0,0.0,2.0,62.0,97397.16,41.0,6.0,1.0,0.0
602556,0.0,0.0,0.0,2.0,7.0,168445.62,34.0,2.0,0.0,0.0
602557,0.0,0.0,0.0,1.0,6.0,53689.02,24.0,1.0,1.0,0.0
602558,0.0,0.0,0.0,2.0,229.0,64404.21,61.0,7.0,0.0,0.0


In [13]:
from sklearn.model_selection import train_test_split
X = df[['ID_SEGMENTO_VALOR', 'EDAD', 'ID_GENERO', 'RENTA', 'DEBITO_DIR', 'ID_EMPLEADO', 'ANTIGUEDAD', 'EDAD_PUNTAJE']]
y = df['ACEPTADO']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)

In [14]:
def objective(trial):
    # Define the search space
    
    c_space=trial.suggest_loguniform('C',10e-1, 100)
    l1_rat_space=trial.suggest_uniform('l1_ratio',0.1, 1)
    

    clf =LogisticRegression(max_iter = 4000,penalty='l2',
                            random_state=2,C=c_space,l1_ratio=l1_rat_space)
    
    score = cross_val_score(clf, X_train, y_train, scoring='accuracy',
                            cv=4).mean()
    return score

study_rl = optuna.create_study(study_name="rlog_artif",
                            direction="maximize",
                              pruner=optuna.pruners.HyperbandPruner(max_resource="auto"),
                            sampler=TPESampler())
study_rl.optimize(objective, n_trials=50)

[I 2023-06-19 18:49:37,053] A new study created in memory with name: rlog_artif
  c_space=trial.suggest_loguniform('C',10e-1, 100)
  l1_rat_space=trial.suggest_uniform('l1_ratio',0.1, 1)
[I 2023-06-19 18:49:40,268] Trial 0 finished with value: 0.9211364843335104 and parameters: {'C': 13.152923535284875, 'l1_ratio': 0.5476981453244353}. Best is trial 0 with value: 0.9211364843335104.
  c_space=trial.suggest_loguniform('C',10e-1, 100)
  l1_rat_space=trial.suggest_uniform('l1_ratio',0.1, 1)
[I 2023-06-19 18:49:43,451] Trial 1 finished with value: 0.9211364843335104 and parameters: {'C': 7.288151816620559, 'l1_ratio': 0.9187239565690498}. Best is trial 0 with value: 0.9211364843335104.
  c_space=trial.suggest_loguniform('C',10e-1, 100)
  l1_rat_space=trial.suggest_uniform('l1_ratio',0.1, 1)
[I 2023-06-19 18:49:46,597] Trial 2 finished with value: 0.9211364843335104 and parameters: {'C': 49.121686391859, 'l1_ratio': 0.6233025496578846}. Best is trial 0 with value: 0.9211364843335104.
  c_sp

In [15]:
print(study_rl.best_params)

{'C': 13.152923535284875, 'l1_ratio': 0.5476981453244353}


In [16]:
from sklearn.linear_model import LogisticRegression

# Inicializamos el modelo usando la libreria de python
logit = LogisticRegression(max_iter = 4000, penalty='l2', l1_ratio=0.5476981453244353, C=13.152923535284875)
# Ajustamos nuestro modelo con los datos que reservamos para "entrenar" al modelo con la historia disponible
logit.fit(X_train, y_train)



In [17]:
rl_opt = LogisticRegression(random_state = 0, solver = "newton-cg") #Combinación Óptima
rl_opt.fit(X_train, y_train)



In [18]:
# Predecir sobre los datos que dejamos para probar nuestro modelo usando la probabilidad
proba_logit = rl_opt.predict_proba(X)
len(proba_logit)

proba_logit_aceptar = []

for i in range (len(proba_logit)):
    proba_logit_aceptar.append(proba_logit[i][1])

proba_logit_aceptar

[0.014704402022798702,
 0.02253277199302737,
 0.015643811700076043,
 0.015309402626095172,
 0.015445477580106896,
 0.01564054218604935,
 0.015354555491814694,
 0.022496882736127495,
 0.015354726390637001,
 0.014895354516352639,
 0.021684254636488706,
 0.015330920871668803,
 0.015620367522463522,
 0.020520645422725487,
 0.23400256160410512,
 0.02253059604553938,
 0.015612936204644337,
 0.021302939941498622,
 0.015051939317902172,
 0.015343233893669185,
 0.021688761811032096,
 0.020938117576520174,
 0.022101715961867618,
 0.01536358287142602,
 0.015336734772157811,
 0.022490971230746653,
 0.02487014861117813,
 0.2728125135869336,
 0.02526307977446949,
 0.028494345944437442,
 0.28327051611855025,
 0.02198665905945906,
 0.025194583958174716,
 0.2234744043029739,
 0.022074978563759895,
 0.013942635569586687,
 0.015658147706576205,
 0.025209462438782287,
 0.1631630781802363,
 0.015442288814041895,
 0.02246930425872653,
 0.017606790869750814,
 0.02209181574384873,
 0.17703674502597047,
 0.022

In [19]:
df_proba = pd.DataFrame({'probabilidad_de_aceptar':proba_logit_aceptar})
df_proba

Unnamed: 0,probabilidad_de_aceptar
0,0.014704
1,0.022533
2,0.015644
3,0.015309
4,0.015445
...,...
602555,0.025353
602556,0.020302
602557,0.012638
602558,0.080413


In [20]:
tabla_con_id_cliente_mas_probabilidad = pd.merge(tabla_con_id_cliente, df_proba, left_index=True, right_index=True)

tabla_con_id_cliente_mas_probabilidad

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,ID_CLIENTE,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO,probabilidad_de_aceptar
0,1.0,0.0,0.0,2.0,6.0,1375586.0,87218.10,36.0,3.0,1.0,0.0,0.014704
1,1.0,0.0,0.0,1.0,35.0,1050612.0,122179.11,23.0,1.0,0.0,0.0,0.022533
2,0.0,0.0,0.0,1.0,35.0,1050613.0,119775.54,23.0,1.0,1.0,0.0,0.015644
3,1.0,0.0,0.0,1.0,35.0,1050615.0,22220.04,24.0,1.0,1.0,0.0,0.015309
4,1.0,0.0,0.0,1.0,35.0,1050616.0,295590.36,24.0,1.0,1.0,0.0,0.015445
...,...,...,...,...,...,...,...,...,...,...,...,...
602555,0.0,0.0,0.0,2.0,62.0,899549.0,97397.16,41.0,6.0,1.0,0.0,0.025353
602556,0.0,0.0,0.0,2.0,7.0,1441442.0,168445.62,34.0,2.0,0.0,0.0,0.020302
602557,0.0,0.0,0.0,1.0,6.0,1454431.0,53689.02,24.0,1.0,1.0,0.0,0.012638
602558,0.0,0.0,0.0,2.0,229.0,56812.0,64404.21,61.0,7.0,0.0,0.0,0.080413


In [21]:
tabla_con_id_cliente_mas_probabilidad = tabla_con_id_cliente_mas_probabilidad.sort_values(by='probabilidad_de_aceptar', ascending = False)
tabla_con_id_cliente_mas_probabilidad.dropna()

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,ID_CLIENTE,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO,probabilidad_de_aceptar
269471,0.0,1.0,0.0,2.0,205.0,98717.0,21674246.67,47.0,10.0,0.0,0.0,0.777377
275354,0.0,1.0,0.0,3.0,241.0,19398.0,644710.38,46.0,10.0,0.0,1.0,0.734382
281523,1.0,1.0,0.0,2.0,217.0,48404.0,22034738.76,45.0,10.0,1.0,1.0,0.732796
276236,0.0,1.0,0.0,3.0,244.0,16233.0,207387.69,47.0,10.0,0.0,1.0,0.731860
275184,0.0,1.0,0.0,3.0,240.0,19929.0,36034.77,46.0,10.0,0.0,1.0,0.729130
...,...,...,...,...,...,...,...,...,...,...,...,...
597797,1.0,0.0,0.0,1.0,1.0,1436134.0,67531.65,31.0,1.0,1.0,0.0,0.010742
470693,1.0,0.0,0.0,1.0,1.0,1402635.0,63119.28,31.0,1.0,1.0,0.0,0.010740
597871,1.0,0.0,0.0,1.0,1.0,1440072.0,55267.65,31.0,1.0,1.0,0.0,0.010738
566252,0.0,0.0,0.0,1.0,1.0,1400657.0,53738.85,31.0,1.0,1.0,0.0,0.010737


In [22]:
tabla_con_id_cliente_mas_probabilidad.to_csv(r'resultados_modelos\Resultados_Regresion_Logistica.csv', index=False, header=True)