In [1]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd

# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import RandomizedSearchCV
from sklearn.inspection import permutation_importance
from scipy.stats import randint
import multiprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay

# Bayesian
# ==============================================================================
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold,StratifiedKFold
from sklearn.metrics import classification_report
import xgboost as xgb
from collections import Counter
import time
import optuna
from optuna.samplers import TPESampler
from sklearn.datasets import make_classification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tabla_final = pd.read_csv('Tabla_final_colab.csv', sep = ",")


## Se borran columnas irrelevantes ##
tabla_final = tabla_final.drop('FECHA_DATO', axis=1)
tabla_final = tabla_final.drop('FECHA_PROCESO', axis=1)
tabla_final = tabla_final.drop('ID_PROD', axis=1)
tabla_final = tabla_final.drop('FLAG_PREAP', axis=1)

## Definimos una tabla con los id de clientes
## Para luego poeder funtarla con las prob ##
## Si se queda se considera como parametro ##
tabla_con_id_cliente = tabla_final

df = tabla_final.drop('ID_CLIENTE', axis=1)
#visualizamos los datos
df

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO
0,1.0,0.0,0.0,2.0,6.0,87218.10,36.0,3.0,1.0,0.0
1,1.0,0.0,0.0,1.0,35.0,122179.11,23.0,1.0,0.0,0.0
2,0.0,0.0,0.0,1.0,35.0,119775.54,23.0,1.0,1.0,0.0
3,1.0,0.0,0.0,1.0,35.0,22220.04,24.0,1.0,1.0,0.0
4,1.0,0.0,0.0,1.0,35.0,295590.36,24.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
602555,0.0,0.0,0.0,2.0,62.0,97397.16,41.0,6.0,1.0,0.0
602556,0.0,0.0,0.0,2.0,7.0,168445.62,34.0,2.0,0.0,0.0
602557,0.0,0.0,0.0,1.0,6.0,53689.02,24.0,1.0,1.0,0.0
602558,0.0,0.0,0.0,2.0,229.0,64404.21,61.0,7.0,0.0,0.0


In [3]:
from sklearn.model_selection import train_test_split
X = df[['ID_SEGMENTO_VALOR', 'EDAD', 'ID_GENERO', 'RENTA', 'DEBITO_DIR', 'ID_EMPLEADO', 'ANTIGUEDAD', 'EDAD_PUNTAJE']]
y = df['ACEPTADO']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=3)

In [None]:
def objective(trial):
    # Define the search space
    
    c_space=trial.suggest_loguniform('C',10e-1, 100)
    l1_rat_space=trial.suggest_uniform('l1_ratio',0.1, 1)
    

    clf =LogisticRegression(max_iter = 4000,penalty='l2',
                            random_state=2,C=c_space,l1_ratio=l1_rat_space)
    
    score = cross_val_score(clf, X_train, y_train, scoring='accuracy',
                            cv=4).mean()
    return score

study_rl = optuna.create_study(study_name="rlog_artif",
                            direction="maximize",
                              pruner=optuna.pruners.HyperbandPruner(max_resource="auto"),
                            sampler=TPESampler())
study_rl.optimize(objective, n_trials=50)

In [None]:
print(study_rl.best_params)

In [7]:
from sklearn.linear_model import LogisticRegression

# Inicializamos el modelo usando la libreria de python
logit = LogisticRegression(max_iter = 4000, penalty='l2', tol=0.0001, C=1.0)
# Ajustamos nuestro modelo con los datos que reservamos para "entrenar" al modelo con la historia disponible
logit.fit(X_train, y_train)

In [8]:
rl_opt = LogisticRegression(random_state = 0, solver = "newton-cg") #Combinación Óptima
rl_opt.fit(X_train, y_train)



In [14]:
# Predecir sobre los datos que dejamos para probar nuestro modelo usando la probabilidad
proba_logit = rl_opt.predict_proba(X)
len(proba_logit)

proba_logit_aceptar = []

for i in range (len(proba_logit)):
    proba_logit_aceptar.append(proba_logit[i][1])

proba_logit_aceptar

[0.014422863018442589,
 0.02289169825173691,
 0.01581633166680379,
 0.015446250695694943,
 0.01558058504731172,
 0.015813097096935166,
 0.015490828790088089,
 0.022856026773264374,
 0.01549099750816515,
 0.014461642866059743,
 0.021937333260446005,
 0.015467495373703562,
 0.015793137738683634,
 0.020628359806267597,
 0.23620873126397146,
 0.022889535542910747,
 0.0157857855899378,
 0.021505614589160307,
 0.01515357105354665,
 0.015479651571684487,
 0.02194179470694139,
 0.021091997576740292,
 0.022406756850770714,
 0.015499740921705437,
 0.015473235251661778,
 0.02285015102177847,
 0.024276378441029863,
 0.27090762050132006,
 0.02471385586274925,
 0.02853842222876234,
 0.283251226464157,
 0.021499261426620665,
 0.024718553094472327,
 0.22499530708924637,
 0.022380235991291735,
 0.013958022798584857,
 0.015830514281797534,
 0.024732846286566978,
 0.16336992664707178,
 0.015577437358912578,
 0.022828614811214964,
 0.017426784602655905,
 0.02239693689205535,
 0.1784046774077143,
 0.022447

In [15]:
df_proba = pd.DataFrame({'probabilidad_de_aceptar':proba_logit_aceptar})
df_proba

Unnamed: 0,probabilidad_de_aceptar
0,0.014423
1,0.022892
2,0.015816
3,0.015446
4,0.015581
...,...
602555,0.025494
602556,0.019825
602557,0.012786
602558,0.077868


In [16]:
tabla_con_id_cliente_mas_probabilidad = pd.merge(tabla_con_id_cliente, df_proba, left_index=True, right_index=True)

tabla_con_id_cliente_mas_probabilidad

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,ID_CLIENTE,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO,probabilidad_de_aceptar
0,1.0,0.0,0.0,2.0,6.0,1375586.0,87218.10,36.0,3.0,1.0,0.0,0.014423
1,1.0,0.0,0.0,1.0,35.0,1050612.0,122179.11,23.0,1.0,0.0,0.0,0.022892
2,0.0,0.0,0.0,1.0,35.0,1050613.0,119775.54,23.0,1.0,1.0,0.0,0.015816
3,1.0,0.0,0.0,1.0,35.0,1050615.0,22220.04,24.0,1.0,1.0,0.0,0.015446
4,1.0,0.0,0.0,1.0,35.0,1050616.0,295590.36,24.0,1.0,1.0,0.0,0.015581
...,...,...,...,...,...,...,...,...,...,...,...,...
602555,0.0,0.0,0.0,2.0,62.0,899549.0,97397.16,41.0,6.0,1.0,0.0,0.025494
602556,0.0,0.0,0.0,2.0,7.0,1441442.0,168445.62,34.0,2.0,0.0,0.0,0.019825
602557,0.0,0.0,0.0,1.0,6.0,1454431.0,53689.02,24.0,1.0,1.0,0.0,0.012786
602558,0.0,0.0,0.0,2.0,229.0,56812.0,64404.21,61.0,7.0,0.0,0.0,0.077868


In [17]:
tabla_con_id_cliente_mas_probabilidad = tabla_con_id_cliente_mas_probabilidad.sort_values(by='probabilidad_de_aceptar', ascending = False)
tabla_con_id_cliente_mas_probabilidad.dropna()

Unnamed: 0,CT_CTE,DEBITO_DIR,ID_EMPLEADO,ID_SEGMENTO_VALOR,ANTIGUEDAD,ID_CLIENTE,RENTA,EDAD,EDAD_PUNTAJE,ID_GENERO,ACEPTADO,probabilidad_de_aceptar
269471,0.0,1.0,0.0,2.0,205.0,98717.0,21674246.67,47.0,10.0,0.0,0.0,0.781240
281523,1.0,1.0,0.0,2.0,217.0,48404.0,22034738.76,45.0,10.0,1.0,1.0,0.736765
275354,0.0,1.0,0.0,3.0,241.0,19398.0,644710.38,46.0,10.0,0.0,1.0,0.734941
276236,0.0,1.0,0.0,3.0,244.0,16233.0,207387.69,47.0,10.0,0.0,1.0,0.731999
275184,0.0,1.0,0.0,3.0,240.0,19929.0,36034.77,46.0,10.0,0.0,1.0,0.729799
...,...,...,...,...,...,...,...,...,...,...,...,...
597797,1.0,0.0,0.0,1.0,1.0,1436134.0,67531.65,31.0,1.0,1.0,0.0,0.010711
470693,1.0,0.0,0.0,1.0,1.0,1402635.0,63119.28,31.0,1.0,1.0,0.0,0.010709
597871,1.0,0.0,0.0,1.0,1.0,1440072.0,55267.65,31.0,1.0,1.0,0.0,0.010707
566252,0.0,0.0,0.0,1.0,1.0,1400657.0,53738.85,31.0,1.0,1.0,0.0,0.010706


In [None]:
tabla_con_id_cliente_mas_probabilidad.to_csv(r'C:\Users\marti\Desktop\Capstone\procesamiento de datos\Resultados_Regresion_Logistica.csv', index=False, header=True)