In [1]:
import polars as pl
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns

from typing import List

from scipy.stats import chi2_contingency
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

import time

#import optuna
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from functools import partial
from sklearn import ensemble
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, log_loss, brier_score_loss, roc_auc_score, roc_curve, classification_report, make_scorer
from sklearn.calibration import CalibratedClassifierCV, calibration_curve, CalibrationDisplay
from sklearn.metrics import roc_auc_score, r2_score

import time

In [2]:
def volumetria(df, columna, orden):
    """Build a frequency table for one column of ``df``, counting nulls too.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table. It is never modified.
    columna : str
        Name of the column to profile.
    orden : int
        0 -> rows sorted by share of total, largest first.
        1 -> rows sorted by category value, ascending, with the null row last.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``[columna, 'Cantidad', '% Total', '% Acumulado']``. Null
        values are reported as a single row labelled ``'NULL'``. For any other
        value of ``orden`` a message is printed and ``None`` is returned.
    """
    # Validate first so an invalid request does not pay for the whole count.
    if orden not in (0, 1):
        print('En orden solo puede escoger los valores 0, 1')
        return None

    serie = df[columna]
    nulos = serie.isna()
    if nulos.any():
        # Collapse every kind of null (None, NaN, NaT, ...) into the dtype's own
        # missing marker so they are counted as one single category.
        serie = serie.where(~nulos)

    # Count nulls as a regular category instead of filling them with a string
    # up front: filling first breaks categorical columns and makes numeric
    # columns un-sortable (mixed numbers and text).
    resultado = (
        serie.value_counts(dropna=False)
        .rename_axis(columna)
        .reset_index(name='Cantidad')
    )
    resultado['% Total'] = (resultado['Cantidad'] / resultado['Cantidad'].sum()) * 100

    # Stable sorts keep tied rows in a deterministic order.
    if orden == 0:
        resultado = resultado.sort_values('% Total', ascending=False, kind='stable')
    else:
        resultado = resultado.sort_values(
            columna, ascending=True, na_position='last', kind='stable'
        )

    # Only now swap the missing marker for the 'NULL' label (sorting is done).
    es_nulo = resultado[columna].isna()
    if es_nulo.any():
        resultado[columna] = resultado[columna].astype(object).where(~es_nulo, 'NULL')

    resultado['% Acumulado'] = resultado['% Total'].cumsum()
    return resultado

In [3]:
# Revision de Missings
def revision_missings(df):
    """Summarize the missing values of every column in ``df``.

    Returns a DataFrame with a fresh 0..n-1 index and one row per column:
    ``COLUMNA`` (column label), ``NRO_NULL`` (number of nulls) and ``%_NULL``
    (share of nulls, rounded to two decimals and rendered as text ending in
    ``'%'``).
    """
    mascara_nulos = df.isna()
    conteo = mascara_nulos.sum()
    porcentaje = mascara_nulos.mean().mul(100).round(2).astype(str) + '%'

    resumen = pd.DataFrame({
        'COLUMNA': df.columns,
        'NRO_NULL': conteo,
        '%_NULL': porcentaje,
    })
    # The per-column Series are indexed by column label; drop that index.
    return resumen.reset_index(drop=True)

In [4]:
import os
import unidecode  # third-party; install it with: pip install unidecode

# Folder holding the raw data. It defaults to the original local path, but can
# be overridden with the DATATHON_RAW_DIR environment variable so the notebook
# runs on other machines without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, so code that appends a
# file name directly to `directory_contactos` keeps working.
directory_contactos = os.path.join(
    os.environ.get(
        "DATATHON_RAW_DIR",
        "C:/Users/BW439WF/Downloads/Datathon DMC/Datathon_DMC/data/02_raw/",
    ),
    "",
)
df_contactos = pd.read_excel(os.path.join(directory_contactos, 'BBDD_CONTACTOS.xlsx'))

# Helper to normalize column names
def clean_column_names(columns):
    """Return a list with a normalized version of every column label.

    Each label is converted to text, upper-cased, has its spaces replaced by
    underscores and is finally passed through ``unidecode.unidecode`` (which
    removes accents, e.g. the headers shown below come out without them).

    ``str(col)`` is applied first because header labels are not always
    strings (a numeric or date header would have no ``.upper()`` method).
    """
    return [unidecode.unidecode(str(col).upper().replace(" ", "_")) for col in columns]

# Normalize the DataFrame's column labels with the helper above
df_contactos.columns = clean_column_names(list(df_contactos.columns))

# Data summary: distinct customer ids vs. total rows, then a preview
print(
    f'Nro Clientes: {df_contactos["ID_COMPLETO"].nunique():,}',
    f'Nro Filas: {len(df_contactos):,}',
    sep='\n',
)
df_contactos.head(5)

Nro Clientes: 147,513
Nro Filas: 147,513


Unnamed: 0,ID_COMPLETO,PAIS_DE_CORREO,ESTADO_O_PROVINCIA_DE_CORREO,CODIGO_POSTAL_DE_CORREO,CIUDAD_DE_CORREO,NOMBRE,GENERO,TRATAMIENTO,ESTADO_CIVIL,TIENE_HIJOS,HOBBIE,TIENE_NIETOS,NIVEL_DE_ESTUDIOS,TRABAJA_ACTUALMENTE,OCUPACION
0,0031R00001uhz7IQAQ,Perú,Lima,Lima,San Borja,Gledy Georgina,Femenino,Srita.,,,,,,,
1,0031R0000281tkvQAA,Perú,Lima,Lima,San Borja,Fredy Enrique,Masculino,Sr.,,,,,,,
2,0031R00002KqNYbQAN,Perú,Lima,Lima,Santiago De Surco,Alvaro Patricio,Masculino,Sr.,Soltero/a?,No,,,,,
3,0031R00002hJzb5QAC,Perú,Lambayeque,Chiclayo,Chiclayo,Anton Cesar,Masculino,Sr.,,,,,,,
4,0033600001ADzauAAD,Perú,Lima,Lima,Comas,Diana Alison Milagros,Femenino,Sra.,,,,,,,


#### 1. Validación de Estructura

In [5]:
# Print each column's dtype and non-null count to validate the table structure.
df_contactos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147513 entries, 0 to 147512
Data columns (total 15 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   ID_COMPLETO                   147513 non-null  object
 1   PAIS_DE_CORREO                146171 non-null  object
 2   ESTADO_O_PROVINCIA_DE_CORREO  144793 non-null  object
 3   CODIGO_POSTAL_DE_CORREO       143507 non-null  object
 4   CIUDAD_DE_CORREO              143476 non-null  object
 5   NOMBRE                        147513 non-null  object
 6   GENERO                        147466 non-null  object
 7   TRATAMIENTO                   147458 non-null  object
 8   ESTADO_CIVIL                  19568 non-null   object
 9   TIENE_HIJOS                   20951 non-null   object
 10  HOBBIE                        383 non-null     object
 11  TIENE_NIETOS                  72 non-null      object
 12  NIVEL_DE_ESTUDIOS             479 non-null     object
 13 

In [6]:
# Count the null values of each column
revision_missings(df_contactos)

Unnamed: 0,COLUMNA,NRO_NULL,%_NULL
0,ID_COMPLETO,0,0.0%
1,PAIS_DE_CORREO,1342,0.91%
2,ESTADO_O_PROVINCIA_DE_CORREO,2720,1.84%
3,CODIGO_POSTAL_DE_CORREO,4006,2.72%
4,CIUDAD_DE_CORREO,4037,2.74%
5,NOMBRE,0,0.0%
6,GENERO,47,0.03%
7,TRATAMIENTO,55,0.04%
8,ESTADO_CIVIL,127945,86.73%
9,TIENE_HIJOS,126562,85.8%


#### 2. Volumetría - Análisis Exploratorio

##### 2.1 Frecuencias Categorías

In [7]:
# Using the helper: frequency table of the column at position 0,
# sorted by category value (orden=1).
i = 0
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], orden=1)

Nro Columna: 0


Unnamed: 0,ID_COMPLETO,Cantidad,% Total,% Acumulado
0,0031R00001uhz7IQAQ,1,0.000678,0.000678
98344,0031R00001uhz7JQAQ,1,0.000678,0.001356
98350,0031R00001uhz7KQAQ,1,0.000678,0.002034
98329,0031R00001uhz7LQAQ,1,0.000678,0.002712
98326,0031R00001uhz7MQAQ,1,0.000678,0.003390
...,...,...,...,...
49173,003UX00000I2PGCYA3,1,0.000678,99.997288
49174,003UX00000I2VWoYAN,1,0.000678,99.997966
49175,003UX00000I2VWpYAN,1,0.000678,99.998644
49176,003UX00000I2bSXYAZ,1,0.000678,99.999322


In [8]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 1
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 1


Unnamed: 0,PAIS_DE_CORREO,Cantidad,% Total,% Acumulado
0,Perú,146164,99.085504,99.085504
1,,1342,0.90975,99.995255
2,Estados Unidos,5,0.00339,99.998644
3,España,2,0.001356,100.0


In [9]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 2
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 2


Unnamed: 0,ESTADO_O_PROVINCIA_DE_CORREO,Cantidad,% Total,% Acumulado
0,Lima,105944,71.820111,71.820111
1,Callao,8298,5.625267,77.445378
2,Lambayeque,8011,5.430708,82.876085
3,Arequipa,4981,3.376652,86.252737
4,Loreto,3190,2.162521,88.415258
5,Piura,2961,2.007281,90.422539
6,,2720,1.843905,92.266444
7,Ica,2036,1.380217,93.646662
8,Áncash,1790,1.213452,94.860114
9,La Libertad,1519,1.02974,95.889854


In [10]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 3
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 3


Unnamed: 0,CODIGO_POSTAL_DE_CORREO,Cantidad,% Total,% Acumulado
0,Lima,104942,71.140849,71.140849
1,Callao,8296,5.623911,76.764760
2,Chiclayo,7280,4.935158,81.699918
3,Arequipa,4819,3.266831,84.966749
4,,4006,2.715693,87.682442
...,...,...,...,...
179,San Agustín de Cajas,1,0.000678,99.997288
178,26 De Octubre,1,0.000678,99.997966
177,Chulucanas,1,0.000678,99.998644
176,Paucar pata,1,0.000678,99.999322


In [11]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 4
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 4


Unnamed: 0,CIUDAD_DE_CORREO,Cantidad,% Total,% Acumulado
0,Lima,23763,16.109089,16.109089
1,Santiago De Surco,11266,7.637293,23.746382
2,Chiclayo,5126,3.474948,27.221330
3,Miraflores,5017,3.401056,30.622386
4,San Martin De Porres,4499,3.049901,33.672287
...,...,...,...,...
522,Tejas,1,0.000678,99.997288
523,Larco Herrera,1,0.000678,99.997966
524,Mariscal Castilla,1,0.000678,99.998644
525,Los Molinos,1,0.000678,99.999322


In [12]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 5
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 5


Unnamed: 0,NOMBRE,Cantidad,% Total,% Acumulado
0,Juan Carlos,781,0.529445,0.529445
1,Miguel Angel,711,0.481991,1.011436
2,Jose Luis,650,0.440639,1.452075
3,Luis Alberto,615,0.416912,1.868988
4,Jorge Luis,523,0.354545,2.223533
...,...,...,...,...
34330,Jaqueline Sadith,1,0.000678,99.997288
34331,Marilyn Rocio,1,0.000678,99.997966
34332,Sara Berenice,1,0.000678,99.998644
34333,Soraya Tathiana,1,0.000678,99.999322


In [13]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 6
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 6


Unnamed: 0,GENERO,Cantidad,% Total,% Acumulado
0,Femenino,80753,54.742972,54.742972
1,Masculino,66681,45.203474,99.946445
2,,47,0.031862,99.978307
3,Otro,32,0.021693,100.0


In [14]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 7
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 7


Unnamed: 0,TRATAMIENTO,Cantidad,% Total,% Acumulado
0,Sra.,67103,45.48955,45.48955
1,Sr.,66684,45.205507,90.695057
2,Srita.,13671,9.267658,99.962715
3,,55,0.037285,100.0


In [15]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 8
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 8


Unnamed: 0,ESTADO_CIVIL,Cantidad,% Total,% Acumulado
0,,127945,86.734728,86.734728
1,Soltero/a?,8858,6.004894,92.739623
2,Casado/a?,8032,5.444944,98.184567
3,Concubinato / Unión Libre,831,0.56334,98.747907
4,Divorciado/a?,566,0.383695,99.131602
5,Soltero/a,428,0.290144,99.421746
6,Viudo/a?,347,0.235234,99.656979
7,Casado/a,261,0.176934,99.833913
8,Separado/a,197,0.133548,99.96746
9,Divorciado/a,20,0.013558,99.981019


In [16]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 9
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 9


Unnamed: 0,TIENE_HIJOS,Cantidad,% Total,% Acumulado
0,,126562,85.797184,85.797184
1,Si,12539,8.500268,94.297452
2,No,8412,5.702548,100.0


In [17]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 10
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 10


Unnamed: 0,HOBBIE,Cantidad,% Total,% Acumulado
0,,147130,99.740362,99.740362
1,Baile,35,0.023727,99.764089
2,Lectura,30,0.020337,99.784426
3,Fútbol,27,0.018303,99.802729
4,Deportes,20,0.013558,99.816287
...,...,...,...,...
62,Canto; Voleibol,1,0.000678,99.997288
61,Baile; Correr,1,0.000678,99.997966
59,Dibujar; Escuchar música,1,0.000678,99.998644
58,Dibujar; Lectura,1,0.000678,99.999322


In [18]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 11
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 11


Unnamed: 0,TIENE_NIETOS,Cantidad,% Total,% Acumulado
0,,147441,99.951191,99.951191
1,Si,41,0.027794,99.978985
2,No,31,0.021015,100.0


In [19]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 12
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 12


Unnamed: 0,NIVEL_DE_ESTUDIOS,Cantidad,% Total,% Acumulado
0,,147034,99.675283,99.675283
1,Educación superior de tercer nivel (universida...,394,0.267095,99.942378
2,"Educación superior de cuarto nivel (maestría, ...",49,0.033217,99.975595
3,Educación secundaria alta,35,0.023727,99.999322
4,Educación primaria?,1,0.000678,100.0


In [20]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 13
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 13


Unnamed: 0,TRABAJA_ACTUALMENTE,Cantidad,% Total,% Acumulado
0,,146368,99.223797,99.223797
1,Si,1042,0.706378,99.930176
2,No,103,0.069824,100.0


In [21]:
# Explicit column position (instead of `i += 1`) so that re-running this cell,
# or running cells out of order, always profiles the same column.
i = 14
print(f'Nro Columna: {i}')
volumetria(df_contactos, df_contactos.columns[i], 0)

Nro Columna: 14


Unnamed: 0,OCUPACION,Cantidad,% Total,% Acumulado
0,,118407,80.268858,80.268858
1,Otra,3999,2.710948,82.979805
2,Empleado,3368,2.283189,85.262994
3,Administrador/a,2148,1.456143,86.719137
4,Ingeniero/a,2033,1.378184,88.097320
...,...,...,...,...
227,Mercadólogo,1,0.000678,99.997288
226,Musico,1,0.000678,99.997966
225,Académico,1,0.000678,99.998644
224,Asesor Legal,1,0.000678,99.999322
