Librerias

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

Cargando datos

In [2]:
# Dataframe preprocesado
df = pd.read_csv('data/df.txt', sep='\t', encoding='latin-1')
df.head(20)

Unnamed: 0,asegurado_id,genero,ciudad,cancer,epoc,diabetes,hipertension,enf_cardiovascular,reclamacion,eventos,valor_pagado,estado_poliza,segmento_edad,diagnostico,tiempo_poliza
0,11885152,0,Bogota,0,0,0,0,0,CONSULTA EXTERNA,1,439577.2,Poliza activa,Adultez,Diagnostico pendiente,24
1,763957,0,Medellin,0,0,0,0,0,LABORATORIO CLINICO,1,423147.3,Poliza activa,Adultez,Diagnostico pendiente,24
2,763957,0,Medellin,0,0,0,0,0,EXAMENES DE DIAGNOSTICO,1,458111.4,Poliza activa,Adultez,Diagnostico pendiente,24
3,763957,0,Medellin,0,0,0,0,0,EXAMENES DE DIAGNOSTICO,1,407415.9,Poliza activa,Adultez,Factores que influyen en el estado de salud (S...,24
4,763957,0,Medellin,0,0,0,0,0,FISIOTERAPIA ILIMITADA,5,499010.0,Poliza activa,Adultez,Diagnostico pendiente,24
5,763957,0,Medellin,0,0,0,0,0,EXAMENES DE DIAGNOSTICO,1,434168.0,Poliza activa,Adultez,Factores que influyen en el estado de salud (S...,24
6,763957,0,Medellin,0,0,0,0,0,LABORATORIO CLINICO,2,486938.6,Poliza activa,Adultez,Diagnostico pendiente,24
7,763957,0,Medellin,0,0,0,0,0,LABORATORIO CLINICO,4,543457.0,Poliza activa,Adultez,Diagnostico pendiente,24
8,763957,0,Medellin,0,0,0,0,0,CONSULTA DE URGENCIAS,1,519215.7,Poliza activa,Adultez,Diagnostico pendiente,24
9,763957,0,Medellin,0,0,0,0,0,CONSULTA EXTERNA,1,425636.1,Poliza activa,Adultez,Diagnostico pendiente,24


In [3]:
# Verificar tipo de la columna Valor_Pagado
print(df['valor_pagado'].dtype)

# Paso 1: Convertir valores a string para reemplazar punto por coma (si fuera necesario)
df['valor_pagado'] = df['valor_pagado'].apply(lambda x: f"{x:,.2f}".replace(',', 'X').replace('.', ',').replace('X', '.'))

print(df)

float64
         asegurado_id  genero    ciudad  cancer  epoc  diabetes  hipertension  \
0            11885152       0    Bogota       0     0         0             0   
1              763957       0  Medellin       0     0         0             0   
2              763957       0  Medellin       0     0         0             0   
3              763957       0  Medellin       0     0         0             0   
4              763957       0  Medellin       0     0         0             0   
...               ...     ...       ...     ...   ...       ...           ...   
3778951      34905586       0    Bogota       0     0         0             0   
3778952      34905586       0    Bogota       0     0         0             0   
3778953      34905586       0    Bogota       0     0         0             0   
3778954      34905586       0    Bogota       0     0         0             0   
3778955      34905586       0    Bogota       0     0         0             0   

         enf_cardio

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3778956 entries, 0 to 3778955
Data columns (total 15 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   asegurado_id        int64 
 1   genero              int64 
 2   ciudad              object
 3   cancer              int64 
 4   epoc                int64 
 5   diabetes            int64 
 6   hipertension        int64 
 7   enf_cardiovascular  int64 
 8   reclamacion         object
 9   eventos             int64 
 10  valor_pagado        object
 11  estado_poliza       object
 12  segmento_edad       object
 13  diagnostico         object
 14  tiempo_poliza       int64 
dtypes: int64(9), object(6)
memory usage: 432.5+ MB


In [5]:
# Convertir 'valor_pagado' a numérico, eliminando caracteres especiales
df['valor_pagado'] = df['valor_pagado'].str.replace(r'[^\d,]', '', regex=True).str.replace(',', '.').astype(float)

# Selección de características y target
X = df.drop(columns=['valor_pagado', 'reclamacion', 'diagnostico'])
y = df['valor_pagado']

# Identificación de variables numéricas y categóricas
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Preprocesamiento de las características
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Definición de los modelos
models = {
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(n_estimators=100),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100)
}


In [None]:
# División de los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Evaluación de modelos
for name, model in models.items():
    # Crear pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('model', model)])
    
    # Validación cruzada
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    cv_rmse = np.sqrt(-cv_scores)
    print(f"{name} - CV RMSE: {cv_rmse.mean():.2f} ± {cv_rmse.std():.2f}")
    
    # Entrenamiento y predicción
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Evaluación en el conjunto de prueba
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - Test RMSE: {rmse:.2f}, R^2: {r2:.2f}\n")

In [None]:
# Guardar el modelo con mejor desempeño
#best_model = models["Random Forest"]
#best_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                                ('model', best_model)])
#best_pipeline.fit(X_train, y_train)
#joblib.dump(best_pipeline, 'best_model.pkl')