<a href="https://colab.research.google.com/github/MartinDC95/Core-s-del-modulo-dos/blob/main/Proyecto_2_Parte_II_(core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Importar librerías
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Cargar datos
# Reemplaza 'ruta/dataset.csv' con la ruta de tu archivo o URL
df = pd.read_csv('/content/insurance.csv')

# Mostrar primeras filas
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Identificar columnas numéricas y categóricas
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = df.select_dtypes(include=['object']).columns

# Imputar valores numéricos con la media y categóricos con la moda
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Aplicar imputación
df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Verificar si quedan valores nulos
print("Valores nulos después de la imputación:\n", df.isnull().sum())

Valores nulos después de la imputación:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [7]:
# Detectar outliers usando el rango intercuartílico (IQR)
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filtrar los outliers (opcional: puedes eliminarlos o marcarlos)
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

In [8]:
# Configurar los transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)


In [11]:
# Importar librerías
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Cargar datos
# Reemplaza 'ruta/dataset.csv' con la ruta de tu archivo o URL
df = pd.read_csv('/content/insurance.csv')

# Mostrar primeras filas
df.head()

# Identificar columnas numéricas y categóricas
# Exclude 'charges' from num_cols since it's the target variable
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('charges')
cat_cols = df.select_dtypes(include=['object']).columns

# Imputar valores numéricos con la media y categóricos con la moda
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Aplicar imputación
df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Verificar si quedan valores nulos
print("Valores nulos después de la imputación:\n", df.isnull().sum())

# Detectar outliers usando el rango intercuartílico (IQR)
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filtrar los outliers (opcional: puedes eliminarlos o marcarlos)
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

# Configurar los transformadores
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# Crear el pipeline de preprocesamiento
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Dividir en conjuntos de entrenamiento y prueba
# Replace 'charges' with the actual name of your target column
X = df.drop('charges', axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Aplicar el pipeline a los datos de entrenamiento
X_train_prep = pipeline.fit_transform(X_train)
X_test_prep = pipeline.transform(X_test)


Valores nulos después de la imputación:
 age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [14]:
# Importar modelos
from sklearn.linear_model import LinearRegression  # Importa LinearRegression
from sklearn.neighbors import KNeighborsRegressor  # Importa KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor  # Importa DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor  # Importa RandomForestRegressor
from xgboost import XGBRegressor  # Importa XGBRegressor
from lightgbm import LGBMRegressor  # Importa LGBMRegressor

# Importar métricas
from sklearn.metrics import mean_squared_error, r2_score  # Importa métricas de regresión
from sklearn.model_selection import cross_val_score

# Definir modelos a entrenar
# Cambia los modelos a modelos de regresión
modelos = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor()
}

# Definir función para evaluar cada modelo
# Cambia la métrica de evaluación a una métrica de regresión (e.g., 'neg_mean_squared_error')
def evaluar_modelo(modelo, X, y):
    scores = cross_val_score(modelo, X, y, cv=5, scoring='neg_mean_squared_error')
    return np.mean(scores)

# Evaluar cada modelo
resultados = {}
for nombre, modelo in modelos.items():
    puntuacion = evaluar_modelo(modelo, X_train_prep, y_train)
    resultados[nombre] = puntuacion
    print(f"{nombre}: MSE promedio de validación cruzada = {puntuacion:.4f}")  # Cambia la etiqueta a MSE

# Mostrar resultados
print("\nResultados de Validación Cruzada para cada modelo:")
for modelo, score in resultados.items():
    print(f"{modelo}: {score:.4f}")


Linear Regression: MSE promedio de validación cruzada = -37251026.0639
KNN: MSE promedio de validación cruzada = -45288429.6525
Decision Tree: MSE promedio de validación cruzada = -40572273.4434
Random Forest: MSE promedio de validación cruzada = -23745554.3774
XGBoost: MSE promedio de validación cruzada = -26889029.5215
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001049 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 850, number of used features: 8
[LightGBM] [Info] Start training from score 13413.050740
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 319
[LightGBM] [Info] Number of data points in the train set: 850, number of use

In [17]:
# Importar modelos
from sklearn.linear_model import LinearRegression  # Importa LinearRegression
from sklearn.neighbors import KNeighborsRegressor  # Importa KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor  # Importa DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor  # Importa RandomForestRegressor
from xgboost import XGBRegressor  # Importa XGBRegressor
from lightgbm import LGBMRegressor  # Importa LGBMRegressor

# Importar métricas
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error  # Importa métricas de regresión
from sklearn.model_selection import cross_val_score

# Entrenar el modelo con mejor rendimiento en el conjunto de entrenamiento y obtener métricas adicionales
mejor_modelo_nombre = max(resultados, key=resultados.get)
mejor_modelo = modelos[mejor_modelo_nombre]

# Entrenar en el conjunto de entrenamiento
mejor_modelo.fit(X_train_prep, y_train)

# Predicción en el conjunto de prueba
y_pred = mejor_modelo.predict(X_test_prep)

# Calcular métricas de regresión
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # Raíz cuadrada del MSE
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Imprimir métricas de regresión
print(f"\nMétricas para el mejor modelo ({mejor_modelo_nombre}):")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 1063, number of used features: 8
[LightGBM] [Info] Start training from score 13180.426961

Métricas para el mejor modelo (LightGBM):
Mean Squared Error (MSE): 23202223.7750
Root Mean Squared Error (RMSE): 4816.8687
Mean Absolute Error (MAE): 2691.6475
R-squared (R2): 0.8434
