# Model Baseline

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, root_mean_squared_error, root_mean_squared_log_error, r2_score

## Carga de Datos Preprocesados

In [22]:
trainset_df = pd.read_csv("../../data/processed/train_data_transformed.csv")
testset_df = pd.read_csv("../../data/processed/test_data_transformed.csv")
trainset_df.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,19.0,0,10049.0,1,1.0,1,22.598761,2,2,17.0,5.0,0,0,2,2,2869.0
1,39.0,0,31678.0,0,3.0,2,15.569731,0,1,12.0,2.0,1,1,1,2,1483.0
2,23.0,1,25602.0,0,3.0,0,47.177549,1,2,14.0,3.0,2,1,2,2,567.0
3,21.0,1,141855.0,1,2.0,1,10.938144,0,0,0.0,1.0,0,1,3,0,765.0
4,21.0,1,39651.0,2,1.0,1,20.376094,0,2,8.0,4.0,0,1,2,2,2022.0


## División de los Datos en Conjuntos de Entrenamiento y Prueba

In [4]:
X = trainset_df.drop(columns=["Premium Amount"])
Y = trainset_df["Premium Amount"]

In [5]:
X.head()

Unnamed: 0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Health Score,Location,Policy Type,Vehicle Age,Insurance Duration,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,19.0,0,10049.0,1,1.0,1,22.598761,2,2,17.0,5.0,0,0,2,2
1,39.0,0,31678.0,0,3.0,2,15.569731,0,1,12.0,2.0,1,1,1,2
2,23.0,1,25602.0,0,3.0,0,47.177549,1,2,14.0,3.0,2,1,2,2
3,21.0,1,141855.0,1,2.0,1,10.938144,0,0,0.0,1.0,0,1,3,0
4,21.0,1,39651.0,2,1.0,1,20.376094,0,2,8.0,4.0,0,1,2,2


In [6]:
Y.head()

0    2869.0
1    1483.0
2     567.0
3     765.0
4    2022.0
Name: Premium Amount, dtype: float64

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=1)

## Creación del Modelo usando Arbol de Deisión para Regresión

In [11]:
insurance_regression_tree = DecisionTreeRegressor(criterion="squared_error", max_depth=6)

## Entrenamiento

In [12]:
insurance_regression_tree.fit(X_train, Y_train)

## Evaluación

In [13]:
insurance_regression_tree.score(X_test,Y_test)

0.007729750978981875

In [32]:
y_pred = insurance_regression_tree.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = root_mean_squared_error(Y_test, y_pred)
rmsle_value = root_mean_squared_log_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"RMSLE: {rmsle_value}")
print(f"R2 Score: {r2:.4f}")


MAE: 661.5206463239788
MSE: 737145.2943197117
RMSE: 858.5716593969962
RMSLE: 1.1626354974581343
R2 Score: 0.0077


## Test Datos Nuevos

In [23]:
ytest_pred = insurance_regression_tree.predict(testset_df)
ytest_pred

array([1260.38577811, 1045.35615768, 1086.0970185 , ..., 1118.66705808,
       1063.1442655 , 1086.0970185 ])

### Exportar Predicciones

In [24]:
original_testset_df = pd.read_csv("../../data/raw/test.csv")
original_testset_df["Premium Amount"] = ytest_pred
original_testset_df[["id", "Premium Amount"]].to_csv("../../data/outputs/predictions.csv", index=False)