In [None]:
#Librería para manejo de datos
import pandas as pd 
#Librería para manejo de tensores 
import numpy as np
#Modelo seleccionado
from xgboost import XGBRegressor
#Separación de datos y validación
from sklearn.model_selection import train_test_split, KFold, cross_val_score
#Codificación de variables categóricas
from sklearn.preprocessing import  OneHotEncoder
#Transformador
from sklearn.compose import ColumnTransformer
#Pipeline
from sklearn.pipeline import Pipeline
#Evaluacion del modelo
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [3]:
# Load the cleaned housing dataset and preview its first four rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column that looks like a
# row index written into the CSV — confirm whether it should be a feature.
df = pd.read_csv(filepath_or_buffer='../data/housing_clean.csv')
df.head(n=4)

Unnamed: 0,longitude,latitude,housing_median_age,population,households,median_income,median_house_value,ocean_proximity,bedrooms_x_room
0,-122.23,37.88,41.0,322.0,126.0,8.3252,452600.0,NEAR BAY,0.146591
1,-122.22,37.86,21.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,0.155797
2,-122.24,37.85,52.0,496.0,177.0,7.2574,352100.0,NEAR BAY,0.129516
3,-122.25,37.85,52.0,558.0,219.0,5.6431,341300.0,NEAR BAY,0.184458


In [4]:
# Target: the median house value of each row.
Y = df['median_house_value']

# Features: every column except the target and 'Unnamed: 0'.
# 'Unnamed: 0' appears to be the row index that was saved into the CSV (the
# df.head() preview shows 0, 1, 2, 3); it is not a housing attribute, and
# leaving it in X would feed it to the model as a numeric feature.
# errors='ignore' keeps this cell working if the CSV is regenerated without
# that column.
X = df.drop(columns=['median_house_value', 'Unnamed: 0'], errors='ignore')

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [5]:
# Split the feature columns by dtype: numeric ones vs. everything else.
numericas = X.select_dtypes(include=[np.number]).columns
categoricas = X.select_dtypes(exclude=[np.number]).columns

# Preprocessor: numeric columns pass through unchanged; the remaining columns
# are one-hot encoded with handle_unknown='ignore'.
transformer = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numericas),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categoricas),
    ]
)

In [6]:
# Gradient-boosted regressor with the initially selected hyperparameters.
xg_model = XGBRegressor(
    n_estimators=552,
    learning_rate=0.0584,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=0.4,
    reg_lambda=1.7,
    random_state=42,
)

# Chain the preprocessor and the model, then fit on the training split.
pipline = Pipeline(steps=[('preprocesador', transformer), ('model', xg_model)])
pipline.fit(X_train, Y_train)

0,1,2
,steps,"[('preprocesador', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [7]:
# Score the fitted pipeline on the held-out test split (MSE, MAE, R2).
pred = pipline.predict(X_test)
mse = mean_squared_error(Y_test, pred)
mae = mean_absolute_error(Y_test, pred)
r2 = r2_score(Y_test, pred)
print(f'MSE: {mse} | MAE: {mae} | R2: {r2}')

MSE: 2171064009.380816 | MAE: 30286.75951959181 | R2: 0.8343216472543356


In [8]:
# Score the same pipeline on the training split, to compare against the test metrics.
pred_train = pipline.predict(X_train)
mse_train = mean_squared_error(Y_train, pred_train)
mae_train = mean_absolute_error(Y_train, pred_train)
r2_train = r2_score(Y_train, pred_train)
print(f'MSE_TRAIN: {mse_train} | MAE_TRAIN: {mae_train} | R2_TRAIN: {r2_train}')

MSE_TRAIN: 229610239.5061011 | MAE_TRAIN: 10755.653531096703 | R2_TRAIN: 0.9828236099690209


# **Reducción de overfitting**

In [27]:
# Re-tuned regressor to reduce overfitting: fewer and shallower trees, and
# larger L1/L2 regularization terms than the first model.
xg_model = XGBRegressor(
    n_estimators=400,
    learning_rate=0.0584,
    max_depth=6,
    subsample=0.7,
    colsample_bytree=0.8,
    reg_alpha=0.6,
    reg_lambda=2.0,
    random_state=42,
)

# **Cambios en los hiperparámetros**

**n_estimators:** Antes: 552 | Ahora: 400 

**max_depth:** Antes: 8 | Ahora: 6 

**reg_alpha:** Antes: 0.4 | Ahora: 0.6 

**reg_lambda:** Antes: 1.7 | Ahora: 2.0 

In [28]:
# Rebuild the pipeline around the current xg_model and fit it on the training split.
pipline = Pipeline(steps=[('preprocesador', transformer), ('model', xg_model)])
pipline.fit(X_train, Y_train)

0,1,2
,steps,"[('preprocesador', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [29]:
# Score the refitted pipeline on the held-out test split (MSE, MAE, R2).
pred = pipline.predict(X_test)
mse = mean_squared_error(Y_test, pred)
mae = mean_absolute_error(Y_test, pred)
r2 = r2_score(Y_test, pred)
print(f'MSE: {mse} | MAE: {mae} | R2: {r2}')

MSE: 2260046986.0253234 | MAE: 31361.352946909825 | R2: 0.8275311735837446


In [30]:
# Score the refitted pipeline on the training split, to measure the train/test gap.
pred_train = pipline.predict(X_train)
mse_train = mean_squared_error(Y_train, pred_train)
mae_train = mean_absolute_error(Y_train, pred_train)
r2_train = r2_score(Y_train, pred_train)
print(f'MSE_TRAIN: {mse_train} | MAE_TRAIN: {mae_train} | R2_TRAIN: {r2_train}')

MSE_TRAIN: 991699104.9221812 | MAE_TRAIN: 21939.077480464017 | R2_TRAIN: 0.9258142378312195


In [9]:
# 5-fold cross-validation (shuffled, fixed seed) to check that R2 stays stable across folds.
# NOTE(review): this cell's execution count (9) is lower than the re-tuned
# model cells (27-30), so the stored scores presumably belong to the earlier
# pipeline — re-run after refitting to confirm.
flod = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipline,
    X=X,
    y=Y,
    scoring='r2',
    cv=flod,
    verbose=2,
)

[CV] END .................................................... total time=   1.5s
[CV] END .................................................... total time=   1.2s
[CV] END .................................................... total time=   0.9s
[CV] END .................................................... total time=   1.1s
[CV] END .................................................... total time=   1.0s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.7s finished


In [10]:
# Report the per-fold R2 values and their average.
print(f'Scores: {scores}')
print(f'Mean scores: {scores.mean()}')

Scores: [0.83308671 0.85121714 0.84310032 0.86138494 0.83858046]
Mean scores: 0.8454739154789859


Se redujo el overfitting: la diferencia de R² entre entrenamiento y prueba bajó de ~0.15 (0.983 vs 0.834) a ~0.10 (0.926 vs 0.828).
Listo para desplegar el modelo.