In [25]:
# BLOQUE 1: Importar librerías y montar Drive
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from geopy.distance import geodesic
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, SplineTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import category_encoders as ce
import joblib

# Semilla global
RNG = 42


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# BLOQUE 2: Carga del CSV y selección de columnas relevantes
file_path = '/content/drive/MyDrive/Dataset Idealista/pisosBarcelona-21-04-2025-clean.csv'
df = pd.read_csv(file_path, encoding='latin1')

relevant_cols = [
    'price','size','rooms','bathrooms','floor','hasLift','exterior',
    'propertyType','status','numPhotos','latitude','longitude',
    'hasParking','isParkingIncludedInPrice'
]
df_model = df[relevant_cols].copy()
print(f"Cargados {df_model.shape[0]} registros y {df_model.shape[1]} columnas")


Cargados 8478 registros y 14 columnas


In [27]:
# BLOQUE 3: Limpieza y formato sin chained-assignment
df_model = df_model.copy()

# Floor → numérico (bj=0, en=0.5, ss=-1), imputar mediana
floor_map = {'bj':0.0,'en':0.5,'ss':-1.0}
df_model['floor'] = pd.to_numeric(
    df_model['floor'].replace(floor_map),
    errors='coerce'
).fillna(df_model['floor'].map(floor_map).median())

# Exterior → 1/0, imputar moda
df_model['exterior'] = pd.to_numeric(
    df_model['exterior'].replace({'Unknown':np.nan, True:1, False:0}),
    errors='coerce'
).fillna(df_model['exterior'].mode()[0]).astype(int)

# hasLift → 1/0, imputar moda
df_model['hasLift'] = df_model['hasLift'].fillna(df_model['hasLift'].mode()[0]).astype(int)

# status → rellenar nulos con 'Unknown'
df_model['status'] = df_model['status'].fillna('Unknown')

# parking_status combinada + drop originales
conds = [
    (df_model['hasParking']==1)&(df_model['isParkingIncludedInPrice']==1),
    (df_model['hasParking']==1)&(df_model['isParkingIncludedInPrice']==0),
    (df_model['hasParking']==0)
]
choices = ['Included','Optional','None']
df_model['parking_status'] = np.select(conds, choices, default='Unknown')
df_model.drop(['hasParking','isParkingIncludedInPrice'], axis=1, inplace=True)

# Flags de nulos
for c in ['size','rooms','bathrooms','latitude','longitude']:
    df_model[f'isna_{c}'] = df_model[c].isna().astype(int)

# Imputar numéricos con mediana
for c in ['size','rooms','bathrooms','latitude','longitude']:
    df_model[c] = df_model[c].fillna(df_model[c].median())

print("Nulls remanentes:\n", df_model.isna().sum())


Nulls remanentes:
 price             0
size              0
rooms             0
bathrooms         0
floor             0
hasLift           0
exterior          0
propertyType      0
status            0
numPhotos         0
latitude          0
longitude         0
parking_status    0
isna_size         0
isna_rooms        0
isna_bathrooms    0
isna_latitude     0
isna_longitude    0
dtype: int64


  df_model['exterior'].replace({'Unknown':np.nan, True:1, False:0}),
  df_model['hasLift'] = df_model['hasLift'].fillna(df_model['hasLift'].mode()[0]).astype(int)


In [28]:
# BLOQUE 4: Ingeniería espacial + cluster avg price
# 4.1 Distancias a POIs
pois = {
  'Catalunya':(41.3874,2.1700),
  'Barceloneta':(41.3790,2.1885),
  'Sants':(41.3793,2.1400),
  'CampNou':(41.3809,2.1228),
  'ParcGuell':(41.4145,2.1527)
}
coords = list(zip(df_model['latitude'],df_model['longitude']))
for name, loc in pois.items():
    df_model[f'DistKm_{name}'] = [
        geodesic(loc, xy).km for xy in coords
    ]

# 4.2 Clustering geográfico + PCA(1)
geo = StandardScaler().fit_transform(df_model[['latitude','longitude']])
kmeans = KMeans(n_clusters=8, random_state=RNG, n_init='auto').fit(geo)
df_model['geo_cluster'] = kmeans.labels_

pca1 = PCA(n_components=1, random_state=RNG).fit_transform(geo)
df_model['geo_pca1'] = pca1.flatten()

# 4.3 Precio medio log por cluster
df_model['price_log'] = np.log1p(df_model['price'])
cluster_avg = df_model.groupby('geo_cluster')['price_log'].transform('mean')
df_model['cluster_avg_logprice'] = cluster_avg


In [29]:
# BLOQUE 5: Codificación objetivo suavizada (Target Encoding)
te_cols = ['propertyType','status','parking_status']
te = ce.TargetEncoder(cols=te_cols, smoothing=10)
df_model[te_cols] = te.fit_transform(df_model[te_cols], df_model['price_log'])


In [30]:
# BLOQUE 6: Splines y polinomios para capturar no linealidad
# 6.1 Spline sobre size y distancias
spline = SplineTransformer(degree=3, n_knots=5, include_bias=False)
spline_feats = spline.fit_transform(df_model[['size'] + [f'DistKm_{p}' for p in pois]])
spline_names = spline.get_feature_names_out(['size'] + [f'DistKm_{p}' for p in pois])
df_splines = pd.DataFrame(spline_feats, columns=spline_names, index=df_model.index)
df_model = pd.concat([df_model, df_splines], axis=1)

# 6.2 Interacciones de grado 2 (sin cuadrados)
num_base = ['size','rooms','bathrooms','floor','numPhotos',
            'geo_pca1','cluster_avg_logprice']
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(df_model[num_base])
poly_names = poly.get_feature_names_out(num_base)
df_poly = pd.DataFrame(X_poly, columns=poly_names, index=df_model.index)
# añadimos solo las interacciones nuevas
new_ints = [c for c in poly_names if '*' in c]
df_model = pd.concat([df_model, df_poly[new_ints]], axis=1)


In [31]:
# BLOQUE 7: Filtrado de outliers sobre price_log (1–99 percentil)
y = df_model['price_log']
low, high = y.quantile([0.01,0.99])
mask = y.between(low, high)
df_model = df_model[mask]
print("Registros tras filtrar outliers:", df_model.shape[0])


Registros tras filtrar outliers: 8309


In [32]:
# BLOQUE 8: Train/Test split y escalado
X = df_model.drop(columns=['price','price_log'])
y = df_model['price_log']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RNG
)

# Escalado solo numéricos
num_cols = X_train.select_dtypes(include=['int64','float64']).columns
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test [num_cols] = scaler.transform(X_test [num_cols])

print("Train/Test shapes:", X_train.shape, X_test.shape)


Train/Test shapes: (6647, 61) (1662, 61)


In [34]:
# BLOQUE 9: RandomizedSearchCV XGBoost con regularización avanzada
param_dist = {
    'n_estimators':     [100,200,500],
    'learning_rate':    [0.01,0.05,0.1],
    'max_depth':        [3,5,7],
    'subsample':        [0.7,0.9,1.0],
    'colsample_bytree': [0.7,0.9,1.0],
    'reg_alpha':        [0, 0.01, 0.1, 1],
    'reg_lambda':       [0.5, 1, 5, 10]
}

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=RNG,
    n_jobs=-1,
    tree_method='hist'      # más rápido en dataset grande
)

rand_search = RandomizedSearchCV(
    xgb_model, param_dist,
    n_iter=40, cv=5,
    scoring='neg_mean_absolute_error',
    random_state=RNG, n_jobs=-1, verbose=2,
    error_score='raise'
)
rand_search.fit(X_train, y_train)
best_model = rand_search.best_estimator_
print("Mejores parámetros:", rand_search.best_params_)


Fitting 5 folds for each of 40 candidates, totalling 200 fits
Mejores parámetros: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.1, 'colsample_bytree': 1.0}


In [35]:
# BLOQUE 10: Evaluación final y guardado del pipeline
# Predicción en escala original
y_pred_log = best_model.predict(X_test)
y_test_orig = np.expm1(y_test)
y_pred_orig = np.expm1(y_pred_log)

mae  = mean_absolute_error(y_test_orig, y_pred_orig)
rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred_orig))
r2   = r2_score(y_test_orig, y_pred_orig)

print(f"MAE   : {mae:,.2f} €")
print(f"RMSE  : {rmse:,.2f} €")
print(f"R²    : {r2:.4f}")

# Guardar scaler, encoder y modelo
joblib.dump({
    'scaler': scaler,
    'target_encoder': te,
    'xgb_model': best_model
}, '/content/drive/MyDrive/Dataset Idealista/pipeline_joblib.pkl')
print("Pipeline guardado en Drive.")


MAE   : 67,452.74 €
RMSE  : 125,739.32 €
R²    : 0.8595
Pipeline guardado en Drive.


In [44]:
# 2) Definir carpeta de destino
import os
save_dir = '/content/drive/MyDrive/ModelosIdealista'
os.makedirs(save_dir, exist_ok=True)

# 3) Serializar pipeline completo
import joblib

pipeline = {
    'scaler'        : scaler,               # StandardScaler ajustado
    'target_encoder': te,                   # TargetEncoder ajustado
    'xgb_model'     : best_model,           # XGBRegressor con mejores parámetros
    'feature_order' : X.columns.tolist(),   # Orden de columnas/variables del modelo
    'num_cols'      : num_cols.tolist(),    # Lista de columnas numéricas
    'te_cols'       : te_cols               # Lista de columnas categóricas target-encoded
}

joblib.dump(pipeline, os.path.join(save_dir, 'pipeline_idealista.joblib'))

print(f"✅ Pipeline exportado correctamente en: {save_dir}/pipeline_idealista.joblib")


✅ Pipeline exportado correctamente en: /content/drive/MyDrive/ModelosIdealista/pipeline_idealista.joblib
