In [None]:
!pip install -qq pandas numpy pyarrow

# Objetivo

El objetivo principal de este notebook es usar las funciones del módulo `transform.py` para preprocesar y transformar el conjunto de datos inmobiliarios de forma cuidadosa, asegurando que todas las variables relevantes queden en un formato limpio, consistente y analizable sin eliminar información valiosa (el dataset ya es pequeño de por sí, con alrededor de 4.5k filas). Esto mejora la calidad de los datos para los análisis posteriores y para la generación de comparables inmobiliarios.

Pasos de preprocesamiento:

1. Estandarización de columnas categóricas

2. Imputación de valores faltantes en variables numéricas

3. Manejo de columnas binarias

4. Tratamiento de outliers extremos (sin eliminarlos)

5. Conversión de columnas numéricas a tipos enteros

In [4]:
import pandas as pd


In [5]:
# Load the raw property listings CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/raw/cuahutemoc_properties.csv")

In [6]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# project package can be imported from this notebook.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, one level below the project root — confirm.
project_root = Path().resolve().parent
# Guard the append so re-running this cell does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [7]:
from dd360.transform import clean_property_data

In [8]:
# Run the imported cleaning routine on the raw frame; the result is bound to a
# new name, so the `df` name itself is not rebound by this cell.
df_clean = df.pipe(clean_property_data)

In [9]:
# Preview the first five rows of the cleaned frame (5 is head()'s default).
df_clean.head(n=5)

Unnamed: 0_level_0,listing_type,property_type,url_ad,price,terrain_surface,construction_surface,num_bathrooms,num_parking_lots,num_bedrooms,built_year,conservation_status,latitude,longitude,id_neighborhood,neighborhood,has_garden,has_gym
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
5ae456983ab717ec68e1ff2e0689c5d8,for-sale,apartment,https://rentahouse.com.mx/departamento_en_vent...,9348347.0,260.0,96.0,2,1,1,2024,0.98951,19.430419,-99.163761,11817,CUAUHTEMOC,1,0
fff8db1482a59a1c9093f9ab3ac50ddf,for-sale,apartment,https://www.easybroker.com/mx/listings/departa...,16200000.0,244.5,90.0,2,1,2,2025,0.981774,19.425804,-99.167429,11818,JUAREZ,0,1
7f0478d4f668aaf45ed3511523746fce,for-sale,apartment,https://www.vivanuncios.com.mx/a-venta-departa...,5400000.0,244.5,110.0,1,1,2,1999,0.916711,19.426753,-99.161845,11818,JUAREZ,0,0
4a0034cf7bfdd9b610bcb4a2e5096cf0,for-sale,apartment,https://www.clau.com/propiedades/3pm-departame...,6700000.0,194.5,79.0,2,0,2,2000,0.873144,19.422017,-99.164724,11819,ROMA NORTE,0,0
afdc70d7a521ba91cc026062653fe841,for-sale,apartment,https://degohouse.com.mx/apartamento-venta-non...,3778408.0,200.0,67.0,2,1,2,2019,0.98856,19.432608,-99.133208,11800,CENTRO,0,0


In [10]:
# Print the index range plus each column's non-null count and dtype, as a quick
# check of the cleaned frame's completeness and types.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4513 entries, 5ae456983ab717ec68e1ff2e0689c5d8 to 3cd6ac47327af734a867e646a8637e56
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   listing_type          4513 non-null   object 
 1   property_type         4513 non-null   object 
 2   url_ad                4513 non-null   object 
 3   price                 4513 non-null   float64
 4   terrain_surface       4513 non-null   float64
 5   construction_surface  4513 non-null   float64
 6   num_bathrooms         4513 non-null   int64  
 7   num_parking_lots      4513 non-null   int64  
 8   num_bedrooms          4513 non-null   int64  
 9   built_year            4513 non-null   int64  
 10  conservation_status   4513 non-null   float64
 11  latitude              4513 non-null   float64
 12  longitude             4513 non-null   float64
 13  id_neighborhood       4513 non-null   int64  
 14  neighborhood      

In [11]:
# Persist the cleaned frame for later analysis steps.
# reset_index() turns the index (property_id, judging by the head() preview)
# into a regular column, so it is still written even though index=False
# drops the index itself.
cleaned_path = Path("../data/interim/cleaned.parquet")
# Create the target folder if it is missing so a fresh checkout does not fail
# on write.
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.reset_index().to_parquet(cleaned_path, index=False)