In [2]:
import re
import unicodedata

import numpy as np
import pandas as pd

# Load the staging extract that the rest of the notebook cleans up
df = pd.read_csv(filepath_or_buffer="tmp/full_stg_extract_2024-11-30.csv")

In [3]:
# Several columns are not relevant for the ML stage, so they are removed here.
df.drop(
    columns=[
        "location",
        "created_at",
        "updated_at",
        "property_url",
        "address",
        "argenprop_code",
        "zonaprop_code",
    ],
    inplace=True,
)

In [4]:
# Normalize the raw price strings so that a later numeric conversion can parse them.
# Two textual shapes are handled, each with an explicit, anchored regex:
#   * a trailing ".0" (float-like text):   "1335750.0" -> "1335750"
#   * dots used as thousands separators:   "320.000"   -> "320000"
#                                          "1.050.000" -> "1050000"
# Passing regex=True explicitly keeps the result independent of the pandas-version
# default of `Series.str.replace`, and anchoring the patterns guarantees that a
# ".0" in the middle of a number is never stripped (which would silently change
# the magnitude of the price).
df["price"] = (
    df["price"]
    # Only a ".0" at the very end of the string is a float artifact
    .str.replace(r"\.0$", "", regex=True)
    # A dot is a thousands separator only when followed by exactly three digits
    # and then either another separator or the end of the string
    .str.replace(r"\.(?=\d{3}(?:\.|$))", "", regex=True)
)

In [5]:
# Cast the feature columns to numeric dtypes; values that cannot be parsed become NaN
numeric_columns = ["total_area", "rooms", "bedrooms", "bathrooms", "garages", "price", "antiquity"]
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors="coerce")

In [6]:
# Start filling in missing information.

# A property with no garage value is assumed to have no garage.
df["garages"] = df["garages"].fillna(0)

In [7]:
# Keep only the properties that have a price
df = df[df["price"].notna()]

In [8]:
# Keep only the properties that report their area in square meters
df = df[df["total_area"].notna()]

In [9]:
# Next we work on the number of bedrooms, bathrooms and rooms.
# Descriptions are lowercased so that substring searches are case-insensitive.
df["description"] = df["description"].str.lower()

# A listing whose description mentions "monoambiente" (studio) is assumed to have
# 1 bedroom, 1 room and 1 bathroom; only missing values are filled.
mentions_studio = df["description"].str.contains("monoambiente")
for column in ("bedrooms", "rooms", "bathrooms"):
    df.loc[mentions_studio & df[column].isna(), column] = 1

In [10]:
# Keep filling "rooms" based on the text of the description

def rooms_filler(description):
    """Infer a room count from a listing description.

    Looks for a number from 1 to 10 followed by one of the known room/bedroom
    wordings (e.g. "2 ambientes", "3amb", "1 dorm"). Counts are tried in
    ascending order, so the smallest matching count wins.

    Parameters
    ----------
    description : str or any
        Listing description. Callers in this notebook pass lowercased text.
        Non-string values (e.g. NaN for a missing description) are accepted.

    Returns
    -------
    int or None
        The matched room count, or None when nothing matches or the input is
        not a string.
    """
    # Missing descriptions arrive as float NaN; `in`/regex on them would raise
    if not isinstance(description, str):
        return None

    possible_rooms = range(1, 11)
    possible_descriptions = ("{} ambientes", "{} amb", "{} dormitorios",
                             "{} dorm", "{} ambiente", "{}amb", "{} dor", "{}dorm", "{}  ambientes")

    for i in possible_rooms:
        for j in possible_descriptions:
            # The negative lookbehind rejects matches that are the tail of a
            # larger number, so "12 amb" is not read as "2 amb"
            if re.search(r"(?<!\d)" + re.escape(j.format(i)), description):
                return i

    return None

# Only rows still missing "rooms" get a value inferred from their description
rooms_missing = df["rooms"].isna()
df.loc[rooms_missing, "rooms"] = df.loc[rooms_missing, "description"].apply(rooms_filler)

In [11]:
# Discard the properties that still have no room count after the steps above
df = df[df["rooms"].notna()]

In [12]:
# Show the frame as it stands after the row filtering above
df

Unnamed: 0,antiquity,bedrooms,expenses,garages,bathrooms,neighborhood,rooms,total_area,description,price
0,50.0,2.0,75000,0.0,,Nuñez,1.0,60.0,departamento en alquiler - 1 dormitorio 1 baño...,1335750.0
1,60.0,,30000,0.0,3.0,Barracas,2.0,360.0,alquiler 2 ambientes al frente - barracas,320000.0
2,60.0,2.0,52372,0.0,1.0,Saavedra,1.0,58.0,alquiler monoambiente con balcon saavedra caba.,350000.0
3,40.0,1.0,75000,0.0,,San Telmo,2.0,40.0,alquiler amplio 2 ambientes en san telmo,390000.0
5,15.0,,,0.0,1.0,Palermo,2.0,37.0,excelente 2 amb totalmente equipado en palermo...,873375.0
...,...,...,...,...,...,...,...,...,...,...
22590,,1.0,120.000,0.0,1.0,Palermo,2.0,45.0,excelente depto de dos ambientes al frente con...,500000.0
22591,,1.0,45.000,0.0,1.0,Caballito,1.0,25.0,monoambiente ubicado en el corazon de caballit...,350000.0
22592,,1.0,160.000,1.0,1.0,Puerto Madero,2.0,67.0,excelente departamento en el emprendimiento de...,1130250.0
22593,,4.0,550.000,1.0,2.0,Recoleta,7.0,172.0,alquiler semi amoblado con muy buena recepcion...,1541250.0


In [13]:
# Derive missing bathroom and bedroom counts

# When the bedroom count is missing, use the number of rooms minus 1.
# At this point every remaining record has a value in "rooms".
bedrooms_missing = df["bedrooms"].isna()
df.loc[bedrooms_missing, "bedrooms"] = df.loc[bedrooms_missing, "rooms"] - 1

# A result of 0 is treated as 1, since that would be a studio ("monoambiente")
has_zero_bedrooms = df["bedrooms"] == 0
df.loc[has_zero_bedrooms, "bedrooms"] = 1

In [14]:
def bathroom_converter(rooms):
    """Estimate the number of bathrooms from the number of rooms.

    1-3 rooms map to 1 bathroom, 4-6 rooms map to 2 bathrooms, and any other
    value maps to 3 bathrooms.
    """
    if rooms in (1, 2, 3):
        return 1
    if rooms in (4, 5, 6):
        return 2
    return 3
    
# Only rows still missing "bathrooms" get an estimate derived from "rooms"
bathrooms_missing = df["bathrooms"].isna()
df.loc[bathrooms_missing, "bathrooms"] = df.loc[bathrooms_missing, "rooms"].apply(bathroom_converter)

In [15]:
# For antiquity, a missing value is filled with the mean antiquity of the other
# properties in the same neighborhood, and the result is rounded up to the
# nearest whole number.
neighborhood_mean_antiquity = df.groupby("neighborhood")["antiquity"].transform("mean")
df["antiquity"] = np.ceil(df["antiquity"].fillna(neighborhood_mean_antiquity))

In [16]:
# Para muchos casos no tenemos antiguedad por promedio. Tomamos información de Internet y las rellenamos

avg_antiquity = {
    "Recoleta": 50,  # Historical, many buildings from early 20th century
    "Núñez": 40,  # Mix of older houses and newer developments
    "Palermo Hollywood": 30,  # Many mid-century and newer constructions
    "Puerto Madero": 20,  # Mostly new developments since the 1990s
    "Centro / Microcentro": 70,  # Historic center with older buildings
    "Las Cañitas": 40,  # Trendy area with a mix of old and new
    "Palermo Soho": 40,  # Similar to Hollywood, slightly older buildings
    "Monte Castro": 50,  # Traditional residential area
    "Almagro Norte": 60,  # Older residential area
    "Tribunales": 80,  # Historic legal and business district
    "San Nicolás": 70,  # Similar to Microcentro
    "Monserrat": 80,  # One of the oldest neighborhoods
    "Belgrano R": 50,  # Residential, mix of old and newer homes
    "Palermo Nuevo": 30,  # Newer part of Palermo
    "Palermo Chico": 40,  # Upscale, many mid-century properties
    "Belgrano Chico": 40,  # Similar to Palermo Chico
    "Palermo Viejo": 50,  # Older buildings, many renovated
    "Retiro": 70,  # Historic with some modern developments
    "La Paternal": 50,  # Older residential neighborhood
    "Caballito Norte": 60,  # Older family homes
    "Belgrano C": 50,  # Similar to Belgrano R
    "Caballito Sur": 60,  # Same as Norte
    "Parque Rivadavia": 60,  # Older buildings near park
    "Villa Pueyrredón": 50,  # Traditional middle-class area
    "Floresta Sur": 60,  # Mix of old houses and mid-century buildings
    "Primera Junta": 60,  # Similar to Caballito
    "Cid Campeador": 60,  # Similar to surrounding areas
    "Constitución": 80,  # Old and densely built
    "Botánico": 40,  # Around the gardens, mix of styles
    "Lomas de Núñez": 30,  # Newer developments
    "Distrito Quartier": 20,  # Newly developed
    "Temperley": 70,  # Older suburb
    "Flores Sur": 60,  # Similar to Floresta
    "Almagro Sur": 60,  # Similar to Norte
    "Flores Norte": 60,  # Same as Sur
    "La Boca": 80,  # Historic with some modern projects
    "Parque Chas": 50,  # Traditional middle-class area
    "Floresta Norte": 60,  # Similar to Sur
    "Agronomía": 50,  # Near university, mix of styles
    "Otro": 50,  # Placeholder for undefined neighborhoods
    "Puerto Retiro": 70,  # Near historic Retiro
    "Barrio Parque": 40,  # Upscale, mid-century
    "Barrio Chino": 30,  # Newer commercial developments
    "Naón": 50,  # Mix of older homes
    "Parque Avellaneda": 60,  # Similar to Parque Chas
    "Catalinas": 20,  # Modern skyscrapers
    "Los Perales": 50,  # Traditional residential
    "Villa Riachuelo": 50,  # Outlying older area
    "Barrio Parque General Belgrano": 50  # Older homes, quieter
}

def antiquity_filler(neighborhood):
    return avg_antiquity.get(neighborhood, 50)

# Rows that still have no antiquity fall back to the per-neighborhood table
antiquity_missing = df["antiquity"].isna()
df.loc[antiquity_missing, "antiquity"] = df.loc[antiquity_missing, "neighborhood"].apply(antiquity_filler)

In [17]:
# Remove the "expenses" column, since there is no good use for it at the moment
df.drop(labels="expenses", axis="columns", inplace=True)

In [18]:
# Remove the description, since the model will not use it
df.drop(labels="description", axis="columns", inplace=True)

In [19]:
# Write the cleaned data to a CSV file for the next step of the pipeline
df.to_csv(path_or_buf="tmp/full_stg_extract_2024-11-30_cleaned.csv", index=False)