name: rellenar con no_name
host_name: rellenar con unknown_host
neighbourhood_group: eliminar registros con nulos
neighbourhood: eliminar registros con nulos
lat, long: eliminar registros con nulos
cancellation_policy: rellenar con not_specified
construction_year: rellenar con 2012
price: rellenar con 624.0
service_fee: rellenar con 125.0
minimum_nights: rellenar con 3
number_of_reviews: imputar con 0 
last_review: eliminar si number_of_reviews == 0.
review_rate_number: eliminar si number_of_reviews == 0.
reviews_per_month: imputar con 0 si no hay reviews
calculated_host_listings_count: imputar con la mediana, que es igual a 1
availability_365: imputar con la mediana, que es igual a 96.0
host_verification: rellenar con Not Provided
instant_bookable_flag: rellenar con False

In [None]:
import pandas as pd
import os
import logging
import numpy as np
from rapidfuzz import process, fuzz

# Configure the root logger once: INFO level, timestamped records, written
# both to the file "clean_airbnb_data.log" and to a stream handler.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [logging.FileHandler("clean_airbnb_data.log"), logging.StreamHandler()]
logging.basicConfig(handlers=_log_handlers, format=_LOG_FORMAT, level=logging.INFO)

logging.info("Inicio de la extración de datos de AirBnB.")

# Locate the directory this code runs from. __file__ is not defined in an
# interactive session, in which case the current working directory is used.
try:
    _this_file = os.path.abspath(__file__)
except NameError:
    SCRIPT_DIR = os.getcwd()
else:
    SCRIPT_DIR = os.path.dirname(_this_file)

# The project root is taken to be two directory levels above SCRIPT_DIR.
_script_parent = os.path.dirname(SCRIPT_DIR)
BASE_DIR = os.path.dirname(_script_parent)

# Input (raw) and output (clean) locations under <BASE_DIR>/data.
CSV_FILE_NAME = 'Airbnb_Open_Data.csv'
RAW_DATA_DIR = os.path.join(BASE_DIR, 'data', 'raw')
OUTPUT_CLEANED_DATA_DIR = os.path.join(BASE_DIR, 'data', 'clean')
CSV_FILE_PATH = os.path.join(RAW_DATA_DIR, CSV_FILE_NAME)

logging.info(f"Ruta del archivo CSV construida con os: {CSV_FILE_PATH}")

# Start from an empty frame so df_airbnb is always bound, then load the raw
# CSV. A missing file and any other load failure both propagate to the caller;
# only the latter is logged in the except handler.
df_airbnb = pd.DataFrame()
logging.info("DataFrame 'df_airbnb' predefinido como un DataFrame vacío.")

try:
    logging.info(f"Intentando cargar el archivo CSV: {CSV_FILE_PATH}")
    csv_is_present = os.path.exists(CSV_FILE_PATH)
    if not csv_is_present:
        logging.error(f"Error: Archivo CSV no encontrado en '{CSV_FILE_PATH}'")
        raise FileNotFoundError(f"Archivo no encontrado: {CSV_FILE_PATH}")
    df_airbnb = pd.read_csv(CSV_FILE_PATH, low_memory=False)
    n_rows, n_cols = df_airbnb.shape[0], df_airbnb.shape[1]
    logging.info(f"Archivo CSV '{CSV_FILE_PATH}' cargado exitosamente.")
    logging.info(f"El DataFrame original tiene {n_rows} filas y {n_cols} columnas.")
except FileNotFoundError:
    # Already reported above when raised by the existence check; re-raise as is.
    raise
except Exception as e:
    logging.error(f"Ocurrió un error al cargar el CSV '{CSV_FILE_PATH}': {e}")
    raise

# Report how many fully duplicated rows the raw frame contains. The rows are
# only counted here, not removed. An empty frame is logged as critical, but
# nothing in this statement stops execution.
if df_airbnb.empty:
    logging.critical("El DataFrame df_airbnb está vacío después de la carga. Terminando el script.")
else:
    logging.info("Verificando filas duplicadas en df_airbnb.")
    duplicated_mask = df_airbnb.duplicated()
    num_duplicados = duplicated_mask.sum()
    logging.info(f"Número de filas duplicadas encontradas en df_airbnb: {num_duplicados}")

# --- Preliminary cleaning and data type conversion (refactored) ---
logging.info("Iniciando limpieza preliminar y conversión de tipos de datos (versión optimizada).")
df_cleaned = pd.DataFrame()

if not df_airbnb.empty:
    # Work on a copy so the raw frame stays untouched.
    df_cleaned = df_airbnb.copy()
    logging.info("Copia de df_airbnb creada como df_cleaned.")

    # Normalise column labels in three steps: lower-case, spaces to
    # underscores, then drop every character that is not a letter, digit or
    # underscore.
    original_columns = df_cleaned.columns.tolist()
    lowered_labels = df_cleaned.columns.str.lower()
    underscored_labels = lowered_labels.str.replace(' ', '_', regex=False)
    df_cleaned.columns = underscored_labels.str.replace('[^0-9a-zA-Z_]', '', regex=True)
    new_columns = df_cleaned.columns.tolist()
    logging.info("Columnas de df_cleaned normalizadas.")
    if original_columns == new_columns:
        logging.info("Nombres de columnas ya estaban normalizados o no requirieron cambios significativos.")
    else:
        logging.info(f"Cambios en nombres de columnas: {dict(zip(original_columns, new_columns))}")

    def clean_string_column(series, col_name):
        """Return *series* as stripped strings with null-like tokens set to ``pd.NA``.

        Every value is cast to ``str`` and stripped of surrounding whitespace.
        Entries that are then exactly ``'nan'``, ``''`` or ``'None'`` are
        replaced by ``pd.NA``.

        Args:
            series: pandas Series to clean.
            col_name: column label, used only in the debug log message.

        Returns:
            The cleaned Series.
        """
        logging.debug(f"Limpiando columna string: {col_name}")
        null_tokens = dict.fromkeys(('nan', '', 'None'), pd.NA)
        stripped = series.astype(str).str.strip()
        return stripped.replace(null_tokens)

    def to_numeric_column(series, col_name, numeric_type='Int64'):
        """Coerce *series* to a numeric or datetime dtype, logging values lost to coercion.

        Args:
            series: pandas Series to convert.
            col_name: column label, used only in log messages.
            numeric_type: ``'datetime'`` parses the values with the
                ``%m/%d/%Y`` format. ``'Int64'`` converts to numeric and then
                to the nullable ``Int64`` dtype when every non-null value is
                integral (otherwise the numeric result is kept and a warning
                is logged). Any other value keeps the ``pd.to_numeric``
                result unchanged.

        Returns:
            The converted Series; entries that cannot be parsed become
            NaN/NaT.
        """
        logging.debug(f"Convirtiendo columna a numérica ({numeric_type}): {col_name}")
        nulls_before = series.isna().sum()
        if numeric_type == 'datetime':
            series = pd.to_datetime(series, format='%m/%d/%Y', errors='coerce')
        else:
            series = pd.to_numeric(series, errors='coerce')
            if numeric_type == 'Int64' and not series.empty:
                non_null = series.dropna()
                # Vectorised integrality test. Unlike calling x.is_integer()
                # per element, this works for integer-dtype columns on every
                # Python version and avoids a Python-level loop over the rows.
                if non_null.empty or (non_null % 1 == 0).all():
                    series = series.astype('Int64')
                else:
                    logging.warning(f"Columna '{col_name}' contiene flotantes, no se convertirá a Int64, se mantendrá como float.")

        # Nulls that were not present in the input were introduced by coercion.
        coerced_nulls = series.isna().sum() - nulls_before
        if coerced_nulls > 0:
            logging.warning(f"Columna '{col_name}': {coerced_nulls} nuevos NaNs/NaTs por coerción.")
        return series

    def standardize_categorical_fuzz(series, col_name, choices_list, score_cutoff=85):
        """Map each distinct value of *series* to its best fuzzy match in *choices_list*.

        Each non-null unique value is compared, as a string, against
        ``choices_list`` with RapidFuzz's ``WRatio`` scorer. When
        ``process.extractOne`` returns a match at or above ``score_cutoff``
        the value is replaced by the matched choice; otherwise it is kept
        unchanged. Null entries stay null.

        Args:
            series: pandas Series holding the raw categorical values.
            col_name: column label, used only in log messages.
            choices_list: canonical values to match against.
            score_cutoff: minimum score passed to ``process.extractOne``.

        Returns:
            A new Series with the replacements applied.
        """
        logging.debug(f"Estandarizando columna categórica con RapidFuzz: {col_name}")

        # Resolve every distinct value once, then apply the lookup to the column.
        replacements = {}
        for raw_value in series.dropna().unique():
            best = process.extractOne(str(raw_value), choices_list, scorer=fuzz.WRatio, score_cutoff=score_cutoff)
            replacements[raw_value] = best[0] if best else raw_value

        was_null = series.isna()
        series_mapped = series.map(replacements)
        series_mapped[was_null] = pd.NA

        n_changed = (series.dropna() != series_mapped.dropna()).sum()
        if n_changed > 0:
            logging.info(f"Columna '{col_name}': {n_changed} valores estandarizados usando RapidFuzz.")
        return series_mapped

    # Type conversion, categorical standardisation, column pruning and null
    # handling all run inside one try block. A KeyError or any other exception
    # is logged by the handlers at the bottom and NOT re-raised, so execution
    # continues afterwards with df_cleaned in whatever state it had reached.
    try:
        # Columns passed through to_numeric_column with the 'Int64' target.
        numeric_cols_int = ['construction_year', 'minimum_nights', 'number_of_reviews', 'review_rate_number', 
                            'calculated_host_listings_count', 'availability_365']
        
        for col in numeric_cols_int:
            if col in df_cleaned.columns:
                df_cleaned[col] = to_numeric_column(df_cleaned[col], col, 'Int64')
            else: logging.warning(f"Columna '{col}' no encontrada para conversión numérica (Int64).")

        # Columns passed through to_numeric_column with the 'float' target.
        numeric_cols_float = ['lat', 'long', 'reviews_per_month']
        for col in numeric_cols_float:
            if col in df_cleaned.columns:
                df_cleaned[col] = to_numeric_column(df_cleaned[col], col, 'float')
            else: logging.warning(f"Columna '{col}' no encontrada para conversión numérica (float).")

        # Overwrite 'id' and 'host_id' with values derived from the row position
        # (1-based and 150000-based respectively).
        # NOTE(review): this makes host_id unique per row, so listings no longer
        # share a host_id - confirm this is intended.
        df_cleaned = df_cleaned.reset_index(drop=True)
        df_cleaned['id'] = df_cleaned.index + 1
        # NOTE(review): the index was already reset two lines above, so this
        # second reset_index leaves it unchanged.
        df_cleaned = df_cleaned.reset_index(drop=True)
        df_cleaned['host_id'] = df_cleaned.index + 150000
        
        # Free-text columns: strip whitespace and turn null-like tokens into NA.
        string_cols = ['name', 'host_name']
        for col in string_cols:
            if col in df_cleaned.columns:
                df_cleaned[col] = clean_string_column(df_cleaned[col], col)
            else: logging.warning(f"Columna '{col}' no encontrada para limpieza de string.")

        # Location columns: clean as strings; 'neighbourhood_group' is also
        # fuzzy-matched against the canonical list below.
        categorical_cols_pre_fuzz = ['neighbourhood_group', 'neighbourhood']
        for col in categorical_cols_pre_fuzz:
            if col in df_cleaned.columns:
                df_cleaned[col] = clean_string_column(df_cleaned[col], col)
                if col == 'neighbourhood_group' and col in df_cleaned.columns:
                    canonical_groups = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island']
                    if not df_cleaned[col].dropna().empty:
                       df_cleaned[col] = standardize_categorical_fuzz(df_cleaned[col], col, canonical_groups, score_cutoff=80)
                       logging.info(f"RapidFuzz aplicado a '{col}'.")
                    else:
                       logging.info(f"Columna '{col}' está vacía o solo nulos, RapidFuzz no aplicado.")

                # Convert to the category dtype only when the number of distinct
                # values (nulls included) is below the per-column threshold.
                nunique_threshold = 50 if col == 'neighbourhood' else 20 
                if col in df_cleaned.columns and df_cleaned[col].nunique(dropna=False) < nunique_threshold:
                    df_cleaned[col] = df_cleaned[col].astype('category')
                    logging.info(f"Columna '{col}' convertida a category.")
                elif col in df_cleaned.columns:
                    logging.info(f"Columna '{col}' limpiada (no convertida a category debido a alta cardinalidad: {df_cleaned[col].nunique(dropna=False)}).")

            else: logging.warning(f"Columna '{col}' no encontrada para limpieza categórica.")

        # Columns converted to category unconditionally after string cleaning.
        category_cols_direct = ['cancellation_policy', 'room_type']
        for col in category_cols_direct:
            if col in df_cleaned.columns:
                df_cleaned[col] = clean_string_column(df_cleaned[col], col).astype('category')
                logging.info(f"Columna '{col}' convertida a category.")
            else: logging.warning(f"Columna '{col}' no encontrada para conversión a category.")
        
        # Dates
        if 'last_review' in df_cleaned.columns:
            df_cleaned['last_review'] = to_numeric_column(df_cleaned['last_review'], 'last_review', 'datetime')
            logging.info("Columna 'last_review' convertida a datetime.")
        else: logging.warning("Columna 'last_review' no encontrada.")

        # Booleans (via mapping). Values absent from the mapping become NA.
        if 'host_identity_verified' in df_cleaned.columns:
            verified_map = {'verified': True, 'unconfirmed': False}
            df_cleaned['host_verification'] = df_cleaned['host_identity_verified'].map(verified_map).astype('boolean')
            logging.info("Columnas 'host_verification', creada a partir de 'host_identity_verified'.")
        else: logging.warning("Columna 'host_identity_verified' no encontrada.")

        if 'instant_bookable' in df_cleaned.columns:
            # Values are upper-cased before the lookup, so only the 'TRUE' and
            # 'FALSE' keys of this mapping can ever match.
            bookable_map = {'TRUE': True, 'FALSE': False, 'True': True, 'False': False, 'true': True, 'false': False} 
            df_cleaned['instant_bookable_flag'] = df_cleaned['instant_bookable'].astype(str).str.upper().map(bookable_map).astype('boolean')
            logging.info("Columna 'instant_bookable_flag' creada a partir de 'instant_bookable'.")
        else: logging.warning("Columna 'instant_bookable' no encontrada.")

        # Currency columns: drop '$' and thousands separators, then convert to
        # numeric. Source and target names are identical here, so each column
        # is overwritten in place.
        currency_cols = {'price': 'price', 'service_fee': 'service_fee'}
        for original_col, new_col_numeric in currency_cols.items():
            if original_col in df_cleaned.columns:
                series_cleaned_str = df_cleaned[original_col].astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False).str.strip().replace({'nan': pd.NA, '': pd.NA})
                df_cleaned[new_col_numeric] = to_numeric_column(series_cleaned_str, new_col_numeric, 'float')
                logging.info(f"Columna '{new_col_numeric}' creada a partir de '{original_col}'.")
            else: logging.warning(f"Columna original '{original_col}' no encontrada para procesar moneda.")

        # Columns removed from the cleaned frame when present.
        cols_to_drop_original_names = ['host_identity_verified', 'instant_bookable', 'country', 'country_code', 'license', 'house_rules']
        
        normalized_cols_to_drop = []
        for col_name in cols_to_drop_original_names:
            # NOTE(review): str.replace is a literal replacement, so the
            # bracketed pattern below is never matched; only the lower-casing
            # and the space-to-underscore step have any effect.
            normalized_name = col_name.lower().replace(' ', '_').replace('[^0-9a-zA-Z_]', '')
            if normalized_name in df_cleaned.columns:
                normalized_cols_to_drop.append(normalized_name)
            elif col_name in df_cleaned.columns:
                normalized_cols_to_drop.append(col_name)

        existing_cols_to_drop = [col for col in normalized_cols_to_drop if col in df_cleaned.columns]
        if existing_cols_to_drop:
            df_cleaned.drop(columns=existing_cols_to_drop, inplace=True, errors='ignore')
            logging.info(f"Columnas {existing_cols_to_drop} eliminadas de df_cleaned.")
        
        # --- Imputation and removal of nulls according to custom rules ---
        logging.info("Iniciando imputación y eliminación de valores nulos según reglas específicas.")

        # 1. Fill simple string columns.
        # These two columns are accessed without an existence check; a missing
        # column raises KeyError, which the handler below logs.
        df_cleaned['name'] = df_cleaned['name'].fillna("no_name")
        df_cleaned['host_name'] = df_cleaned['host_name'].fillna("unknown_host")
        if 'cancellation_policy' in df_cleaned.columns:
            if not isinstance(df_cleaned['cancellation_policy'].dtype, pd.CategoricalDtype):
                df_cleaned['cancellation_policy'] = df_cleaned['cancellation_policy'].astype('category')

            # The fill value must be a known category before fillna is called.
            if "moderate" not in df_cleaned['cancellation_policy'].cat.categories:
                df_cleaned['cancellation_policy'] = df_cleaned['cancellation_policy'].cat.add_categories(["moderate"])

            df_cleaned['cancellation_policy'] = df_cleaned['cancellation_policy'].fillna("moderate")
            logging.info("Columna 'cancellation_policy' imputada con la moda: 'moderate'.")


        # 2. Drop rows with nulls in critical fields
        before_drop = df_cleaned.shape[0]
        df_cleaned = df_cleaned.dropna(subset=[
            'neighbourhood_group', 'neighbourhood', 'lat', 'long'
        ])
        logging.info(f"Filas eliminadas por nulos en neighbourhood_group, neighbourhood, lat o long: {before_drop - df_cleaned.shape[0]}")

        # 3. Simple numeric imputations (hard-coded fill values)
        df_cleaned['construction_year'] = df_cleaned['construction_year'].fillna(2012).astype('Int64')
        df_cleaned['price'] = df_cleaned['price'].fillna(624.0)
        df_cleaned['service_fee'] = df_cleaned['service_fee'].fillna(125.0)
        df_cleaned['minimum_nights'] = df_cleaned['minimum_nights'].fillna(3).astype('Int64')
        df_cleaned['number_of_reviews'] = df_cleaned['number_of_reviews'].fillna(0).astype('Int64')
        df_cleaned['reviews_per_month'] = df_cleaned['reviews_per_month'].fillna(0.0)

        # 4. Drop rows where number_of_reviews == 0 and both review fields are null
        before_conditional_drop = df_cleaned.shape[0]
        df_cleaned = df_cleaned[~((df_cleaned['number_of_reviews'] == 0) &
                                  (df_cleaned['last_review'].isna()) &
                                  (df_cleaned['review_rate_number'].isna()))]
        
        
        logging.info(f"Filas eliminadas por no tener reseñas y tener last_review y review_rate_number nulos: {before_conditional_drop - df_cleaned.shape[0]}")

        # 5. Median imputations
        # NOTE(review): the fill values are hard-coded constants, presumably
        # medians computed beforehand - confirm against the data.
        df_cleaned['calculated_host_listings_count'] = df_cleaned['calculated_host_listings_count'].fillna(1).astype('Int64')
        df_cleaned['availability_365'] = df_cleaned['availability_365'].fillna(96.0).astype('Int64')

        # 6. Booleans
        df_cleaned['host_verification'] = df_cleaned['host_verification'].fillna(False)
        df_cleaned['instant_bookable_flag'] = df_cleaned['instant_bookable_flag'].fillna(False)

        logging.info("Imputación y limpieza de nulos completada.")

        # --- Additional specific imputations and removals ---
        logging.info("Aplicando reglas adicionales para 'last_review' y 'review_rate_number'.")

        # 1. Fill 'last_review' with a placeholder date
        # NOTE(review): the date is presumably chosen to sit near the upper
        # bound of pandas' nanosecond timestamp range - confirm.
        if 'last_review' in df_cleaned.columns:
            fecha_ficticia = pd.Timestamp("2262-04-11 00:00:00")
            df_cleaned['last_review'] = df_cleaned['last_review'].fillna(fecha_ficticia)
            logging.info("Valores nulos en 'last_review' imputados con '2262-04-11 00:00:00'.")

        # 2. Drop rows whose 'review_rate_number' is null
        if 'review_rate_number' in df_cleaned.columns:
            before_drop = df_cleaned.shape[0]
            df_cleaned = df_cleaned.dropna(subset=['review_rate_number'])
            logging.info(f"Filas eliminadas por nulos en 'review_rate_number': {before_drop - df_cleaned.shape[0]}")

        logging.info("Proceso de limpieza preliminar y conversión de tipos optimizado completado.")

    except KeyError as ke:
        # A referenced column was missing; log it together with the columns
        # that are available. The exception is not re-raised.
        logging.error(f"Ocurrió un KeyError durante la limpieza: '{ke}'. Verifica que la columna exista en df_cleaned (posiblemente después de la normalización).")
        logging.error(f"Columnas disponibles en df_cleaned: {df_cleaned.columns.tolist()}")
        logging.error(f"Ocurrió un KeyError: '{ke}'. Revisa los nombres de las columnas y la lógica de normalización.")
    except Exception as e:
        # Any other failure is logged (the first record includes the traceback)
        # and then swallowed.
        logging.error(f"Ocurrió un error general durante la limpieza: {e}", exc_info=True)
        logging.error(f"Ocurrió un error general: {e}")

else:
    logging.warning("El DataFrame df_airbnb está vacío. No se puede realizar la limpieza.")

logging.info("Proceso finalizado exitosamente.")

2025-05-18 16:39:17,552 - INFO - Inicio de la extración de datos de AirBnB.


2025-05-18 16:39:17,553 - INFO - Ruta del archivo CSV construida con os: /home/jacobo/Proyecto_ETL/data/raw/Airbnb_Open_Data.csv
2025-05-18 16:39:17,563 - INFO - DataFrame 'df_airbnb' predefinido como un DataFrame vacío.
2025-05-18 16:39:17,565 - INFO - Intentando cargar el archivo CSV: /home/jacobo/Proyecto_ETL/data/raw/Airbnb_Open_Data.csv
2025-05-18 16:39:18,143 - INFO - Archivo CSV '/home/jacobo/Proyecto_ETL/data/raw/Airbnb_Open_Data.csv' cargado exitosamente.
2025-05-18 16:39:18,144 - INFO - El DataFrame original tiene 102599 filas y 26 columnas.
2025-05-18 16:39:18,145 - INFO - Verificando filas duplicadas en df_airbnb.
2025-05-18 16:39:18,257 - INFO - Número de filas duplicadas encontradas en df_airbnb: 541
2025-05-18 16:39:18,258 - INFO - Iniciando limpieza preliminar y conversión de tipos de datos (versión optimizada).
2025-05-18 16:39:18,269 - INFO - Copia de df_airbnb creada como df_cleaned.
2025-05-18 16:39:18,271 - INFO - Columnas de df_cleaned normalizadas.
2025-05-18 16:

In [13]:
# Print the column dtypes and non-null counts of the cleaned frame.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 102222 entries, 0 to 102598
Data columns (total 22 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   id                              102222 non-null  int64         
 1   name                            102222 non-null  object        
 2   host_id                         102222 non-null  int64         
 3   host_name                       102222 non-null  object        
 4   neighbourhood_group             102222 non-null  category      
 5   neighbourhood                   102222 non-null  object        
 6   lat                             102222 non-null  float64       
 7   long                            102222 non-null  float64       
 8   cancellation_policy             102222 non-null  category      
 9   room_type                       102222 non-null  category      
 10  construction_year               102222 non-null  Int64       

In [14]:
# Show the first rows of the cleaned frame for a visual sanity check.
df_cleaned.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,lat,long,cancellation_policy,room_type,...,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,host_verification,instant_bookable_flag
0,1,Clean & quiet apt home by the park,150000,Madaline,Brooklyn,Kensington,40.64749,-73.97237,strict,Private room,...,193.0,10,9,2021-10-19,0.21,4,6,286,False,False
1,2,Skylit Midtown Castle,150001,Jenna,Manhattan,Midtown,40.75362,-73.98377,moderate,Entire home/apt,...,28.0,30,45,2022-05-21,0.38,4,2,228,True,False
2,3,THE VILLAGE OF HARLEM....NEW YORK !,150002,Elise,Manhattan,Harlem,40.80902,-73.9419,flexible,Private room,...,124.0,3,0,2262-04-11,0.0,5,1,352,False,True
3,4,no_name,150003,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,moderate,Entire home/apt,...,74.0,30,270,2019-07-05,4.64,4,1,322,False,True
4,5,Entire Apt: Spacious Studio/Loft by central park,150004,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,moderate,Entire home/apt,...,41.0,10,9,2018-11-19,0.1,3,1,289,True,False


In [15]:
# Print the number of null values remaining in each column of df_cleaned.
print("Valores nulos en df_cleaned:")
print(df_cleaned.isnull().sum())

Valores nulos en df_cleaned:
id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
lat                               0
long                              0
cancellation_policy               0
room_type                         0
construction_year                 0
price                             0
service_fee                       0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
review_rate_number                0
calculated_host_listings_count    0
availability_365                  0
host_verification                 0
instant_bookable_flag             0
dtype: int64


In [17]:
# Print the number of rows left in df_cleaned.
print(f"df_cleaned tiene {df_cleaned.shape[0]} registros.")

df_cleaned tiene 102222 registros.


In [16]:
# Persist the cleaned dataset under the dedicated clean-data directory
# (OUTPUT_CLEANED_DATA_DIR) instead of next to the raw input, so raw and
# processed files stay separated. The directory is created when missing.
os.makedirs(OUTPUT_CLEANED_DATA_DIR, exist_ok=True)
OUTPUT_CSV_PATH = os.path.join(OUTPUT_CLEANED_DATA_DIR, 'Airbnb_Open_Data_cleaned.csv')
# index=False: the row index is not written as a column.
df_cleaned.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"DataFrame limpio guardado en: {OUTPUT_CSV_PATH}")