In [1]:
import pandas as pd
import numpy as np
import os

# Layer directories, relative to the notebook's working directory.
RAW_PATH = "../data/raw"
SILVER_PATH = "../data/silver"
os.makedirs(SILVER_PATH, exist_ok=True)  # exist_ok: do not fail if the folder is already there

print("ðŸ“¥ Carregando camada Bronze (Raw)...")

# Bronze (raw) inputs. listings.csv is read with the default separator,
# reviews.csv with ';'. on_bad_lines='warn' asks pandas to report malformed
# rows as warnings instead of raising.
df_listings = pd.read_csv(
    f"{RAW_PATH}/listings.csv",
    quotechar='"',
    on_bad_lines='warn',
    low_memory=False,
)
df_reviews = pd.read_csv(
    f"{RAW_PATH}/reviews.csv",
    quotechar='"',
    on_bad_lines='warn',
    delimiter=';',
)

# Normalise header names (trim surrounding whitespace, lower-case) so later
# cells can address columns by a predictable name.
for bronze_frame in (df_listings, df_reviews):
    bronze_frame.columns = bronze_frame.columns.str.strip().str.lower()

print(f"Bronze Listings: {df_listings.shape}")
print(f"Bronze Reviews: {df_reviews.shape}")

ðŸ“¥ Carregando camada Bronze (Raw)...
Bronze Listings: (43068, 18)
Bronze Reviews: (268350, 6)


In [2]:
def clean_numeric_col(series):
    """Convert values to a numeric dtype, turning unparseable entries into NaN.

    Args:
        series: The values to convert (the notebook passes DataFrame columns).

    Returns:
        The result of ``pd.to_numeric(series, errors='coerce')``: parse
        failures do not raise, they become NaN.
    """
    coerced = pd.to_numeric(series, errors='coerce')
    return coerced


In [3]:
# --- 1. LISTINGS SANITIZATION ---
# Row count before any filtering; used for the removal report below.
rows_initial = len(df_listings)

# Parse prices with the shared helper (the same coercion applied to the ids
# below): anything that is not numeric becomes NaN and the row is discarded.
# Building the frame with assign() avoids assigning a column onto the result
# of a previous dropna().
df_listings = (
    df_listings
    .assign(clean_price=clean_numeric_col(df_listings['price']))
    .dropna(subset=['clean_price'])
)
print(f"[Listings] Removidos por preÃ§o invÃ¡lido/nulo: {rows_initial - len(df_listings)}")

# Same treatment for the listing id. Rows without a numeric id are dropped
# here; this second drop is NOT included in the count printed above.
df_listings = (
    df_listings
    .assign(clean_id=clean_numeric_col(df_listings['id']))
    .dropna(subset=['clean_id'])
    .astype({'clean_id': np.int64})  # final cast, done once the NaN rows are gone
)

[Listings] Removidos por preÃ§o invÃ¡lido/nulo: 4398


In [4]:
# --- 2. REVIEWS SANITIZATION ---
# Row count before filtering; used for the removal report at the end.
rows_reviews_initial = len(df_reviews)

# Coerce both keys to numbers, discard rows where either one failed to parse,
# then cast the surviving keys to 64-bit integers.
df_reviews = (
    df_reviews
    .assign(
        clean_id=clean_numeric_col(df_reviews['id']),
        clean_listing_id=clean_numeric_col(df_reviews['listing_id']),
    )
    .dropna(subset=['clean_id', 'clean_listing_id'])
    .astype({'clean_id': np.int64, 'clean_listing_id': np.int64})
)

print(f"[Reviews] Linhas sujas removidas: {rows_reviews_initial - len(df_reviews)}")

[Reviews] Linhas sujas removidas: 3789


In [5]:
print("\n Aplicando Common Data Model (RenomeaÃ§Ã£o)...")

# Source-to-target column mapping for listings (current name -> CDM name).
# Only the columns listed here are kept in the Silver table.
map_listings = {
    'clean_id': 'SK_LISTING',
    'name': 'NM_ANUNCIO',
    'host_id': 'SK_HOST',
    'neighbourhood': 'NM_BAIRRO',
    'latitude': 'NR_LATITUDE',
    'longitude': 'NR_LONGITUDE',
    'room_type': 'DS_TIPO_QUARTO',
    'clean_price': 'VLR_DIARIA_BRL',
    'minimum_nights': 'QTD_MIN_NOITES',
    'number_of_reviews': 'QTD_TOTAL_AVALIACOES',
    'availability_365': 'QTD_DIAS_DISPONIVEIS',
}

# Source-to-target column mapping for reviews.
map_reviews = {
    'clean_id': 'SK_REVIEW',
    'clean_listing_id': 'SK_LISTING',
    'reviewer_name': 'NM_REVIEWER',
    'comments': 'TXT_COMENTARIO',
    'date': 'DT_AVALIACAO',
}

# Pick the mapped source columns (in mapping order), then give them their
# CDM names.
df_silver_listings = df_listings[list(map_listings)].rename(columns=map_listings)
df_silver_reviews = df_reviews[list(map_reviews)].rename(columns=map_reviews)

print("âœ… Schema CDM Aplicado.")
print(f"Colunas Listings: {list(df_silver_listings.columns)}")


 Aplicando Common Data Model (RenomeaÃ§Ã£o)...
âœ… Schema CDM Aplicado.
Colunas Listings: ['SK_LISTING', 'NM_ANUNCIO', 'SK_HOST', 'NM_BAIRRO', 'NR_LATITUDE', 'NR_LONGITUDE', 'DS_TIPO_QUARTO', 'VLR_DIARIA_BRL', 'QTD_MIN_NOITES', 'QTD_TOTAL_AVALIACOES', 'QTD_DIAS_DISPONIVEIS']


In [6]:
# Write the treated tables as CSV files under SILVER_PATH, without the
# DataFrame index column.
for silver_frame, silver_file in (
    (df_silver_listings, "dim_listings.csv"),
    (df_silver_reviews, "fact_reviews.csv"),
):
    silver_frame.to_csv(f"{SILVER_PATH}/{silver_file}", index=False)

print("\nðŸ’¾ Dados tratados salvos em /data/silver/")


ðŸ’¾ Dados tratados salvos em /data/silver/
