In [15]:
import os
import re
import json
import time
import pandas as pd
from openai import OpenAI
from textblob import TextBlob

# --- Configuration ----------------------------------------------------------
# Input (silver) and output (gold) folders, relative to the notebook location.
BASE_PATH = "../data/silver"
GOLD_PATH = "../data/gold"
# Total number of samples desired.
NUM_SAMPLES = 1000

# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Create the output folder up front so the final export cannot fail on it.
os.makedirs(GOLD_PATH, exist_ok=True)

# --- Load the silver tables -------------------------------------------------
try:
    df_listings = pd.read_csv(
        f"{BASE_PATH}/silver_dim_listings.csv",
        quotechar='"',
        on_bad_lines='warn',
        low_memory=False,
    )
    df_reviews = pd.read_csv(
        f"{BASE_PATH}/silver_fact_reviews.csv",
        quotechar='"',
        on_bad_lines='warn',
    )
    print(f"‚úÖ Listings carregado: {df_listings.shape}")
    print(f"‚úÖ Reviews carregado: {df_reviews.shape}")
except FileNotFoundError as e:
    # Tell the user where the CSVs are expected, then let the error propagate.
    print(f"‚ùå Erro: Arquivo n√£o encontrado em {BASE_PATH}. Verifique se moveu os CSVs para a pasta data/silver!")
    raise e

‚úÖ Listings carregado: (38670, 11)
‚úÖ Reviews carregado: (264561, 5)


In [16]:
def classify_review_gpt(comentario: str):
    """Classify one Airbnb review with the OpenAI Responses API.

    Embeds ``comentario`` in a fixed Portuguese prompt, sends it to
    ``gpt-4.1-mini`` through the module-level ``client`` and parses the
    ``{...}`` span of the reply as JSON.

    Args:
        comentario: Review text to classify.

    Returns:
        dict: The JSON object parsed from the model reply (the prompt asks
        for the keys ``sentimento``, ``topico_principal``, ``sub_topico``
        and ``tom_de_urgencia``). If the API call fails, the reply holds no
        JSON object, or the JSON is invalid, those four keys are returned
        as ``None`` together with an ``erro`` key carrying the failure
        message, so a single bad review does not abort a batch run.
    """
    prompt = f"""
    Analise o seguinte review de Airbnb:

    "{comentario}"

    Formato obrigat√≥rio:
    {{
    "sentimento": "Positivo" | "Neutro" | "Negativo",
    "topico_principal": "Limpeza" | "Localiza√ß√£o" | "Check-in" | "Outro",
    "sub_topico": "Conforto" | "Comunica√ß√£o" | "Valor" | "Comodidades" | "Outro",
    "tom_de_urgencia": true | false
    }}
    As chaves sentimento, topico_principal e sub_topico devem ser √∫nicas n√£o podendo ser multicategorias (ex: Limpeza | Localiza√ß√£o).
    """

    try:
        response = client.responses.create(
            model="gpt-4.1-mini",
            input=prompt,
        )
        raw_output = response.output[0].content[0].text.strip()
        # Greedy + DOTALL: capture from the first "{" to the last "}" so any
        # text the model adds around the JSON object is ignored.
        match = re.search(r"\{.*\}", raw_output, re.DOTALL)
        if match is None:
            raise ValueError("Resposta do modelo sem objeto JSON")
        return json.loads(match.group())
    except Exception as e:
        # Every failure (API error, missing JSON, malformed JSON) is reported
        # through the same fallback record instead of raising.
        return {
            "sentimento": None,
            "topico_principal": None,
            "sub_topico": None,
            "tom_de_urgencia": None,
            "erro": str(e),
        }


In [17]:
def classify_listing_gpt(text: str):
    """Classify one listing title with the OpenAI Responses API.

    Embeds ``text`` in a fixed Portuguese prompt, sends it to
    ``gpt-4.1-mini`` through the module-level ``client`` and parses the
    ``{...}`` span of the reply as JSON.

    Args:
        text: Listing title to classify.

    Returns:
        dict: The JSON object parsed from the model reply (the prompt asks
        for the keys ``categoria_vibe``, ``tipo_vista``, ``ponto_forte`` and
        ``principal_caracteristica``). If the API call fails, the reply
        holds no JSON object, or the JSON is invalid, those four keys are
        returned as ``None`` together with an ``erro`` key carrying the
        failure message, so a single bad title does not abort a batch run.
    """
    prompt = f"""
    Atue como um especialista em Real Estate no Rio de Janeiro.
    Analise o t√≠tulo do an√∫ncio: "{text}".
 
    Retorne APENAS um JSON v√°lido com esta estrutura exata:
    {{
        "categoria_vibe": "Luxo | Econ√¥mico | Familiar | Rom√¢ntico | Moderno | Padr√£o",
        "tipo_vista": "Mar | Natureza | Urbana | Sem Vista",
        "ponto_forte": "Uma frase curta de 3 palavras resumindo o destaque",
        "principal_caracteristica": "Unica palavra destacando o im√≥vel"
    }}
    As chaves categoria_vibe e tipo_vista devem ser √∫nicas n√£o podendo ser multicategorias (ex: Luxo | Econ√¥mico).
    """
    try:
        response = client.responses.create(
            model="gpt-4.1-mini",
            input=prompt,
        )
        raw_output = response.output[0].content[0].text.strip()
        # Greedy + DOTALL: capture from the first "{" to the last "}" so any
        # text the model adds around the JSON object is ignored.
        match = re.search(r"\{.*\}", raw_output, re.DOTALL)
        if match is None:
            raise ValueError("Resposta do modelo sem objeto JSON")
        return json.loads(match.group())
    except Exception as e:
        # Every failure (API error, missing JSON, malformed JSON) is reported
        # through the same fallback record instead of raising.
        return {
            "categoria_vibe": None,
            "tipo_vista": None,
            "ponto_forte": None,
            "principal_caracteristica": None,
            "erro": str(e),
        }

In [18]:
# --- Review sample ----------------------------------------------------------
# Half of the sample is the first rows of the reviews table; the other half is
# drawn with a fixed seed from the rows after them, so the halves never overlap.
print("üé≤ Iniciando Amostragem Inteligente de Reviews...")
df_reviews_head = df_reviews.head(NUM_SAMPLES//2).copy()
df_reviews_remaining = df_reviews.iloc[NUM_SAMPLES//2:]
# Guard against tables with fewer remaining rows than the requested half.
n_sample = min(NUM_SAMPLES//2, len(df_reviews_remaining))
df_reviews_random = df_reviews_remaining.sample(n=n_sample, random_state=42).copy()
df_reviews_selection = pd.concat([df_reviews_head, df_reviews_random])

# --- Listing sample ---------------------------------------------------------
# Start from the listings referenced by the sampled reviews, then top up with
# random listings (or truncate) to reach TARGET_LISTINGS rows.
print("\nüè† Iniciando Sele√ß√£o de Listings Relacionados...")
related_listing_ids = df_reviews_selection['SK_LISTING'].unique()
df_listings_related = df_listings[df_listings['SK_LISTING'].isin(related_listing_ids)].copy()
count_related = len(df_listings_related)
print(f"   -> Encontrados {count_related} im√≥veis citados nos reviews selecionados.")
TARGET_LISTINGS = NUM_SAMPLES

if count_related < TARGET_LISTINGS:
    needed = TARGET_LISTINGS - count_related
    print(f"   -> Necess√°rio completar com mais {needed} im√≥veis aleat√≥rios.")
    df_listings_available = df_listings[~df_listings['SK_LISTING'].isin(related_listing_ids)]
    n_fill = min(needed, len(df_listings_available))

    df_listings_fill = df_listings_available.sample(n=n_fill, random_state=42).copy()
    df_listings_selection = pd.concat([df_listings_related, df_listings_fill])

else:
    # Fixed: the message was missing the f-prefix, so the placeholder was
    # printed literally instead of the limit that is actually applied.
    print(f"   -> Quantidade de im√≥veis relacionados j√° supera {TARGET_LISTINGS}. Limitando sele√ß√£o.")
    # .copy() makes the selection an independent frame, so later column
    # assignments do not raise SettingWithCopyWarning.
    df_listings_selection = df_listings_related.head(TARGET_LISTINGS).copy()

print(f"‚úÖ Sele√ß√£o de Listings Conclu√≠da: {len(df_listings_selection)} registros.")

üé≤ Iniciando Amostragem Inteligente de Reviews...

üè† Iniciando Sele√ß√£o de Listings Relacionados...
   -> Encontrados 411 im√≥veis citados nos reviews selecionados.
   -> Necess√°rio completar com mais 589 im√≥veis aleat√≥rios.
‚úÖ Sele√ß√£o de Listings Conclu√≠da: 1000 registros.


In [19]:
# 1. Translation dictionary: source room-type label -> Portuguese label.
map_tipo_quarto = {
    'Entire home/apt': 'Casa/Apto inteiro',
    'Private room': 'Quarto privativo',
    'Shared room': 'Quarto compartilhado',
    'Hotel room': 'Quarto de hotel'
}

# 2. Apply the translation.
# The cell overwrites the column it reads, so it must be safe to re-run:
# labels that are already translated are kept as they are, and only values
# that are neither a known source label nor a translated label (including
# missing values) become 'Outros'.
print("üîÑ Traduzindo tipos de quarto...")
tipo_atual = df_listings_selection['DS_TIPO_QUARTO']
tipo_ja_traduzido = tipo_atual.where(tipo_atual.isin(list(map_tipo_quarto.values())))
tipo_traduzido = (
    tipo_atual
    .map(map_tipo_quarto)
    .fillna(tipo_ja_traduzido)
    .fillna('Outros')
)
# assign() returns a new frame instead of writing into a possible slice.
df_listings_selection = df_listings_selection.assign(DS_TIPO_QUARTO=tipo_traduzido)

# 3. Sanity check (optional): distribution of the translated labels.
print("‚úÖ Valores √∫nicos ap√≥s tradu√ß√£o:")
print(df_listings_selection['DS_TIPO_QUARTO'].value_counts())

üîÑ Traduzindo tipos de quarto...
‚úÖ Valores √∫nicos ap√≥s tradu√ß√£o:
DS_TIPO_QUARTO
Casa/Apto inteiro       832
Quarto privativo        158
Quarto compartilhado     10
Name: count, dtype: int64


In [20]:

# Reviews analysis: classify every sampled review with GPT and attach the
# resulting features to the review rows (the raw comment text is dropped).
print("ü§ñ Iniciando An√°lise de Reviews com GPT (Feature Engineering)...")
# One API call per review; results keep the row order of the selection.
results = [
    classify_review_gpt(comentario)
    for comentario in df_reviews_selection['TXT_COMENTARIO']
]

df_features = pd.DataFrame(results, index=df_reviews_selection.index)

# DataFrame.rename expects {current_name: new_name}. The mapping used to be
# written the other way round, so the rename silently did nothing and the
# exported table kept the raw JSON key names.
columns_names = {
    "sentimento": "CAT_SENTIMENTO",
    "topico_principal": "CAT_TOPICO",
    "sub_topico": "CAT_SUB_TOPICO",
    "tom_de_urgencia": "FLG_URGENCIA"
}
df_reviews_enriched = (
    df_reviews_selection
    .drop('TXT_COMENTARIO', axis=1)
    .join(df_features)
    .rename(columns=columns_names)
)

ü§ñ Iniciando An√°lise de Reviews com GPT (Feature Engineering)...


In [21]:
# Listings analysis: classify every selected listing title with GPT and attach
# the resulting features to the listing rows (the raw title is dropped).
print("ü§ñ Iniciando An√°lise de Listings com GPT (Feature Engineering)...")
# One API call per listing; results keep the row order of the selection.
results = [
    classify_listing_gpt(nome_anuncio)
    for nome_anuncio in df_listings_selection['NM_ANUNCIO']
]

df_features = pd.DataFrame(results, index=df_listings_selection.index)

# DataFrame.rename expects {current_name: new_name}. The mapping used to be
# written the other way round, so the rename silently did nothing and the
# exported table kept the raw JSON key names.
columns_names = {
    "categoria_vibe": "CAT_VIBE_IA",
    "tipo_vista": "CAT_TIPO_VISTA",
    "ponto_forte": "TXT_DESTAQUE_IA",
    "principal_caracteristica": "TXT_CARACTERISTICA_IA"
}
df_listings_enriched = (
    df_listings_selection
    .drop('NM_ANUNCIO', axis=1)
    .join(df_features)
    .rename(columns=columns_names)
)



ü§ñ Iniciando An√°lise de Listings com GPT (Feature Engineering)...


In [22]:


# Persist the enriched tables to the gold layer as CSV files, without the index.
for enriched_frame, destination in (
    (df_reviews_enriched, f"{GOLD_PATH}/FACT_REVIEWS.csv"),
    (df_listings_enriched, f"{GOLD_PATH}/DIM_LISTINGS.csv"),
):
    enriched_frame.to_csv(destination, index=False)