In [12]:
import gcsfs
import duckdb
import json
import pandas as pd  # <-- novo

# Location of the raw event dumps in Google Cloud Storage.
BUCKET = "teste-waha"
PREFIX = "raw/waha_events/"

fs = gcsfs.GCSFileSystem()

# List every object under the prefix and keep only the NDJSON files.
# Sorted explicitly so the load order (and the row order of `df`) does not
# depend on the order in which the listing happens to be returned.
arquivos = sorted(a for a in fs.find(f"{BUCKET}/{PREFIX}") if a.endswith(".ndjson"))
print(f"Total de arquivos: {len(arquivos)}")

# Read everything into memory: one JSON document per non-blank line.
rows = []
linhas_invalidas = []  # (path, line number, parser message) for each unparsable line
for path in arquivos:
    # Decode as UTF-8 explicitly instead of relying on the platform default
    # text encoding, which may not be UTF-8.
    with fs.open(path, "r", encoding="utf-8") as f:
        for numero, line in enumerate(f, start=1):
            if not line.strip():
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Still best-effort (the line is skipped), but remember it so
                # the loss is reported below instead of disappearing silently.
                linhas_invalidas.append((path, numero, str(exc)))

print(f"Total de registros lidos: {len(rows)}")
if linhas_invalidas:
    print(f"Linhas inválidas ignoradas: {len(linhas_invalidas)}")
    # Show only the first few so a badly corrupted file cannot flood the output.
    for caminho, numero, motivo in linhas_invalidas[:10]:
        print(f"  {caminho}:{numero}: {motivo}")

# Convert to a DataFrame. json_normalize flattens nested dicts into
# dot-separated column names (e.g. "payload.body") where it can.
df = pd.json_normalize(rows)




Total de arquivos: 5
Total de registros lidos: 5


In [23]:
# Preview the normalized events through DuckDB.
con = duckdb.connect()
# Expose the pandas DataFrame to SQL under the table name "mensagens".
con.register("mensagens", df)

# The flattened columns contain dots in their names, so they must be
# double-quoted to be read as single identifiers.
sql_preview = """SELECT
message_id as id,
"payload.timestamp" as timestamp,
"payload.from" as sender,
"payload.body" as body
FROM mensagens LIMIT 5"""

# Run the query and materialize the result as a pandas DataFrame.
res = con.execute(sql_preview).df()
print(res)

                                              id   timestamp  \
0  false_5511995020957@c.us_3A48BEB6F3281ADE9049  1762974898   
1  false_5511995020957@c.us_3A71FBF2C1084E80B221  1762974909   
2  false_5511995020957@c.us_3A2B074A3D4BA0D61BCA  1762975473   
3  false_5511995020957@c.us_3A3D14F903D1F19D7A34  1762975778   
4  false_5511995020957@c.us_3A3D72085CA3EF2B6946  1762975948   

               sender                                               body  
0  5511995020957@c.us                                              Teste  
1  5511995020957@c.us                                          Globo.com  
2  5511995020957@c.us                                              Teste  
3  5511995020957@c.us                                              Teste  
4  5511995020957@c.us  Olha o que eu encontrei! Apple iPhone 15 (128 ...  
