### **Documentação**: Script de Coleta de Notícias (Google News RSS)

#### **1. Objetivo**

Este script implementa parte do Pilar 2 (Qualidade da Comunicação) do projeto Aurum. Sua responsabilidade é automatizar a coleta de notícias financeiras recentes para cada empresa (ticker) listada no índice IBRX-100.

O script utiliza o *feed RSS do Google News* como fonte de dados, buscando menções a cada ticker nos últimos 30 dias. Os dados brutos coletados são a base para a futura análise de sentimento (NLP).

#### **2. Configuração (Input)**

O script depende de um único arquivo de entrada:

* **tickers_ibrx100_full.csv**: Um arquivo CSV que deve conter a lista completa de tickers do IBRX-100.
    * Formato esperado: O script lê a **primeira coluna** deste arquivo. Os tickers podem estar no formato **PETR4.SA** ou **PETR4**. A função **load_tickers_from_csv** remove automaticamente o sufixo **.SA** para otimizar a busca no Google News.

#### **3. Saída (Output)**

O script gera dois arquivos idênticos em conteúdo, localizados em **data/news/**:

1.  **raw_news_data.parquet**
2.  **raw_news_data.csv**

O schema (colunas) do DataFrame salvo é:

| Coluna | Tipo | Descrição |
| :--- | :--- | :--- |
| **ticker_query** | string | O ticker usado na busca (ex: **PETR4**). |
| **title** | string | O título da notícia. |
| **link** | string | O link original da notícia. |
| **published_date** | datetime | A data e hora da publicação (já convertida). |
| **source** | string | O nome do veículo de mídia (ex: "InfoMoney"). |
| **summary** | string | Um pequeno resumo ou *snippet* da notícia (HTML). |


In [7]:
import os
import time
import urllib.parse

import feedparser
import tqdm
import pandas as pd

# Input/output locations, expressed relative to the current working directory.
DATA_DIR = os.path.join("..", "data")
NEWS_DIR = os.path.join(DATA_DIR, "news")
# Create the news directory (and missing parents) up front; a no-op if it already exists.
os.makedirs(NEWS_DIR, exist_ok=True)

def load_tickers_from_csv(file_path: str) -> list:
    """Load the ticker list from the first column of a CSV file.

    The first row of the file is treated as a header by ``pd.read_csv``.
    Missing cells are dropped, surrounding whitespace is stripped, and a
    trailing ``.SA`` suffix is removed (``PETR4.SA`` -> ``PETR4``).

    Args:
        file_path: Path to the CSV file whose first column holds the tickers.

    Returns:
        A list of ticker strings without the ``.SA`` suffix, in file order.
    """
    suffix = '.SA'
    df = pd.read_csv(file_path)

    tickers = []
    for raw in df.iloc[:, 0].dropna().astype(str):
        ticker = raw.strip()
        # Only strip the suffix at the end; a plain str.replace would also
        # delete a '.SA' sequence found in the middle of a value.
        if ticker.endswith(suffix):
            ticker = ticker[:-len(suffix)]
        if ticker:
            tickers.append(ticker)
    return tickers

def fetch_news_for_ticker(ticker: str):
    """Fetch news for one ticker from the Google News RSS search feed.

    The search query is the quoted ticker followed by ``when:30d``, requested
    with the pt-BR / BR locale parameters.

    Args:
        ticker: Ticker symbol used as the search term (e.g. ``PETR4``).

    Returns:
        A list of dicts with the keys ``ticker_query``, ``title``, ``link``,
        ``published_date``, ``source`` and ``summary``. Fields missing from a
        feed entry fall back to ``''`` (``None`` for ``published_date``,
        ``'Unknown'`` for ``source``) instead of raising, so one incomplete
        entry does not discard every other item for the ticker.
    """
    raw_query = f'"{ticker}" when:30d'
    search_query = urllib.parse.quote(raw_query)
    url = f"https://news.google.com/rss/search?q={search_query}&hl=pt-BR&gl=BR&ceid=BR:pt-419"

    feed = feedparser.parse(url)

    news_items = []
    for entry in feed.entries:
        # `source` is itself an object with a `title`; either level may be absent.
        source = getattr(entry, 'source', None)
        news_items.append({
            'ticker_query': ticker,
            'title': getattr(entry, 'title', ''),
            'link': getattr(entry, 'link', ''),
            'published_date': getattr(entry, 'published', None),
            'source': getattr(source, 'title', 'Unknown'),
            'summary': getattr(entry, 'summary', ''),
        })
    return news_items


# Cell entry point: collect Google News RSS items for every ticker in the CSV,
# de-duplicate them and persist the result as both Parquet and CSV.
if __name__ == "__main__":
    tickers_csv_path = os.path.join(DATA_DIR, "tickers_ibrx100_full.csv")
    tickers = load_tickers_from_csv(tickers_csv_path)
    
    # Accumulates the item dicts returned for every ticker.
    all_news = []
    
    print("Iniciando a coleta de not√≠cias via Google News RSS...")
    for ticker in tqdm.tqdm(tickers, desc="Buscando not√≠cias"):
        try:
            news = fetch_news_for_ticker(ticker)
            if news:
                all_news.extend(news)
            # Pause between requests. NOTE(review): this line is skipped when
            # the fetch raises, so consecutive failures are retried back-to-back.
            time.sleep(0.5)
        except Exception as e:
            # Best-effort: a failure for one ticker must not abort the whole run.
            print(f"Erro ao buscar not√≠cias para {ticker}: {e}")

    if not all_news:
        print("\nNenhuma not√≠cia foi coletada. Verifique a conex√£o ou a consulta de busca. Encerrando.")
    else:
        df_news = pd.DataFrame(all_news)
        # The same article can be returned for more than one ticker query;
        # only the first occurrence of each (title, link) pair is kept.
        df_news.drop_duplicates(subset=['title', 'link'], inplace=True)
        # Unparseable dates become NaT instead of raising.
        df_news['published_date'] = pd.to_datetime(df_news['published_date'], errors='coerce')
        
        output_path_parquet = os.path.join(NEWS_DIR, "raw_news_data.parquet")
        output_path_csv = os.path.join(NEWS_DIR, "raw_news_data.csv")

        df_news.to_parquet(output_path_parquet, index=False)
        
        df_news.to_csv(output_path_csv, index=False)
        
        print(f"\nColeta conclu√≠da. {len(df_news)} not√≠cias √∫nicas salvas.")
        print(f"-> {output_path_parquet}")
        print(f"-> {output_path_csv}")
        print("\nAmostra das not√≠cias coletadas:")
        # NOTE(review): `display` is not imported in this cell; presumably it is
        # the notebook-provided builtin -- confirm before running as a plain script.
        display(df_news.head())

Iniciando a coleta de not√≠cias via Google News RSS...


Buscando not√≠cias: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [01:59<00:00,  1.24s/it]



Coleta conclu√≠da. 2231 not√≠cias √∫nicas salvas.
-> ..\data\news\raw_news_data.parquet
-> ..\data\news\raw_news_data.csv

Amostra das not√≠cias coletadas:


Unnamed: 0,ticker_query,title,link,published_date,source,summary
0,ALOS3,Allos (ALOS3) e Azzas (AZZA3) t√™m queda livre ...,https://news.google.com/rss/articles/CBMihgJBV...,2025-12-05 08:00:00,Valor Investe,"<a href=""https://news.google.com/rss/articles/..."
1,ALOS3,Dividendos robustos e corte de custos: o futur...,https://news.google.com/rss/articles/CBMixwFBV...,2025-11-26 08:00:00,Seu Dinheiro,"<a href=""https://news.google.com/rss/articles/..."
2,ALOS3,"Allos (ALOS3) quer se tornar uma ""m√°quina"" de ...",https://news.google.com/rss/articles/CBMitwFBV...,2025-11-26 08:00:00,Investidor10,"<a href=""https://news.google.com/rss/articles/..."
3,ALOS3,Allos (ALOS3) paga R$ 438 milh√µes em dividendo...,https://news.google.com/rss/articles/CBMipAFBV...,2025-12-17 12:49:32,Estad√£o E-Investidor,"<a href=""https://news.google.com/rss/articles/..."
4,ALOS3,Allos (ALOS3) pagar√° R$ 438 milh√µes em dividen...,https://news.google.com/rss/articles/CBMilwFBV...,2025-12-16 22:38:00,Money Times,"<a href=""https://news.google.com/rss/articles/..."


In [10]:
import pandas as pd
import os

# Path of the Parquet file produced by the Google News collection script.
GOOGLE_NEWS_FILE = "../data/news/raw_news_data.parquet"

def analisar_google():
    """Print a quick diagnostic of the collected Google News Parquet file.

    Reports the row count and column names and, when the file has rows, a
    five-row sample (date, ticker, title) plus the minimum and maximum
    publication dates. Returns early with a hint if the file does not exist.
    """
    print("--- üîç DIAGN√ìSTICO GOOGLE NEWS ---")

    arquivo_presente = os.path.exists(GOOGLE_NEWS_FILE)
    if not arquivo_presente:
        print(f"‚ùå Arquivo n√£o encontrado: {GOOGLE_NEWS_FILE}")
        print("Rode seu script de coleta do Google News primeiro!")
        return

    noticias = pd.read_parquet(GOOGLE_NEWS_FILE)
    total = len(noticias)
    colunas = list(noticias.columns)
    print(f"üìÑ Total de Not√≠cias: {total}")
    print(f"üìè Colunas: {colunas}")

    # Nothing else to show for an empty file.
    if noticias.empty:
        return

    print("\nüìã Amostra:")
    amostra = noticias[['published_date', 'ticker_query', 'title']].head()
    print(amostra.to_string())

    # Date range covered by the collected items.
    datas = noticias['published_date']
    print(f"\nüìÖ Data M√≠nima: {datas.min()}")
    print(f"üìÖ Data M√°xima: {datas.max()}")

# Run the diagnostic only when executed as the entry point, not on import.
if __name__ == "__main__":
    analisar_google()

--- üîç DIAGN√ìSTICO GOOGLE NEWS ---
üìÑ Total de Not√≠cias: 2231
üìè Colunas: ['ticker_query', 'title', 'link', 'published_date', 'source', 'summary']

üìã Amostra:
       published_date ticker_query                                                                                                                 title
0 2025-12-05 08:00:00        ALOS3            Allos (ALOS3) e Azzas (AZZA3) t√™m queda livre ap√≥s venda de a√ß√µes pelo fundo canadense CPP - Valor Investe
1 2025-11-26 08:00:00        ALOS3               Dividendos robustos e corte de custos: o futuro da Allos (ALOS3) na vis√£o do BTG Pactual - Seu Dinheiro
2 2025-11-26 08:00:00        ALOS3                           Allos (ALOS3) quer se tornar uma "m√°quina" de dividendos em 2026; saiba como - Investidor10
3 2025-12-17 12:49:32        ALOS3  Allos (ALOS3) paga R$ 438 milh√µes em dividendos intermedi√°rios; veja datas e valores por a√ß√£o - Estad√£o E-Investidor
4 2025-12-16 22:38:00        ALOS3                    

In [None]:
import requests
import pandas as pd
import time
import logging
import sys
from datetime import datetime, timedelta
from pathlib import Path

import os


def parse_api_tokens(raw):
    """Split a comma-separated token string into a list of non-empty tokens.

    Args:
        raw: Comma-separated tokens, or ``None``/empty when nothing is configured.

    Returns:
        The tokens with surrounding whitespace removed; empty items are dropped.
    """
    return [token.strip() for token in (raw or "").split(",") if token.strip()]


# SECURITY: credentials should not live in source control. When the
# MARKETAUX_API_TOKENS environment variable is set (comma-separated), it takes
# precedence over the literals below.
# NOTE(review): the literals are kept only as a backward-compatible fallback.
# They are committed in plain text, so treat them as exposed: rotate them and
# delete this fallback.
_FALLBACK_API_TOKENS = [
    "WFDoU2pOjYySzlaLErY6GJl1ltVCCfUCeTUY4LHL",
    "0P7N45dKN8qZ6YiseMjHOTQBwrgBGI90JRdbYGjk"
]

API_TOKENS = parse_api_tokens(os.environ.get("MARKETAUX_API_TOKENS")) or _FALLBACK_API_TOKENS

# Directory that receives one Parquet file per ticker and month; created
# (with parents) as soon as this cell runs.
OUTPUT_DIR = "../data/news/marketaux"
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Symbols requested on every monthly pass of main().
TICKERS = ['PETR4.SA', 'VALE3.SA', 'ITUB4.SA', 'WEGE3.SA', 'BBAS3.SA'] 

# Emit INFO-and-above records to stdout, prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)] 
)

class TokenManager:
    """Rotate through a list of API tokens, skipping the ones marked exhausted.

    State is kept in three public attributes: ``tokens`` (the list given to
    the constructor), ``current_index`` (position of the token currently in
    use) and ``exhausted_tokens`` (set of indices that must not be used again).
    """

    def __init__(self, tokens):
        self.tokens = tokens
        self.current_index = 0
        self.exhausted_tokens = set()

    def get_token(self):
        """Return the current usable token, or ``None`` when none is left.

        Starting at ``current_index``, the first index that is not marked
        exhausted becomes the new ``current_index``. An empty token list
        yields ``None`` rather than raising ``IndexError``.
        """
        total = len(self.tokens)
        for offset in range(total):
            candidate = (self.current_index + offset) % total
            if candidate not in self.exhausted_tokens:
                self.current_index = candidate
                return self.tokens[candidate]
        # Every index is exhausted (or there are no tokens at all).
        return None

    def mark_exhausted(self):
        """Mark the current token as unusable and advance to the next index."""
        if not self.tokens:
            # Nothing to mark; also avoids a modulo-by-zero below.
            return
        # Only the last five characters are logged so the full key never hits the log.
        logging.warning(f"üö´ Token final ...{self.tokens[self.current_index][-5:]} esgotado/inv√°lido.")
        self.exhausted_tokens.add(self.current_index)
        self.current_index = (self.current_index + 1) % len(self.tokens)

def buscar_noticias_api(symbol, data_inicio, data_fim, token_manager):
    """Download every result page for one symbol and date window.

    Pages are requested one at a time, starting at page 1, until a page comes
    back without a non-empty ``data`` list. HTTP 401/402/429 responses mark
    the current token as exhausted and the same page is retried with the next
    token. Any other non-200 response, or any exception raised while
    requesting/parsing, is logged and stops the pagination.

    Args:
        symbol: Value sent as the ``symbols`` query parameter.
        data_inicio: Value sent as ``published_after``.
        data_fim: Value sent as ``published_before``.
        token_manager: Object exposing ``get_token()`` (returns a token or a
            falsy value when none is left) and ``mark_exhausted()``.

    Returns:
        A tuple ``(articles, keep_going)``. ``articles`` is the list of
        article dicts gathered so far (possibly partial). ``keep_going`` is
        ``False`` only when no token is available any more, ``True`` otherwise.
    """
    url = "https://api.marketaux.com/v1/news/all"
    # Without a timeout a stalled connection would block the run indefinitely.
    request_timeout = 30  # seconds
    all_articles = []
    page = 1

    while True:
        token_atual = token_manager.get_token()

        if not token_atual:
            logging.error("üíÄ TODOS os tokens foram esgotados por hoje!")
            return all_articles, False  # False = stop everything

        params = {
            'symbols': symbol,
            'published_after': data_inicio,
            'published_before': data_fim,
            'language': 'pt',
            'api_token': token_atual,
            'page': page
        }

        try:
            response = requests.get(url, params=params, timeout=request_timeout)

            if response.status_code in [402, 429, 401]:
                logging.warning(f"‚ö†Ô∏è Erro {response.status_code}. Trocando chave...")
                token_manager.mark_exhausted()
                # Retry the same page with whichever token comes next.
                continue

            if response.status_code != 200:
                logging.error(f"Erro desconhecido {response.status_code}: {response.text}")
                break

            data = response.json()

            # A missing or empty 'data' list ends the pagination.
            if 'data' not in data or not data['data']:
                break

            for item in data['data']:
                # The sentiment is taken from the first entity only; 0 when absent.
                sentiment = 0
                if item.get('entities') and len(item['entities']) > 0:
                    sentiment = item['entities'][0].get('sentiment_score', 0)

                news_item = {
                    'ticker': symbol,
                    'title': item.get('title', ''),
                    'description': item.get('description', ''),
                    'source': item.get('source', ''),
                    'published_at': item.get('published_at', ''),
                    'url': item.get('url', ''),
                    'sentiment_score': sentiment
                }
                all_articles.append(news_item)

            logging.info(f"   -> P√°g {page} OK ({len(data['data'])} news)")
            page += 1
            time.sleep(1.2)

        except Exception as e:
            # Covers connection errors, timeouts and malformed JSON alike.
            logging.error(f"Erro de conex√£o: {e}")
            time.sleep(5)
            break

    return all_articles, True

def main():
    """Collect Marketaux news month by month for every ticker in TICKERS.

    Walks calendar months from 2022-01 up to (not including) 2025-12-31. For
    each ticker and month, a file that already exists is skipped; otherwise
    the articles are downloaded and written to
    ``OUTPUT_DIR/<ticker>_<YYYY_MM>.parquet``.

    The run stops as soon as the token pool is exhausted. Articles gathered
    for the ticker/month in progress at that moment are NOT written: the
    download was cut short, and a partial file would make later runs skip
    that month as if it were complete.
    """
    print("--- üöÄ INICIANDO COLETA MARKETAUX MULTI-KEY ---")

    manager = TokenManager(API_TOKENS)

    start_date = datetime(2022, 1, 1)
    end_date = datetime(2025, 12, 31)
    current_date = start_date

    while current_date < end_date:
        # current_date is always day 1, so +32 days lands in the next month;
        # snapping to day 1 gives the start of that month.
        next_month = current_date + timedelta(days=32)
        next_month = next_month.replace(day=1)

        str_start = current_date.strftime("%Y-%m-%dT00:00")
        # One second before next_month is the last day of the current month.
        str_end = (next_month - timedelta(seconds=1)).strftime("%Y-%m-%dT23:59")

        print(f"\nüìÖ PROCESSANDO M√äS: {current_date.strftime('%m/%Y')}")

        for ticker in TICKERS:
            file_name = f"{ticker}_{current_date.strftime('%Y_%m')}.parquet"
            file_path = Path(OUTPUT_DIR) / file_name

            if file_path.exists():
                print(f"   ‚è≠Ô∏è {ticker} j√° existe. Pulando.")
                continue

            print(f"   üîé Buscando {ticker}...")
            noticias, sistema_operante = buscar_noticias_api(ticker, str_start, str_end, manager)

            if not sistema_operante:
                # Tokens ran out mid-download: do not persist a possibly
                # incomplete month, so the next run fetches it again.
                print("\nüèÅ FIM DA LINHA: Todas as chaves foram gastas hoje.")
                return

            if noticias:
                df = pd.DataFrame(noticias)
                df.to_parquet(file_path, index=False)
                print(f"   üíæ Salvo: {file_name} ({len(df)} not√≠cias)")
            else:
                print(f"   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para {ticker} neste m√™s.")

        current_date = next_month

# Start the Marketaux collection only when executed as the entry point.
if __name__ == "__main__":
    main()

--- üöÄ INICIANDO COLETA MARKETAUX MULTI-KEY ---

üìÖ PROCESSANDO M√äS: 01/2022
   üîé Buscando PETR4.SA...
   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para PETR4.SA neste m√™s.
   ‚è≠Ô∏è VALE3.SA j√° existe. Pulando.
   üîé Buscando ITUB4.SA...
   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para ITUB4.SA neste m√™s.
   üîé Buscando WEGE3.SA...
   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para WEGE3.SA neste m√™s.
   üîé Buscando BBAS3.SA...
   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para BBAS3.SA neste m√™s.

üìÖ PROCESSANDO M√äS: 02/2022
   ‚è≠Ô∏è PETR4.SA j√° existe. Pulando.
   ‚è≠Ô∏è VALE3.SA j√° existe. Pulando.
   ‚è≠Ô∏è ITUB4.SA j√° existe. Pulando.
   ‚è≠Ô∏è WEGE3.SA j√° existe. Pulando.
   ‚è≠Ô∏è BBAS3.SA j√° existe. Pulando.

üìÖ PROCESSANDO M√äS: 03/2022
   ‚è≠Ô∏è PETR4.SA j√° existe. Pulando.
   ‚è≠Ô∏è VALE3.SA j√° existe. Pulando.
   üîé Buscando ITUB4.SA...
   ‚ö†Ô∏è Nenhuma not√≠cia encontrada para ITUB4.SA neste m√™s.
   ‚è≠Ô∏è WEGE3.SA j√° existe. Pulando.
   üîé Buscando BBAS3.SA...


In [None]:
import re
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import logging
import json
import os
from pathlib import Path
from tqdm import tqdm

# Destination directory for the per-ticker Parquet files written by main_json_mode().
OUTPUT_DIR = "../data/news/investing"
# CSV read by carregar_tickers() to obtain the ticker list.
TICKERS_CSV = "../data/tickers_ibrx100_full.csv" 

# Ticker -> URL slug used to build https://br.investing.com/equities/<slug> pages.
# Tickers missing from this table get a guessed "<ticker>-pn" / "<ticker>-on" slug
# in main_json_mode(); the keys also serve as the fallback ticker list in
# carregar_tickers().
KNOWN_SLUGS = {
    'PETR4': 'petrobras-pn', 'PETR3': 'petrobras-on',
    'VALE3': 'vale-on', 'ITUB4': 'itau-unibanco-pn-ed-j1',
    'BBDC4': 'banco-bradesco-pn-ej-n1', 'BBAS3': 'banco-brasil-on',
    'WEGE3': 'weg-on-ej-nm', 'MGLU3': 'magaz-luiza-on-nm',
    'ABEV3': 'ambev-on', 'JBSS3': 'jbs-on-nm',
    'B3SA3': 'bmfbovespa-on-nm', 'SUZB3': 'suzano-papel-celulose-sk-on',
    'RENT3': 'localiza-rent-a-car', 'RDOR3': 'rede-dor-sao-luiz-sa',
    'HAPV3': 'hapvida-on-nm', 'BPAC11': 'banco-btg-pactual-sa-unit',
    'EQTL3': 'equatorial-on-nm', 'LREN3': 'lojas-renner-on-nm',
    'PRIO3': 'petrorio-on-nm', 'RAIL3': 'rumo-on-nm',
    'GGBR4': 'gerdau-pn-n1', 'CSNA3': 'sid-nacional-on',
    'VBBR3': 'vibra-energia-on-nm', 'COGN3': 'kroton-on-nm',
    'USIM5': 'usiminas-pna', 'ELET3': 'eletrobras-on',
    'SBSP3': 'sabesp-on-ej-nm', 'CMIG4': 'cemig-pn-n1',
    'TIMS3': 'tim-part-on-ej-nm', 'VIVT3': 'telefonica-brasil-on',
    'CCRO3': 'ccr-sa-on-nm', 'UGPA3': 'ultrapar-part-on-nm',
    'EMBR3': 'embraer-on-nm', 'BRFS3': 'brasil-foods-sa-on-nm',
    'CSAN3': 'cosan-on-nm', 'GOAU4': 'gerdau-met-pn-n1',
    'MULT3': 'multiplan-on-n2', 'ASAI3': 'assai-atacadista-on',
    'ENEV3': 'eneva-on-nm', 'HYPE3': 'hypermarcas-on-nm',
    'CPLE6': 'copel-pnb', 'CPFE3': 'cpfl-energia-on-nm',
    'AZZA3': 'arezzo-co-on-ej-nm', 'RADL3': 'raiadrogasil-on-nm',
    'CYRE3': 'cyrela-realt-on-nm', 'YDUQ3': 'estacio-part-on-nm',
    'TOTS3': 'totvs-on-ej-nm', 'MRFG3': 'marfrig-on-nm',
    'CVCB3': 'cvc-brasil-on', 'GOLL4': 'gol-pn-es-n2',
    'AZUL4': 'azul-sa-pref'
}

# Make sure the output directory exists before any file is written.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
# INFO-and-above records, each prefixed with a timestamp.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

def _iter_valores_da_chave(obj, key_target):
    """Yield every value stored under ``key_target`` in nested dicts/lists.

    A value found under the key is yielded as-is and is not searched further.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key_target:
                yield v
            else:
                yield from _iter_valores_da_chave(v, key_target)
    elif isinstance(obj, list):
        for item in obj:
            yield from _iter_valores_da_chave(item, key_target)


def extrair_noticias_via_json(html_content):
    """Extract news items from the JSON embedded in a page's HTML.

    Two strategies are tried in order:

    1. Parse the ``<script id="__NEXT_DATA__">`` payload as JSON and collect
       every dict that has a ``title`` from any list stored under a ``news``
       key. If this yields at least one item, it is returned immediately.
    2. Otherwise scan the raw HTML with a regex for
       ``{"title": ..., "url": ..., "date": ...}`` fragments, keeping matches
       whose title is longer than 10 characters.

    Args:
        html_content: The page HTML as a string.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``raw_date`` and
        ``source`` (``"Investing-JSON"`` or ``"Regex-Brute"``); empty when
        nothing was found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    noticias_extraidas = []

    script_next = soup.find('script', id='__NEXT_DATA__')

    if script_next:
        try:
            data = json.loads(script_next.string)

            for news_list in _iter_valores_da_chave(data, 'news'):
                if not isinstance(news_list, list):
                    continue
                for item in news_list:
                    if isinstance(item, dict) and 'title' in item:
                        noticias_extraidas.append({
                            'title': item.get('title'),
                            'link': item.get('href') or item.get('url'),
                            'raw_date': item.get('date') or item.get('published_at') or "Hoje",
                            'source': "Investing-JSON"
                        })

            if noticias_extraidas:
                return noticias_extraidas

        except Exception as e:
            # Best-effort: fall through to the regex scan, but leave a trace
            # (visible at DEBUG level) instead of discarding the error.
            logging.debug(f"Erro parse JSON Next.js: {e}")

    pattern = r'\{"title":"(.*?)","url":"(.*?)".*?"date":"(.*?)"\}'
    for m in re.findall(pattern, html_content):
        # Very short "titles" are skipped as noise.
        if len(m[0]) > 10:
            noticias_extraidas.append({
                'title': m[0],
                # Undo the JSON-escaped forward slashes in the captured URL.
                'link': m[1].replace('\\u002F', '/'),
                'raw_date': m[2],
                'source': "Regex-Brute"
            })

    return noticias_extraidas

def carregar_tickers(caminho_csv):
    """Load the unique tickers from the first column of a CSV file.

    The file is first read with ``;`` as separator; if that produces fewer
    than two columns, it is read again with ``,``. Each value is cut at the
    first ``.`` (``PETR4.SA`` -> ``PETR4``) and stripped of whitespace.

    Args:
        caminho_csv: Path to the CSV file.

    Returns:
        A list of unique ticker strings in order of first appearance. If the
        file cannot be read or parsed, the failure is logged and the keys of
        ``KNOWN_SLUGS`` are returned instead.
    """
    try:
        df = pd.read_csv(caminho_csv, sep=';')
        if len(df.columns) < 2:
            df = pd.read_csv(caminho_csv, sep=',')
        col_ticker = df.columns[0]
        limpos = df[col_ticker].dropna().astype(str).apply(lambda x: x.split('.')[0].strip())
        return limpos.unique().tolist()
    except Exception as e:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the fallback is logged rather than silent.
        logging.warning(f"Falha ao ler {caminho_csv} ({e}); usando os tickers de KNOWN_SLUGS.")
        return list(KNOWN_SLUGS.keys())

def main_json_mode():
    """Scrape per-ticker news from br.investing.com into Parquet files.

    For every ticker returned by ``carregar_tickers(TICKERS_CSV)`` that does
    not yet have ``OUTPUT_DIR/news_investing_<ticker>.parquet``:

    * resolve the page slug from ``KNOWN_SLUGS``, or guess ``<ticker>-pn``
      when the ticker contains a ``4`` and ``<ticker>-on`` otherwise;
    * request ``/equities/<slug>-news`` and, if that is not a 200, the plain
      ``/equities/<slug>`` page;
    * extract the news with ``extrair_noticias_via_json`` and save them
      (de-duplicated by title). When nothing is extracted, an empty file is
      written so the ticker is not requested again.

    If both requests fail with a non-200 status, a warning is logged and no
    file is written, so the ticker is retried on the next run. Any exception
    for a ticker is logged and the loop moves on.
    """
    print("--- üïµÔ∏è INICIANDO COLETA VIA JSON HIDDEN (MODE AVAN√áADO) ---")
    scraper = cloudscraper.create_scraper()

    tickers = carregar_tickers(TICKERS_CSV)

    for ticker in tqdm(tickers, desc="Processando Tickers"):
        file_path = Path(OUTPUT_DIR) / f"news_investing_{ticker}.parquet"

        # Already collected on a previous run.
        if file_path.exists():
            continue

        slug = KNOWN_SLUGS.get(ticker)

        if not slug:
            slug = f"{ticker.lower()}-pn" if '4' in ticker else f"{ticker.lower()}-on"

        url = f"https://br.investing.com/equities/{slug}-news"

        try:
            # Random pause between tickers.
            time.sleep(random.uniform(2, 4))

            resp = scraper.get(url)

            if resp.status_code != 200:
                # Fall back to the main equity page.
                resp = scraper.get(f"https://br.investing.com/equities/{slug}")

            if resp.status_code != 200:
                # Previously dropped silently; report it so bad slugs are visible.
                logging.warning(f"HTTP {resp.status_code} para {ticker} (slug '{slug}'); nada salvo.")
                continue

            news_data = extrair_noticias_via_json(resp.text)

            if news_data:
                df = pd.DataFrame(news_data)
                df['ticker'] = ticker
                df['slug'] = slug

                df = df.drop_duplicates(subset=['title'])

                df.to_parquet(file_path, index=False)
            else:
                # Empty marker file: the page loaded but held no news.
                pd.DataFrame(columns=['title', 'raw_date']).to_parquet(file_path)

        except Exception as e:
            logging.error(f"Erro {ticker}: {e}")

# Start the Investing.com scrape only when executed as the entry point.
if __name__ == "__main__":
    main_json_mode()

--- üïµÔ∏è INICIANDO COLETA VIA JSON HIDDEN (MODE AVAN√áADO) ---


Processando Tickers: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 97/97 [06:15<00:00,  3.87s/it]
