In [16]:
# sentiment_analysis_pipeline_final.py
"""
Pipeline robusto de análise de sentimento (batch), com fallback lexicon.
Salva parquet + csv e gera agregação por ticker/mês.
"""

import os
import math
from typing import List, Tuple, Optional
import pandas as pd
import numpy as np
from tqdm import tqdm

# Transformers (carregado de forma condicional)
try:
    from transformers import pipeline, AutoModelForSequenceClassification
except Exception:
    pipeline = None
    AutoModelForSequenceClassification = None



In [17]:
# --- Configurações de arquivo ---
NEWS_DIR = os.path.join("..", "data", "news")
INPUT_FILE = os.path.join(NEWS_DIR, "raw_news_data.parquet")
OUTPUT_FILE_PARQUET = os.path.join(NEWS_DIR, "news_with_sentiment.parquet")
OUTPUT_FILE_CSV = os.path.join(NEWS_DIR, "news_with_sentiment.csv")



In [18]:
# --- Modelos candidatos (ordem de preferência) ---
MODEL_CANDIDATES = [
    "cardiffnlp/twitter-xlm-roberta-base-sentiment",
    "lipaoMai/BERT-sentiment-analysis-portuguese-with-undersampling-v2",
    "pysentimiento/bertweet-pt-sentiment",
    "nlptown/bert-base-multilingual-uncased-sentiment",
    "distilbert-base-uncased-finetuned-sst-2-english"
]



In [19]:
# --- Parâmetros de inferência ---
BATCH_SIZE = 64       # ajuste conforme memória (CPU/GPU)
MAX_CHARS = 2000      # truncar textos muito longos por caracteres (você pode melhorar com tokenizer)
TRUNCATE_TOKENS = 512 # caso use tokenizer: limite de tokens (não aplicado aqui diretamente)

POS_WORDS = {"bom","ótimo","excelente","positivo","cresceu","alta","superior","melhor","lucro","recupera"}
NEG_WORDS = {"ruim","queda","prejuízo","redução","negativo","menor","rebaixamento","perda","crise","dívida"}



In [20]:
# ---------------- utilitários ----------------
def torch_available() -> bool:
    try:
        import torch
        return True
    except Exception:
        return False



In [21]:
def pick_device() -> int:
    """Retorna 0 para GPU (se existir) ou -1 para CPU."""
    if torch_available():
        import torch
        return 0 if torch.cuda.is_available() else -1
    return -1

In [22]:
def tiny_lexicon_sentiment(text: str) -> Tuple[Optional[str], float]:
    """Fallback muito simples baseado em presença de palavras (não substitui modelo ML)."""
    t = str(text).lower()
    pos = sum(1 for w in POS_WORDS if w in t)
    neg = sum(1 for w in NEG_WORDS if w in t)
    if pos > neg:
        score = pos/(pos+neg+1e-9)
        return "positive", float(score)
    if neg > pos:
        score = neg/(pos+neg+1e-9)
        return "negative", float(score)
    return "neutral", 0.0



In [23]:
def try_load_pipeline(models: List[str], task: str = "sentiment-analysis", device: int = -1, use_auth_token: Optional[str] = None):
    """
    Tenta carregar modelos em ordem. Retorna (pipeline, model_name, label_map).
    Só passa use_auth_token para pipeline se for uma string não-vazia.
    """
    if pipeline is None:
        print("transformers.pipeline não está disponível (biblioteca não instalada).")
        return None, None, None

    last_exc = None
    for m in models:
        try:
            print(f"Tentando carregar modelo '{m}' (device={device}) ...")
            # construir kwargs dinamicamente: só incluir token se houver
            kwargs = {}
            if use_auth_token:
                kwargs['use_auth_token'] = use_auth_token
            # device já definido
            kwargs['device'] = device

            pipe = pipeline(task, model=m, **kwargs)

            # label_map: tenta inferir id2label se possível (mas só quando torch disponível)
            label_map = {}
            try:
                if AutoModelForSequenceClassification is not None and torch_available():
                    model_inst = AutoModelForSequenceClassification.from_pretrained(m, use_auth_token=use_auth_token if use_auth_token else None)
                    id2label = getattr(model_inst.config, "id2label", None)
                    if id2label:
                        for k, v in id2label.items():
                            lv = str(v).lower()
                            if "neg" in lv: label_map[k] = "negative"
                            elif "pos" in lv: label_map[k] = "positive"
                            elif "neu" in lv or "neutral" in lv: label_map[k] = "neutral"
                # heurísticas para alguns modelos conhecidos
                if not label_map:
                    if "cardiffnlp/twitter-xlm-roberta" in m:
                        label_map = {"LABEL_0":"negative","LABEL_1":"neutral","LABEL_2":"positive"}
                    elif "pysentimiento/bertweet" in m or "pysentimiento" in m:
                        label_map = {"POS":"positive","NEG":"negative","NEU":"neutral"}
                    elif "nlptown" in m:
                        label_map = {"1 star":"negative","2 star":"negative","3 star":"neutral","4 star":"positive","5 star":"positive"}
            except Exception:
                label_map = {}

            print(f"Modelo '{m}' carregado com sucesso.")
            return pipe, m, label_map

        except Exception as e:
            last_exc = e
            print(f"Falha ao carregar '{m}': {repr(e)}")
            continue

    print("Nenhum modelo pôde ser carregado. Último erro:", repr(last_exc))
    return None, None, None



In [24]:
def normalize_label(label: Optional[str], label_map: dict) -> Optional[str]:
    """Normaliza label usando label_map ou heurísticas simples."""
    if label is None:
        return None
    # se map existir e label in it
    if label_map and label in label_map:
        return label_map[label]
    l = str(label).lower()
    if "pos" in l or "positive" in l or "5" in l or "4" in l:
        return "positive"
    if "neg" in l or "negative" in l or "1" in l or "2" in l:
        return "negative"
    if "neu" in l or "neutral" in l:
        return "neutral"
    # fallback: retorna label original
    return label



In [25]:
def batch_sentiment_inference(pipe, texts: List[str], batch_size:int = 32, label_map: dict = None):
    """Roda pipeline em batches e retorna lista de tuples (norm_label, score)."""
    results = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        batch_clean = ["" if (x is None or (isinstance(x, float) and math.isnan(x))) else str(x) for x in batch]
        # truncar por caracteres (simples) para reduzir OOM com inputs gigantes
        batch_clean = [t if len(t) <= MAX_CHARS else t[:MAX_CHARS] for t in batch_clean]
        try:
            outs = pipe(batch_clean)
        except Exception as e:
            # se falhar por causa do batch_size, reduz e tenta recursivamente
            if batch_size > 1:
                return batch_sentiment_inference(pipe, texts, batch_size=max(1, batch_size//2), label_map=label_map)
            else:
                raise
        for o in outs:
            label = o.get("label") if isinstance(o, dict) else None
            score = float(o.get("score", 0.0)) if isinstance(o, dict) else None
            norm = normalize_label(label, label_map)
            results.append((norm, score))
    return results



In [26]:
def label_to_numeric(label: Optional[str]) -> int:
    if label is None:
        return 0
    l = str(label).lower()
    if "pos" in l or "positive" in l:
        return 1
    if "neg" in l or "negative" in l:
        return -1
    return 0

In [27]:
# ---------------- Main pipeline ----------------
def main():
    print("Iniciando pipeline de sentimento.")
    if not os.path.exists(INPUT_FILE):
        print(f"Arquivo de notícias não encontrado: {INPUT_FILE}")
        return

    df = pd.read_parquet(INPUT_FILE)
    print("Loaded input:", df.shape)

    device = pick_device()
    print("Device selecionado:", device, "(0 => gpu, -1 => cpu)")

    hf_token = os.environ.get("HF_TOKEN", None)
    pipe, model_name, label_map = try_load_pipeline(MODEL_CANDIDATES, device=device, use_auth_token=hf_token)

    if pipe is None:
        print("Nenhum modelo ML disponível. Usando fallback lexicon (muito simples).")
        # fallback lexicon aplicado a todos os títulos
        text_col = "title" if "title" in df.columns else df.columns[0]
        labels_scores = [tiny_lexicon_sentiment(x) for x in df[text_col].astype(str).tolist()]
    else:
        print("Executando inferência em batch com modelo:", model_name)
        text_col = "title" if "title" in df.columns else ( "summary" if "summary" in df.columns else df.columns[0] )
        texts = df[text_col].astype(str).fillna("").tolist()
        labels_scores = batch_sentiment_inference(pipe, texts, batch_size=BATCH_SIZE, label_map=label_map)

    labels, scores = zip(*labels_scores)
    df["sentiment_label"] = list(labels)
    df["sentiment_score"] = list(scores)
    df["numeric_sentiment"] = df["sentiment_label"].apply(label_to_numeric)
    df["sentiment_weighted"] = df["numeric_sentiment"] * df["sentiment_score"].fillna(0.0)

    # salvar parquet + csv
    os.makedirs(NEWS_DIR, exist_ok=True)
    df.to_parquet(OUTPUT_FILE_PARQUET, index=False)
    df_to_csv = df.copy()
    if "published_date" in df_to_csv.columns:
        df_to_csv['published_date'] = pd.to_datetime(df_to_csv['published_date'], errors='coerce').dt.strftime('%Y-%m-%d %H:%M:%S%z')
    df_to_csv.to_csv(OUTPUT_FILE_CSV, index=False)
    print("Salvo:", OUTPUT_FILE_PARQUET, OUTPUT_FILE_CSV)

    # agregação por ticker/mês (se dados disponíveis)
    if "published_date" in df.columns and "ticker_query" in df.columns:
        df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce', utc=True)
        df['year_month'] = df['published_date'].dt.to_period('M')
        aggregation = df.groupby(['ticker_query','year_month']).agg(
            avg_sentiment_score=('sentiment_weighted','mean'),
            news_count=('title','count'),
            positive_news_count=('numeric_sentiment', lambda s: (s==1).sum()),
            negative_news_count=('numeric_sentiment', lambda s: (s==-1).sum())
        ).sort_values(by=['ticker_query','year_month'], ascending=False)
        print(aggregation.head(15))

if __name__ == "__main__":
    main()


Iniciando pipeline de sentimento.
Loaded input: (2766, 6)
Device selecionado: -1 (0 => gpu, -1 => cpu)
Tentando carregar modelo 'cardiffnlp/twitter-xlm-roberta-base-sentiment' (device=-1) ...
Falha ao carregar 'cardiffnlp/twitter-xlm-roberta-base-sentiment': NameError("name 'torch' is not defined")
Tentando carregar modelo 'lipaoMai/BERT-sentiment-analysis-portuguese-with-undersampling-v2' (device=-1) ...
Falha ao carregar 'lipaoMai/BERT-sentiment-analysis-portuguese-with-undersampling-v2': NameError("name 'torch' is not defined")
Tentando carregar modelo 'pysentimiento/bertweet-pt-sentiment' (device=-1) ...
Falha ao carregar 'pysentimiento/bertweet-pt-sentiment': NameError("name 'torch' is not defined")
Tentando carregar modelo 'nlptown/bert-base-multilingual-uncased-sentiment' (device=-1) ...
Falha ao carregar 'nlptown/bert-base-multilingual-uncased-sentiment': NameError("name 'torch' is not defined")
Tentando carregar modelo 'distilbert-base-uncased-finetuned-sst-2-english' (device=

  df['year_month'] = df['published_date'].dt.to_period('M')


In [28]:
import sys
print("Python:", sys.version.splitlines()[0])
try:
    import torch
    print("Torch:", torch.__version__, "CUDA available:", torch.cuda.is_available())
except Exception as e:
    print("Torch import failed:", e)
try:
    import transformers
    print("Transformers:", transformers.__version__)
except Exception as e:
    print("Transformers import failed:", e)
try:
    from transformers import pipeline
    print("Carregando modelo teste (distilbert)...")
    p = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english', device=-1)
    print("Modelo carregado. Test inference:", p("This is great")[0])
except Exception as e:
    print("Erro carregando/rodando pipeline:", type(e), e)



Python: 3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]
Torch import failed: No module named 'torch'
Transformers: 4.57.3
Carregando modelo teste (distilbert)...
Erro carregando/rodando pipeline: <class 'NameError'> name 'torch' is not defined


In [29]:
from transformers import pipeline
# teste com modelo em pt (publico)
for m in ["pysentimiento/bertweet-pt-sentiment","cardiffnlp/twitter-xlm-roberta-base-sentiment"]:
    try:
        print("Tentando", m)
        p = pipeline('sentiment-analysis', model=m, device=-1)  # use device=0 se GPU
        print(m, "-> ok. exemplo:", p("A empresa divulgou lucro melhor que o esperado.")[0])
    except Exception as e:
        print(m, "falhou:", type(e), e)



Tentando pysentimiento/bertweet-pt-sentiment
pysentimiento/bertweet-pt-sentiment falhou: <class 'NameError'> name 'torch' is not defined
Tentando cardiffnlp/twitter-xlm-roberta-base-sentiment
cardiffnlp/twitter-xlm-roberta-base-sentiment falhou: <class 'NameError'> name 'torch' is not defined
