In [None]:
import pandas as pd
import numpy as np
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine
from importlib import reload


In [None]:
import os, sys
from importlib import reload

# Make the local `modules` directory (two levels above the current working
# directory) importable by appending its absolute path to sys.path.
MODULES_DIR = os.path.join(os.getcwd(), "..", "..", "modules")
sys.path.append(os.path.abspath(MODULES_DIR))

# 1. import the modules themselves (not the functions)
# NOTE(review): `import help` rebinds the name `help` in this namespace, and a
# later cell rebinds `embedder` to a HuggingFaceEmbedder instance.
import prefilter_ner
import embedder
import help
import prompt_builder

# 2. reload only the modules you have edited
reload(prefilter_ner)
reload(embedder)
reload(help)
reload(prompt_builder)

# 3. only now import the refreshed functions
from prefilter_ner import prefilter_results
from embedder import HuggingFaceEmbedder
from help import estimate_tokens
from prompt_builder import build_prompt


In [None]:
# Load environment variables via python-dotenv; the POSTGRES_* settings read
# with os.getenv further down are presumably defined in a .env file.
load_dotenv()

# Pre-filter ddgo

### Get retrieval data

In [None]:
from urllib.parse import quote

# PostgreSQL connection settings, read from environment variables.
USER = os.getenv("POSTGRES_USER")
PASSWORD = os.getenv("POSTGRES_PASSWORD")
DB = os.getenv("POSTGRES_DB")
PORT = os.getenv("POSTGRES_PORT")
HOST = os.getenv("POSTGRES_HOST", "localhost")

# Fail early with a readable message instead of silently building a URL that
# contains the literal text "None" for an unset variable.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("POSTGRES_USER", USER),
        ("POSTGRES_DB", DB),
        ("POSTGRES_PORT", PORT),
    )
    if env_value is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# Create the SQLAlchemy engine.
# User and password are percent-encoded so that characters such as '@', ':',
# '/' or '#' inside the credentials cannot be mistaken for URL delimiters.
engine = create_engine(
    f"postgresql://{quote(USER, safe='')}:{quote(PASSWORD or '', safe='')}@{HOST}:{PORT}/{DB}"
)


In [None]:
# Load every row and column of the retrieved_news_ddgo table into a DataFrame.
df = pd.read_sql("SELECT * FROM retrieved_news_ddgo", engine)
df

### Filter Data by shuffle_id

In [None]:
# Keep only rows whose shuffle_id lies in [0, 1999]; Series.between is
# inclusive on both ends by default.
df_filt = df[df['shuffle_id'].between(0, 1999)]

In [None]:
# Number of distinct search titles in the filtered subset.
df_filt['search_title'].nunique()

In [None]:
# Summary statistics of the number of rows per shuffle_id.
df_filt.groupby('shuffle_id').size().describe()

### NER

In [None]:
from transformers import pipeline

class EntityComparator:
    """Extract named entities with a NER pipeline and compare two entity sets."""

    def __init__(self, model_name="dslim/bert-base-NER", grouped=True):
        """
        Build the NER pipeline.

        Args:
            model_name: model identifier handed to ``pipeline``.
            grouped: forwarded to ``pipeline`` as ``grouped_entities``.
        """
        self.ner = pipeline(
            "ner",
            model=model_name,
            grouped_entities=grouped
        )

    def extract_entities(self, text):
        """
        Run the NER pipeline on ``text`` and return the set of the ``"word"``
        values of the entities it reports.
        """
        ents = self.ner(text)
        return {e["word"] for e in ents}

    def ner_score(self, ents1, ents2):
        """
        Score, between 0 and 1, the fraction of ``ents1`` that also appears in
        ``ents2``.

        Rule:
        - score = len(intersection) / len(ents1)
        - if ``ents1`` is empty, the score is 1.0

        Args:
            ents1: entities of the reference text (a set, or any iterable of
                hashable items).
            ents2: entities of the candidate text (same accepted types).

        Returns:
            dict with keys ``"score"`` (float), ``"ents_text1"``,
            ``"ents_text2"`` and ``"intersection"`` (lists).
        """
        # Accept lists/tuples as well; sets are used as-is so that their
        # iteration order is left untouched.
        if not isinstance(ents1, (set, frozenset)):
            ents1 = set(ents1)
        if not isinstance(ents2, (set, frozenset)):
            ents2 = set(ents2)

        # Special case: nothing to match against. The entities of the second
        # text are still reported instead of being discarded.
        if len(ents1) == 0:
            return {
                "score": 1.0,
                "ents_text1": [],
                "ents_text2": list(ents2),
                "intersection": []
            }

        intersection = ents1.intersection(ents2)

        score = len(intersection) / len(ents1)

        return {
            "score": score,
            "ents_text1": list(ents1),
            "ents_text2": list(ents2),
            "intersection": list(intersection)
        }

# Comparator instance used by the filtering loop below; constructing it builds
# the NER pipeline with the default model name.
cmp = EntityComparator()

### Filter retrieval data

In [None]:
# Embedder instance passed to prefilter_results in the loop below.
# NOTE(review): this rebinds the name `embedder`, which previously referred to
# the imported `embedder` module.
embedder = HuggingFaceEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
top_x = 10  # number of top results to keep per title (consumed by the scoring cell)

# Columns copied from every retrieved row into the dicts handed to the pre-filter.
result_columns = [
    "refined_title",
    "original_title",
    "domain",
    "snippet",
    "search_title",
    "shuffle_id",
]

summary_rows = []
filtered_by_title = {}

# One pass over the frame: groupby(sort=False) yields the titles in order of
# first appearance (like .unique()) without re-scanning the whole frame for
# every title, and it skips rows whose search_title is missing, which would
# otherwise produce an empty subset and fail on .iloc[0].
for idx, (title, group) in enumerate(df_filt.groupby("search_title", sort=False)):
    print(idx)
    total_before = len(group)

    # drop results that repeat the same original title
    subset = group.drop_duplicates(subset=["original_title"])
    duplicates_removed = total_before - len(subset)
    shuffle_id_val = subset["shuffle_id"].iloc[0]

    results = subset[result_columns].to_dict("records")

    # apply the similarity filter
    filtered_results = prefilter_results(
        results=results,
        original_title=title,
        embedder=embedder,
        credible_domains_file='../../out/credible_sources.txt'
    )

    # annotate every surviving result with its entity overlap against the query title
    ents1 = cmp.extract_entities(title)
    for result in filtered_results:
        ents2 = cmp.extract_entities(result['refined_title'])
        details = cmp.ner_score(ents1, ents2)
        result['query'] = title
        result['NER_score'] = details['score']
        result['NER_intersection'] = details['intersection']
        result['ents_text1'] = details['ents_text1']
        result['ents_text2'] = details['ents_text2']
        result['NER_count'] = len(details['ents_text1'])

    # results dropped by the similarity filter
    non_similar_removed = len(subset) - len(filtered_results)

    # what is left after de-duplication + similarity filter
    total_after = len(filtered_results)

    # keep the filtered results
    filtered_by_title[title] = filtered_results

    # add one row to the summary
    # (the "totat_after" key spelling is kept as-is in case it is consumed elsewhere)
    summary_rows.append({
        "search_title": title,
        "shuffle_id": shuffle_id_val,
        "total_before": total_before,
        "duplicates_removed": duplicates_removed,
        "non_similar_removed": non_similar_removed,
        "totat_after": total_after,
    })

In [None]:
import json

# Write the per-title filtered results to resultados1.json as UTF-8;
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open("resultados1.json", "w", encoding="utf-8") as f:
    json.dump(filtered_by_title, f, indent=4, ensure_ascii=False)

In [None]:
import numpy as np

# Weights of the combined score: `alpha` scales the "similarity" value,
# `beta` scales the "NER_score" value.
alpha, beta = 0.8, 0.2

final = {}

for query, candidates in filtered_by_title.items():
    # Attach the combined score to every candidate (the dicts are updated in place).
    for cand in candidates:
        cand["score"] = alpha*cand["similarity"] + beta*cand["NER_score"]

    # Use the strictest threshold that still leaves at least one candidate;
    # when no threshold is met, fall back to all candidates.
    selected = candidates[:]
    for threshold in (0.85, 0.80, 0.70):
        passing = [cand for cand in candidates if cand["score"] >= threshold]
        if passing:
            selected = passing
            break

    # Best score first, capped at top_x entries.
    ranked = sorted(selected, key=lambda cand: cand["score"], reverse=True)
    final[query] = ranked[:top_x]

## Build Prompts

In [None]:
def generate_prompts(filtered_by_title, mode='test1'):
    """
    Build one prompt per search title and collect the prompts in a DataFrame.

    Args:
        filtered_by_title: mapping of search title -> list of result dicts.
        mode: forwarded to ``build_prompt``.

    Returns:
        DataFrame with one row per title and the columns ``search_title``,
        ``shuffle_id``, ``prompt``, ``num_results``, ``approx_tokens`` and
        ``prompt_length_chars``.

    Raises:
        KeyError: if a title has no results and is not present in the global
            ``df_filt`` either.
    """
    # title -> shuffle_id lookup taken from the global df_filt. It is only
    # needed for titles without any result, so it is built lazily and at most
    # once, instead of converting the whole frame to a dict for each such title.
    fallback_shuffle_ids = None

    rows = []
    for title, filtered_results in filtered_by_title.items():

        if filtered_results:
            shuffle_id = filtered_results[0].get("shuffle_id")
        else:
            if fallback_shuffle_ids is None:
                fallback_shuffle_ids = (
                    df_filt.set_index('search_title')['shuffle_id'].to_dict()
                )
            shuffle_id = fallback_shuffle_ids[title]

        prompt = build_prompt(
            mode=mode,
            title_to_check=title,
            results_filtered=filtered_results
        )

        rows.append({
            "search_title": title,
            "shuffle_id": shuffle_id,
            "prompt": prompt,
            "num_results": len(filtered_results),
            "approx_tokens": estimate_tokens(prompt),
            "prompt_length_chars": len(prompt),
        })

    df_prompts = pd.DataFrame(rows)
    return df_prompts

In [None]:
# Build the prompts from the score-ranked results in `final`.
df_test1 = generate_prompts(final, mode="test1")
# Sanity check: number of distinct titles that received a prompt.
print(len(df_test1.search_title.unique()))
# Write the prompts to the test_ner table; if_exists="replace" drops any
# existing table of that name before inserting, on every run.
df_test1.to_sql(
    "test_ner",
    engine,
    if_exists="replace",
    index=False
)
df_test1