In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pandas as pd
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine
from importlib import reload


In [2]:
import os, sys
from importlib import reload

MODULES_DIR = os.path.join(os.getcwd(), "..", "..", "modules")
sys.path.append(os.path.abspath(MODULES_DIR))

# 1. importe os módulos (não as funções)
import prefilter
import embedder
import help
import prompt_builder

# 2. recarregue apenas os módulos que você editou
reload(prefilter)
reload(embedder)
reload(help)
reload(prompt_builder)

# 3. agora sim importe as funções atualizadas
from prefilter import prefilter_results
from embedder import HuggingFaceEmbedder
from help import estimate_tokens
from prompt_builder import build_prompt


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
load_dotenv()

True

# Pre-filter ddgo

### Get retrieval data

In [16]:
USER = os.getenv("POSTGRES_USER")
PASSWORD = os.getenv("POSTGRES_PASSWORD")
DB = os.getenv("POSTGRES_DB")
PORT = os.getenv("POSTGRES_PORT")
HOST = os.getenv("POSTGRES_HOST", "localhost")

# Criar engine SQLAlchemy
engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}")


In [17]:
df = pd.read_sql("SELECT * FROM retrieved_news_ddgo", engine)

### Filter Data by suffle_id

In [19]:
df_filt = df[df['shuffle_id'].between(0, 1999)]

In [20]:
df_filt['search_title'].nunique()

2000

In [21]:
df_filt.groupby('shuffle_id').size().describe()

count    2000.000000
mean       19.876500
std         1.060099
min         2.000000
25%        20.000000
50%        20.000000
75%        20.000000
max        20.000000
dtype: float64

### Filter retrievel data

In [97]:
embedder = HuggingFaceEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [98]:
top_x = 10  # número de top resultados que queremos

summary_rows = []
filtered_by_title = {}

for title in df_filt['search_title'].unique():
    subset = df_filt[df_filt['search_title'] == title]
    total_before = len(subset)
    
    # remover duplicatas
    subset = subset.drop_duplicates(subset=["original_title"])
    duplicates_removed = total_before - len(subset)
    shuffle_id_val = subset["shuffle_id"].iloc[0]

    results = [
        {
            "refined_title": row["refined_title"],
            "original_title": row["original_title"],
            "domain": row["domain"],
            "snippet": row["snippet"],
            "search_title": row["search_title"],
            "shuffle_id": row["shuffle_id"],
        }
        for _, row in subset.iterrows()
    ]

    # aplica filtro de similaridade
    filtered_results = prefilter_results(
        results=results,
        original_title=title,
        embedder=embedder,
        credible_domains_file='../../out/credible_sources.txt'
    )

    # contar removidos pelo filtro de similaridade
    non_similar_removed = len(subset) - len(filtered_results)

    # agora o total final após duplicatas + filtro de similaridade
    totat_after = len(filtered_results)

    # ordenar por similaridade e pegar top X
    top_results = sorted(filtered_results, key=lambda x: x.get("similarity_score", 0), reverse=True)[:top_x]

    # salva resultados filtrados
    filtered_by_title[title] = top_results

    # adiciona linha ao summary
    summary_rows.append({
        "search_title": title,
        "shuffle_id": shuffle_id_val,
        "total_before": total_before,
        "duplicates_removed": duplicates_removed,
        "non_similar_removed": non_similar_removed,
        "totat_after": totat_after,
        "top_x_count": len(top_results)
    })

summary_df = pd.DataFrame(summary_rows)


In [99]:
summary_df

Unnamed: 0,search_title,shuffle_id,total_before,duplicates_removed,non_similar_removed,totat_after,top_x_count
0,Supreme Court Justice Ginsburg 'regrets' Trump...,0,20,0,1,19,10
1,DOZENS Of GOP Foreign Policy Experts Pledge T...,1,20,2,15,3,3
2,Senate prepares Puerto Rico debt debate amid D...,190,20,2,17,1,1
3,REPORT: Trump Laughed After Woman Was Grabbed...,2,20,0,17,3,3
4,"In North Dakota, Trump finds Democrat willing ...",3,20,0,18,2,2
...,...,...,...,...,...,...,...
1995,Delusional Trump Hilariously Thinks Angela Me...,1995,20,10,1,9,9
1996,"Tired Of Things Going Well, Marco Rubio Makes...",1996,20,0,16,4,4
1997,Japan's biggest warship to drill with U.S. car...,1997,20,0,16,4,4
1998,China's Xi tells Trump two countries must prom...,1998,20,3,7,10,10


In [100]:
summary_df.to_sql(
    "retrieved_news_ddgo_prefilter_stats",
    engine,
    if_exists="replace",
    index=False
)

1000

# Pre-filter Google

### Get retrieval data

In [5]:
USER = os.getenv("POSTGRES_USER")
PASSWORD = os.getenv("POSTGRES_PASSWORD")
DB = os.getenv("POSTGRES_DB")
PORT = os.getenv("POSTGRES_PORT")
HOST = os.getenv("POSTGRES_HOST", "localhost")

# Criar engine SQLAlchemy
engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}")


In [6]:
df_google = pd.read_sql("SELECT * FROM retrieved_news_google", engine)

### Filter data by shuffle_id

In [7]:
df_google_filt = df_google[df_google['shuffle_id'].between(0, 1999)]

In [8]:
df_google_filt['search_title'].nunique()

2000

In [9]:
df_google_filt.groupby('shuffle_id').size().describe()

count    2000.000000
mean        9.537000
std         1.553975
min         1.000000
25%        10.000000
50%        10.000000
75%        10.000000
max        10.000000
dtype: float64

### Filter retrieval data

In [10]:
embedder = HuggingFaceEmbedder(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [27]:
top_x = 10  # número de top resultados que queremos

summary_rows = []
filtered_by_title = {}

# for title in df_google_filt['search_title'].unique():
for title in df_google_filt['search_title'].unique():
    subset = df_google_filt[df_google_filt['search_title'] == title]
    total_before = subset['refined_title'].notna().sum()

    
    # remover duplicatas
    subset = subset.drop_duplicates(subset=["original_title"])
    duplicates_removed = total_before - subset['refined_title'].notna().sum()
    shuffle_id_val = subset["shuffle_id"].iloc[0]

    results = [
        {
            "refined_title": row["refined_title"],
            "original_title": row["original_title"],
            "domain": row["domain"],
            "snippet": row["snippet"],
            "search_title": row["search_title"],
            "shuffle_id": row["shuffle_id"],
        }
        for _, row in subset.iterrows()
    ]
    # aplica filtro de similaridade
    filtered_results = prefilter_results(
        results=results,
        original_title=title,
        embedder=embedder,
        credible_domains_file='../../out/credible_sources.txt'
    )

    # contar removidos pelo filtro de similaridade
    non_similar_removed = subset['refined_title'].notna().sum() - len(filtered_results)

    # agora o total final após duplicatas + filtro de similaridade
    totat_after = len(filtered_results)

    if filtered_results:
        top_results = sorted(
            filtered_results,
            key=lambda x: x.get("similarity_score", 0),
            reverse=True
        )[:top_x]
    else:
        # retorna uma linha mínima mesmo sem resultados
        top_results = [{
            "search_title": title,
            "shuffle_id": shuffle_id_val
        }]
    # salva resultados filtrados
    filtered_by_title[title] = top_results

    # adiciona linha ao summary
    summary_rows.append({
        "search_title": title,
        "shuffle_id": shuffle_id_val,
        "total_before": total_before,
        "duplicates_removed": duplicates_removed,
        "non_similar_removed": non_similar_removed,
        "totat_after": totat_after,
        "top_x_count": len(top_results)
    })

summary_df = pd.DataFrame(summary_rows)


In [28]:
summary_df

Unnamed: 0,search_title,shuffle_id,total_before,duplicates_removed,non_similar_removed,totat_after,top_x_count
0,Supreme Court Justice Ginsburg 'regrets' Trump...,0,10,0,1,9,9
1,REPORT: Trump Laughed After Woman Was Grabbed...,2,10,0,4,6,6
2,WATCH: We Found Donald Trump’s Campaign Theme...,1024,10,0,5,5,5
3,"In North Dakota, Trump finds Democrat willing ...",3,10,1,2,7,7
4,"Flash floods kill five in Malaysia, army deplo...",4,10,1,3,6,6
...,...,...,...,...,...,...,...
1995,‘Tehran’ Tom Cotton Throws Tantrum After Whit...,1971,0,0,0,0,1
1996,Turkey's Erdogan says will not succumb to U.S....,1999,10,1,3,6,6
1997,Angry That Benghazi Panel Couldn’t Bury Hilla...,1002,3,0,0,3,3
1998,Donald Trump Just Threw 60 YEARS Of Inaugural...,1281,10,0,7,3,3


In [29]:
summary_df.to_sql(
    "retrieved_news_google_prefilter_stats",
    engine,
    if_exists="replace",
    index=False
)

1000

# Build Prompts

In [30]:
def generate_prompts(filtered_by_title, mode='test1'):
    rows = []
    for title, filtered_results in filtered_by_title.items():

        shuffle_id = filtered_results[0].get("shuffle_id") if filtered_results else None

        prompt = build_prompt(
            mode=mode,
            title_to_check=title,
            results_filtered=filtered_results
        )

        info = {
            "search_title": title,
            "shuffle_id": shuffle_id,
            "prompt": prompt,
            "num_results": len(filtered_results),
            "approx_tokens": estimate_tokens(prompt),
            "prompt_length_chars": len(prompt),
        }
        rows.append(info)

    df_prompts = pd.DataFrame(rows)
    return df_prompts


In [102]:
df_test1 = generate_prompts(filtered_by_title, mode="test1")

df_test1.to_sql(
    "test1_prompts",
    engine,
    if_exists="replace",   # substitui staging a cada execução
    index=False
)

1000

In [103]:
df_test2 = generate_prompts(filtered_by_title, mode="test2")

df_test2.to_sql(
    "test2_prompts",
    engine,
    if_exists="replace",   # substitui staging a cada execução
    index=False
)

1000

In [18]:
df_test3 = generate_prompts(filtered_by_title, mode="test3")

df_test3.to_sql(
    "test3_prompts",
    engine,
    if_exists="replace",   # substitui staging a cada execução
    index=False
)

1000

In [32]:
filtered_by_title

{"Supreme Court Justice Ginsburg 'regrets' Trump criticisms": [{'refined_title': "U.S. Supreme Court Justice Ginsburg 'regrets' Trump criticisms",
   'original_title': "U.S. Supreme Court Justice Ginsburg 'regrets' Trump criticisms",
   'domain': 'reuters.com',
   'snippet': 'In a CNN interview posted on Tuesday, Ginsburg called the presumptive Republican nominee "a faker." In a separate interview with the New York ...',
   'search_title': "Supreme Court Justice Ginsburg 'regrets' Trump criticisms",
   'shuffle_id': 0,
   'similarity': 0.9972145557403564,
   'credible': True},
  {'refined_title': "Ruth Bader Ginsburg: I Regret 'Ill-Advised' Criticisms of Donald Trump",
   'original_title': "Ruth Bader Ginsburg: I Regret 'Ill-Advised' Criticisms of Donald Trump",
   'domain': 'nbcnews.com',
   'snippet': "'On reflection, my recent remarks in response to press inquiries were ill-advised and I regret making them,' Ginsburg said in a statement.",
   'search_title': "Supreme Court Justice G

In [31]:
df_test4 = generate_prompts(filtered_by_title, mode="test4")


KeyError: 'credible'

In [26]:
df_test4

Unnamed: 0,search_title,shuffle_id,prompt,num_results,approx_tokens,prompt_length_chars
0,Supreme Court Justice Ginsburg 'regrets' Trump...,0.0,\nClassify the news headline as 'fake' or 'rea...,9,918,3672
1,REPORT: Trump Laughed After Woman Was Grabbed...,2.0,\nClassify the news headline as 'fake' or 'rea...,6,745,2981
2,WATCH: We Found Donald Trump’s Campaign Theme...,1024.0,\nClassify the news headline as 'fake' or 'rea...,5,637,2548
3,"In North Dakota, Trump finds Democrat willing ...",3.0,\nClassify the news headline as 'fake' or 'rea...,7,807,3229
4,"Flash floods kill five in Malaysia, army deplo...",4.0,\nClassify the news headline as 'fake' or 'rea...,6,753,3013
...,...,...,...,...,...,...
1995,‘Tehran’ Tom Cotton Throws Tantrum After Whit...,,\nClassify the news headline as 'fake' or 'rea...,0,379,1519
1996,Turkey's Erdogan says will not succumb to U.S....,1999.0,\nClassify the news headline as 'fake' or 'rea...,6,772,3089
1997,Angry That Benghazi Panel Couldn’t Bury Hilla...,1002.0,\nClassify the news headline as 'fake' or 'rea...,3,555,2221
1998,Donald Trump Just Threw 60 YEARS Of Inaugural...,1281.0,\nClassify the news headline as 'fake' or 'rea...,3,583,2332


In [24]:
df_test3[df_test3['shuffle_id'] == '121']

Unnamed: 0,search_title,shuffle_id,prompt,num_results,approx_tokens,prompt_length_chars


In [41]:
df_test3 = generate_prompts(filtered_by_title, mode="test4")

df_test3.to_sql(
    "test4_prompts",
    engine,
    if_exists="replace",   # substitui staging a cada execução
    index=False
)

1000