In [None]:
import json
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup


# Files
INPUT_JSON = "drug_links.json" # If the work is split up, change this to your own input file
#INPUT_CSV = "drug_links.csv"
OUTPUT_JSON = "drogas_texto_vf.json" # If the work is split up, change this to your own output file
#OUTPUT_CSV = "drogas_texto.csv"

# Browser configuration
def create_driver():
    """Create a headless Chrome WebDriver for scraping.

    The matching chromedriver binary is resolved through
    ``ChromeDriverManager().install()``.

    Returns:
        A ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = Options()
    # The ``Options.headless`` convenience setter was deprecated in
    # Selenium 4.8 and removed in later 4.x releases; on those versions
    # assigning it only sets an unused attribute and Chrome opens a visible
    # window. Passing the command-line flag works on every version.
    options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    # Present a regular desktop Chrome user agent instead of the default
    # headless one.
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=service, options=options)

# Scrape the main text of a drug page
def _build_nested_structure(content):
    """Group the headings, paragraphs and lists under *content* into an outline.

    Walks every ``h2``, ``h3``, ``p``, ``ul`` and ``ol`` descendant of
    *content* in document order and nests them:

    * ``h2`` -> ``{"type": "h2", "text": ..., "children": [...]}`` at the root.
    * ``h3`` -> same shape with ``"type": "h3"``, stored in the current h2's
      children (an empty-titled h2 is created if none has been seen yet).
    * ``p``  -> ``{"type": "p", "text": ...}``.
    * ``ul``/``ol`` -> ``{"type": <tag>, "items": [...]}`` with the non-empty
      text of every ``li`` found inside the list.

    Paragraph and list blocks attach to the current h3, else the current h2,
    else the root list.

    NOTE(review): ``find_all`` searches recursively, so a list nested inside
    another list is reported both through the outer list's items and as its
    own block -- confirm whether that duplication matters for the target
    pages.
    """
    structure = []
    current_h2 = None
    current_h3 = None

    for el in content.find_all(["h2", "h3", "p", "ul", "ol"]):
        tag = el.name

        if tag == "h2":
            current_h2 = {"type": "h2", "text": el.get_text(strip=True), "children": []}
            structure.append(current_h2)
            current_h3 = None  # a new h2 closes any open h3 section
            continue

        if tag == "h3":
            if current_h2 is None:
                # No preceding h2: create an untitled one to hold this h3
                current_h2 = {"type": "h2", "text": "", "children": []}
                structure.append(current_h2)
            current_h3 = {"type": "h3", "text": el.get_text(strip=True), "children": []}
            current_h2["children"].append(current_h3)
            continue

        # Remaining tags are content blocks: p, ul or ol
        if tag == "p":
            block = {"type": "p", "text": el.get_text(strip=True)}
        else:
            item_texts = (li.get_text(strip=True) for li in el.find_all("li"))
            block = {"type": tag, "items": [item for item in item_texts if item]}

        # Attach to the innermost open section, falling back to the root
        if current_h3 is not None:
            current_h3["children"].append(block)
        elif current_h2 is not None:
            current_h2["children"].append(block)
        else:
            structure.append(block)

    return structure


def scrape_drug_text_nested(url, driver):
    """Load *url* in *driver* and return the page text as a nested outline.

    Args:
        url: Address of the page to load.
        driver: Browser object providing ``get(url)`` and ``page_source``.

    Returns:
        The list built by ``_build_nested_structure`` from the page's
        ``<div id="content">``. An empty list is returned when that div is
        missing or when any exception is raised while loading or parsing
        (the error is printed, not re-raised, so one bad page does not stop
        the whole run).
    """
    try:
        driver.get(url)
        # Randomised pause after navigation, before the page source is read
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find("div", id="content")
        if content is None:
            return []
        return _build_nested_structure(content)

    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return []

# Load the list of drugs to process
if not os.path.exists(INPUT_JSON):
    print("❌ No se encontró archivo de entrada.")
    drugs = []
else:
    with open(INPUT_JSON, mode="r", encoding="utf-8") as source_file:
        drugs = json.load(source_file)
    print(f"📂 Cargado desde {INPUT_JSON}")
# Disabled alternative: read the links from the CSV export instead.
#    df = pd.read_csv(INPUT_CSV)
#    drugs = df.to_dict(orient="records")
#    print(f"📂 Cargado desde {INPUT_CSV}")

# Resume from earlier progress when the output file already exists
drugs_with_text = []
if os.path.exists(OUTPUT_JSON):
    with open(OUTPUT_JSON, mode="r", encoding="utf-8") as saved_file:
        drugs_with_text = json.load(saved_file)
    print(f"🔁 Reanudando desde {OUTPUT_JSON}")
# URLs that already have an entry in the output; empty on a fresh run
existing_urls = {entry["url"] for entry in drugs_with_text}

# Start the browser
driver = create_driver()

# Main loop
counter = 0
try:
    for drug in drugs:
        # Skip entries that already carry text. The field may be a plain
        # string or the nested list produced by scrape_drug_text_nested, so
        # only call .strip() when it is actually a string.
        previous_text = drug.get("text")
        if isinstance(previous_text, str):
            previous_text = previous_text.strip()
        if previous_text:
            continue  # already processed
        if drug["url"] in existing_urls:
            continue  # already present in the output list

        print(f"🔍 Procesando: {drug['drug_name']}")
        text = scrape_drug_text_nested(drug["url"], driver)
        drug["text"] = text
        drugs_with_text.append(drug)
        existing_urls.add(drug["url"])
        counter += 1

        # Progressive save every 10 newly scraped drugs
        if counter % 10 == 0:
            with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
                json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
            #pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
            print(f"💾 Guardado parcial ({len(drugs_with_text)} drogas con texto)")
            time.sleep(random.uniform(3, 7))  # pause after saving
finally:
    # Final save. Running it in ``finally`` keeps the work done so far when
    # the loop is interrupted or raises, and the browser is always closed
    # even if the save itself fails.
    try:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
        #pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    finally:
        driver.quit()

# Only the JSON file is written (the CSV export is disabled), so the summary
# must not reference the undefined OUTPUT_CSV name.
print(f"\n✅ Finalizado. Total con texto: {len(drugs_with_text)} guardadas en {OUTPUT_JSON}")



📂 Cargado desde drug_links.json
🔁 Reanudando desde drogas_texto_vf.json
🔍 Procesando: Abilify MyCite Maintenance Kit oral with sensor
🔍 Procesando: Abilify MyCite Starter Kit oral with sensor
🔍 Procesando: Abiraterone
🔍 Procesando: Abiraterone Acetate
🔍 Procesando: Abiraterone and niraparib
🔍 Procesando: Abiraterone, micronized
🔍 Procesando: Ablavar
🔍 Procesando: Ablysinol
🔍 Procesando: AbobotulinumtoxinA
🔍 Procesando: Abraxane
💾 Guardado parcial (39 drogas con texto)
🔍 Procesando: Abreva
🔍 Procesando: Abrilada
🔍 Procesando: Abrocitinib
🔍 Procesando: Abrysvo
🔍 Procesando: Absorbine Athletes Foot
🔍 Procesando: Absorbine Jr. Antifungal
🔍 Procesando: Absorica
🔍 Procesando: Absorica LD
🔍 Procesando: Abstral
🔍 Procesando: Abstral Sublingual Tablet
💾 Guardado parcial (49 drogas con texto)
🔍 Procesando: Ac
🔍 Procesando: Ad
🔍 Procesando: Ae
🔍 Procesando: Af
🔍 Procesando: Ag
🔍 Procesando: Ah
🔍 Procesando: Ai
🔍 Procesando: Aj
🔍 Procesando: Ak
🔍 Procesando: Al
💾 Guardado parcial (59 drogas con te