In [1]:
import json
import string
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException
from bs4 import BeautifulSoup
import os

In [4]:
# Primero definís la función (si no lo hiciste antes)
def scrape_drug_text_structured(url, driver):
    import time
    import random
    from bs4 import BeautifulSoup

    try:
        driver.get(url)
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find("div", id="content")
        if not content:
            return []

        blocks = content.find_all(["h2", "h3", "p", "ul", "ol"])
        
        structured_text = []
        
        for b in blocks:
            if b.name in ["h2", "h3", "p"]:
                text = b.get_text(strip=True)
                if text:
                    structured_text.append({"type": b.name, "text": text})
            elif b.name in ["ul", "ol"]:
                items = [li.get_text(strip=True) for li in b.find_all("li") if li.get_text(strip=True)]
                if items:
                    structured_text.append({"type": b.name, "items": items})
        
        return structured_text

    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return []

def create_driver():
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


# Luego creás el driver
driver = create_driver()

# Ahora sí, podés scrapear
text = scrape_drug_text_structured("https://www.drugs.com/lyrica.html", driver)
print(text[:1000])  # mostrar primeros 1000 caracteres

# Cierre del driver (opcional si seguís)
driver.quit()

with open("lyrica.json", "w", encoding="utf-8") as f:
    json.dump(text, f, ensure_ascii=False, indent=2)






In [5]:
def scrape_drug_text_nested(url, driver):
    import time
    import random
    from bs4 import BeautifulSoup

    try:
        driver.get(url)
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find("div", id="content")
        if not content:
            return []

        elements = content.find_all(["h2", "h3", "p", "ul", "ol"])

        structure = []
        current_h2 = None
        current_h3 = None

        for el in elements:
            tag = el.name
            text = el.get_text(strip=True)

            if tag == "h2":
                current_h2 = {"type": "h2", "text": text, "children": []}
                structure.append(current_h2)
                current_h3 = None  # Reset nivel h3 dentro del nuevo h2

            elif tag == "h3":
                if current_h2 is None:
                    # Si no hay h2 previo, lo agregamos como h2 vacío
                    current_h2 = {"type": "h2", "text": "", "children": []}
                    structure.append(current_h2)
                current_h3 = {"type": "h3", "text": text, "children": []}
                current_h2["children"].append(current_h3)

            elif tag in ["p", "ul", "ol"]:
                block = None
                if tag == "p":
                    block = {"type": "p", "text": text}
                else:
                    items = [li.get_text(strip=True) for li in el.find_all("li") if li.get_text(strip=True)]
                    block = {"type": tag, "items": items}

                if current_h3:
                    current_h3["children"].append(block)
                elif current_h2:
                    current_h2["children"].append(block)
                else:
                    # Si no hay h2 ni h3, lo agregamos al nivel raíz
                    structure.append(block)

        return structure

    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return []

def create_driver():
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


# Luego creás el driver
driver = create_driver()

# Ahora sí, podés scrapear
text = scrape_drug_text_nested("https://www.drugs.com/lyrica.html", driver)
print(text[:1000])  # mostrar primeros 1000 caracteres

# Cierre del driver (opcional si seguís)
driver.quit()

with open("lyrica.json", "w", encoding="utf-8") as f:
    json.dump(text, f, ensure_ascii=False, indent=2)



In [15]:
INPUT_JSON = "drug_links.json"
OUTPUT_JSON = "prueba_alt.json"

# Cargar datos base
if os.path.exists(INPUT_JSON):
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        drugs = json.load(f)
else:
    drugs = []

# Cargar progreso previo si existe
drugs_with_text = []
existing_urls = set()

if os.path.exists(OUTPUT_JSON):
    with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
        drugs_with_text = json.load(f)
        existing_urls = {d["url"] for d in drugs_with_text}

print(f"✅ Drogas totales: {len(drugs)}, ya procesadas: {len(existing_urls)}")

# Scraper
def scrape_drug_text_nested_quick(url, driver):
    try:
        driver.get(url)
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find("div", id="content")
        if not content:
            return []

        elements = content.find_all(["h2", "h3", "p", "ul", "ol"])
        structure = []
        current_h2 = None
        current_h3 = None

        for el in elements:
            tag = el.name
            text = el.get_text(strip=True)

            if tag == "h2":
                current_h2 = {"type": "h2", "text": text, "children": []}
                structure.append(current_h2)
                current_h3 = None
            elif tag == "h3":
                if current_h2 is None:
                    current_h2 = {"type": "h2", "text": "", "children": []}
                    structure.append(current_h2)
                current_h3 = {"type": "h3", "text": text, "children": []}
                current_h2["children"].append(current_h3)
            elif tag in ["p", "ul", "ol"]:
                block = {"type": tag}
                if tag == "p":
                    block["text"] = text
                else:
                    items = [li.get_text(strip=True) for li in el.find_all("li") if li.get_text(strip=True)]
                    block["items"] = items

                if current_h3:
                    current_h3["children"].append(block)
                elif current_h2:
                    current_h2["children"].append(block)
                else:
                    structure.append(block)

        return structure

    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return []

# Setup Selenium
def create_driver():
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Iniciar
driver = create_driver()
counter = 0

for drug in drugs:
    if drug["url"] in existing_urls:
        continue  # Ya procesado

    print(f"🔍 Procesando: {drug['drug_name']}")

    text = scrape_drug_text_nested_quick(drug["url"], driver)
    drug["text"] = text
    drugs_with_text.append(drug)
    existing_urls.add(drug["url"])
    counter += 1

    if counter % 2 == 0:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
        print(f"💾 Guardado parcial: {len(drugs_with_text)} drogas procesadas.")
        time.sleep(random.uniform(1, 5))

# Guardado Final
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)

print(f"\n✅ Finalizado. Total drogas guardadas: {len(drugs_with_text)}")

driver.quit()

✅ Drogas totales: 17369, ya procesadas: 0
🔍 Procesando: Professional Monographs
🔍 Procesando: A/B Otic
❌ Error scrapeando https://www.drugs.com/cons/a-b-otic.html: HTTPConnectionPool(host='localhost', port=54513): Read timed out. (read timeout=120)
💾 Guardado parcial: 2 drogas procesadas.
🔍 Procesando: Abacavir
❌ Error scrapeando https://www.drugs.com/mtm/abacavir.html: HTTPConnectionPool(host='localhost', port=54513): Read timed out. (read timeout=120)
🔍 Procesando: Abacavir and lamivudine


KeyboardInterrupt: 

In [11]:
print(text[:10000])

HomeLyrica
Generic name:pregabalin[pre-GAB-a-lin]Other brand namesof pregabalin include: Lyrica,Lyrica CRDrug class:Gamma-aminobutyric acid analogs
Medically reviewed bySophia Entringer, PharmD. Last updated on Nov 18, 2024.
What is Lyrica?
Lyrica was originally FDA approved as an anti-epileptic drug, also called an anticonvulsant. It works by slowing down impulses in the brain that causeseizures. Pregabalin also affects chemicals in the brain that send pain signals across the nervous system.
Lyrica is used to treat pain caused byfibromyalgia, or nerve pain in people with diabetes (diabetic neuropathy),herpes zoster(post-herpeticneuralgia), or spinal cord injury.
Lyrica may also be used for purposes not listed in this medication guide.
Lyrica can cause a severe allergic reaction.Stop taking this medicine and seek emergency medical help if you havehivesor blisters on your skin, trouble breathing, or swelling in your face, mouth, or throat.
Some people have thoughts about suicide while t

In [12]:
print(text)

HomeLyrica
Generic name:pregabalin[pre-GAB-a-lin]Other brand namesof pregabalin include: Lyrica,Lyrica CRDrug class:Gamma-aminobutyric acid analogs
Medically reviewed bySophia Entringer, PharmD. Last updated on Nov 18, 2024.
What is Lyrica?
Lyrica was originally FDA approved as an anti-epileptic drug, also called an anticonvulsant. It works by slowing down impulses in the brain that causeseizures. Pregabalin also affects chemicals in the brain that send pain signals across the nervous system.
Lyrica is used to treat pain caused byfibromyalgia, or nerve pain in people with diabetes (diabetic neuropathy),herpes zoster(post-herpeticneuralgia), or spinal cord injury.
Lyrica may also be used for purposes not listed in this medication guide.
Lyrica can cause a severe allergic reaction.Stop taking this medicine and seek emergency medical help if you havehivesor blisters on your skin, trouble breathing, or swelling in your face, mouth, or throat.
Some people have thoughts about suicide while t

In [13]:
#Pareciera tomar todo


In [None]:
#Considerar limpieza (luego de tomar todo)

In [None]:
import json
import os
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup

# Archivos
INPUT_JSON = "drug_links.json"
INPUT_CSV = "drug_links.csv"
OUTPUT_JSON = "drogas_texto.json"
OUTPUT_CSV = "drogas_texto.csv"

# Configuración del navegador
def create_driver():
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Scraping del texto principal de la droga
def scrape_drug_text(url, driver):
    try:
        driver.get(url)
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")

        content = soup.find("div", id="content")
        if not content:
            return ""

        blocks = content.find_all(["p", "h2", "h3", "ul", "ol"])
        text = "\n".join([b.get_text(strip=True) for b in blocks])
        return text
    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return ""

# Cargar datos previos
if os.path.exists(INPUT_JSON):
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        drugs = json.load(f)
    print(f"📂 Cargado desde {INPUT_JSON}")
elif os.path.exists(INPUT_CSV):
    df = pd.read_csv(INPUT_CSV)
    drugs = df.to_dict(orient="records")
    print(f"📂 Cargado desde {INPUT_CSV}")
else:
    print("❌ No se encontró archivo de entrada.")
    drugs = []

# Ver si ya hay progreso previo en archivo de salida
if os.path.exists(OUTPUT_JSON):
    with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
        drugs_with_text = json.load(f)
    print(f"🔁 Reanudando desde {OUTPUT_JSON}")
    existing_urls = {d["url"] for d in drugs_with_text}
else:
    drugs_with_text = []
    existing_urls = set()

# Iniciar navegador
driver = create_driver()

# Loop principal
counter = 0
for drug in drugs:
    if "text" in drug and drug["text"].strip():
        continue  # ya procesado
    if drug["url"] in existing_urls:
        continue  # ya está en la lista con texto

    print(f"🔍 Procesando: {drug['drug_name']}")
    text = scrape_drug_text(drug["url"], driver)
    drug["text"] = text
    drugs_with_text.append(drug)
    existing_urls.add(drug["url"])
    counter += 1

    # Guardado progresivo cada 2
    if counter % 2 == 0:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
        pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
        print(f"💾 Guardado parcial ({len(drugs_with_text)} drogas con texto)")
        time.sleep(random.uniform(1, 5))  # Pausa después de guardar

# Guardado final
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
driver.quit()
print(f"\n✅ Finalizado. Total con texto: {len(drugs_with_text)} guardadas en {OUTPUT_JSON} y {OUTPUT_CSV}")


📂 Cargado desde drug_links.json
🔁 Reanudando desde drogas_texto.json
🔍 Procesando: Abilify (Aripiprazole Intramuscular)
🔍 Procesando: Abilify Asimtufii
💾 Guardado parcial (24 drogas con texto)
🔍 Procesando: Abilify Asimtufii injection
🔍 Procesando: Abilify Discmelt
💾 Guardado parcial (26 drogas con texto)
🔍 Procesando: Abilify Maintena
🔍 Procesando: Abilify Maintena Prefilled Syringe injection
💾 Guardado parcial (28 drogas con texto)
🔍 Procesando: Abilify Mycite
🔍 Procesando: Abilify MyCite Maintenance Kit oral with sensor
💾 Guardado parcial (30 drogas con texto)
🔍 Procesando: Abilify MyCite Starter Kit oral with sensor
🔍 Procesando: Abiraterone
💾 Guardado parcial (32 drogas con texto)
🔍 Procesando: Abiraterone Acetate
🔍 Procesando: Abiraterone and niraparib
💾 Guardado parcial (34 drogas con texto)
🔍 Procesando: Abiraterone, micronized
🔍 Procesando: Ablavar
💾 Guardado parcial (36 drogas con texto)
🔍 Procesando: Ablysinol
🔍 Procesando: AbobotulinumtoxinA
💾 Guardado parcial (38 drogas co

## ACÁ

In [5]:
# Archivos
INPUT_JSON = "drug_links.json" #Si dividimos, cambiar al propio
#INPUT_CSV = "drug_links.csv"
OUTPUT_JSON = "drogas_texto_vf.json" #Si dividimos, cambiar al propio
#OUTPUT_CSV = "drogas_texto.csv"

# Configuración del navegador
def create_driver():
    options = Options()
    options.headless = True
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                         "AppleWebKit/537.36 (KHTML, like Gecko) "
                         "Chrome/114.0.0.0 Safari/537.36")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Scraping del texto principal de la droga
def scrape_drug_text_nested(url, driver):
    import time
    import random
    from bs4 import BeautifulSoup

    try:
        driver.get(url)
        time.sleep(random.uniform(5, 9))
        soup = BeautifulSoup(driver.page_source, "html.parser")
        content = soup.find("div", id="content")
        if not content:
            return []

        elements = content.find_all(["h2", "h3", "p", "ul", "ol"])

        structure = []
        current_h2 = None
        current_h3 = None

        for el in elements:
            tag = el.name
            text = el.get_text(strip=True)

            if tag == "h2":
                current_h2 = {"type": "h2", "text": text, "children": []}
                structure.append(current_h2)
                current_h3 = None  # Reset nivel h3 dentro del nuevo h2

            elif tag == "h3":
                if current_h2 is None:
                    # Si no hay h2 previo, lo agregamos como h2 vacío
                    current_h2 = {"type": "h2", "text": "", "children": []}
                    structure.append(current_h2)
                current_h3 = {"type": "h3", "text": text, "children": []}
                current_h2["children"].append(current_h3)

            elif tag in ["p", "ul", "ol"]:
                block = None
                if tag == "p":
                    block = {"type": "p", "text": text}
                else:
                    items = [li.get_text(strip=True) for li in el.find_all("li") if li.get_text(strip=True)]
                    block = {"type": tag, "items": items}

                if current_h3:
                    current_h3["children"].append(block)
                elif current_h2:
                    current_h2["children"].append(block)
                else:
                    # Si no hay h2 ni h3, lo agregamos al nivel raíz
                    structure.append(block)

        return structure

    except Exception as e:
        print(f"❌ Error scrapeando {url}: {e}")
        return []

# Cargar datos previos
if os.path.exists(INPUT_JSON):
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        drugs = json.load(f)
    print(f"📂 Cargado desde {INPUT_JSON}")
#elif os.path.exists(INPUT_CSV):
#    df = pd.read_csv(INPUT_CSV)
#    drugs = df.to_dict(orient="records")
#    print(f"📂 Cargado desde {INPUT_CSV}")
else:
    print("❌ No se encontró archivo de entrada.")
    drugs = []

# Ver si ya hay progreso previo en archivo de salida
if os.path.exists(OUTPUT_JSON):
    with open(OUTPUT_JSON, "r", encoding="utf-8") as f:
        drugs_with_text = json.load(f)
    print(f"🔁 Reanudando desde {OUTPUT_JSON}")
    existing_urls = {d["url"] for d in drugs_with_text}
else:
    drugs_with_text = []
    existing_urls = set()

# Iniciar navegador
driver = create_driver()

# Loop principal
counter = 0
for drug in drugs:
    if "text" in drug and drug["text"].strip():
        continue  # ya procesado
    if drug["url"] in existing_urls:
        continue  # ya está en la lista con texto

    print(f"🔍 Procesando: {drug['drug_name']}")
    text = scrape_drug_text_nested(drug["url"], driver)
    drug["text"] = text
    drugs_with_text.append(drug)
    existing_urls.add(drug["url"])
    counter += 1

    # Guardado progresivo cada 2
    if counter % 10 == 0:
        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
        #pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
        print(f"💾 Guardado parcial ({len(drugs_with_text)} drogas con texto)")
        time.sleep(random.uniform(3, 7))  # Pausa después de guardar

# Guardado final
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(drugs_with_text, f, ensure_ascii=False, indent=2)
#pd.DataFrame(drugs_with_text).to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
driver.quit()
print(f"\n✅ Finalizado. Total con texto: {len(drugs_with_text)} guardadas en {OUTPUT_JSON} y {OUTPUT_CSV}")


    
    
    

📂 Cargado desde drug_links.json
🔁 Reanudando desde drogas_texto_vf.json
🔍 Procesando: Ah


KeyboardInterrupt: 