Consultas utilizadas para la API de extracción de artículos científicos 

In [None]:
# Search terms sent one by one to the article-search API as the free-text query.
# NOTE(review): the collection cell further down reassigns QUERIES to a
# three-item list before looping, so this full list is only used if that
# reassignment is removed.
QUERIES = [
    "machine learning",
    "artificial intelligence",
    "climate change",
    "renewable energy",
    "genomics",
    "CRISPR",
    "quantum computing",
    "robotics",
    "internet of things",
    "blockchain",
    "natural language processing",
    "computer vision",
    "cybersecurity",
    "human-computer interaction",
    "biomedical engineering",
    "neuroscience",
    "materials science",
    "nanotechnology",
    "agriculture technology",
    "autonomous vehicles",
    "edge computing",
    "smart cities",
    "5G networks",
    "digital twins",
    "data privacy",
    "carbon capture",
    "fusion energy",
    "e-waste management",
    "sustainable development",
    "bioinformatics",
    "ecosystem restoration",
    "renewable hydrogen",
    "space exploration",
    "satellite data analysis",
    "mental health technologies",
    "disease modeling",
    "covid-19 vaccine",
    "epigenetics",
    "artificial general intelligence",
    "solar energy forecasting",
    "digital pathology",
    "quantum cryptography",
    "explainable AI",
    "precision agriculture",
    "computational fluid dynamics",
    "digital phenotyping",
    "neuroprosthetics",
    "climate resilient crops",
    "bioinspired robotics",
    "computational neuroscience",
    "digital therapeutics",
    "computational chemistry",
    "synthetic biology",
    "ocean acidification",
    "wearable biosensors",
    "environmental DNA",
    "federated learning",
    "molecular diagnostics",
    "human microbiome",
    "plastic degradation",
    "cyber-physical systems",
    "ecoacoustics",
    "quantum sensors",
    "computational linguistics",
    "geomatics",
    "urban heat islands",
    "space weather modeling",
    "digital biomarkers",
    "smart manufacturing",
    "metaverse technologies",
    "livestock genomics",
    "AI in healthcare",
    "bioelectronic medicine",
    "AI for education",
    "digital agriculture",
    "energy harvesting materials",
    "self-healing materials",
    "AI-driven drug discovery",
    "extreme weather prediction",
    "quantum machine learning"
]


Utilizamos la API de Crossref para extraer información de artículos científicos y sus DOI.

In [None]:
import os
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from io import BytesIO
import pandas as pd
import time
from dotenv import load_dotenv
from typing import List, Dict, Optional

load_dotenv()

# Settings; the contact e-mail comes from the environment (.env file).
EMAIL = os.getenv("email")
MAX_PER_QUERY = 250
TOTAL_MAX = 2000
YEARS_BACK = 5

# Control files: DOIs already recorded and the CSV of saved papers.
VISITED_DOIS_FILE = "dois_visitados.txt"
OUTPUT_FILE = "papers_guardados.csv"

# Load the DOIs recorded by earlier runs (one per line).
existing_dois = set()
if os.path.exists(VISITED_DOIS_FILE):
    with open(VISITED_DOIS_FILE, "r") as dois_file:
        for raw_line in dois_file:
            existing_dois.add(raw_line.strip())

# Load the rows saved by earlier runs as a list of dicts.
results = []
if os.path.exists(OUTPUT_FILE):
    df_existente = pd.read_csv(OUTPUT_FILE)
    results = df_existente.to_dict(orient="records")

# Running count of saved papers, starting from what is already on disk.
total_collected = len(results)

def _parse_crossref_item(item: Dict) -> Optional[Dict]:
    """Flatten one Crossref work record into the dict stored by the pipeline.

    Returns None when the record is not tagged as English, has no DOI, or its
    DOI is already present in the module-level ``existing_dois`` set.
    """
    language = item.get("language", "en")
    if language != "en":
        return None

    doi = item.get("DOI")
    if not doi or doi in existing_dois:
        return None

    # The "title" key can be present but hold an empty list; indexing it
    # directly would raise IndexError.
    titles = item.get("title") or [""]
    title = titles[0]

    authors = item.get("author") or []
    authors_str = ", ".join(
        f"{a.get('given', '')} {a.get('family', '')}" for a in authors
    )

    # Prefer the print date, fall back to the online date; tolerate null values.
    published_info = item.get("published-print") or item.get("published-online") or {}
    date_parts = published_info.get("date-parts") or []
    published = "-".join(map(str, date_parts[0])) if date_parts else ""

    abstract = item.get("abstract", "")
    if abstract:
        # Abstracts arrive with markup; keep only the text.
        abstract = BeautifulSoup(abstract, "html.parser").get_text()

    return {
        "titulo": title,
        "autores": authors_str,
        "publicado": published,
        "idioma": language,
        "doi": doi,
        "url": f"https://doi.org/{doi}",
        "abstract": abstract,
    }


def fetch_crossref_papers(query: str, max_results: int = 100, years_back: int = 5, offset: int = 0) -> List[Dict]:
    """Search Crossref for recent works matching ``query``.

    Args:
        query: Free-text search string.
        max_results: Number of rows requested from the API for this page.
        years_back: Only works published within this many years (counted as
            365-day years) are requested.
        offset: Index of the first row to return, for paging.

    Returns:
        A list of dicts with keys ``titulo``, ``autores``, ``publicado``,
        ``idioma``, ``doi``, ``url`` and ``abstract``. Records that are not
        English, lack a DOI, or whose DOI is already in ``existing_dois`` are
        left out. An empty list is returned when the request fails.
    """
    url = "https://api.crossref.org/works"
    fecha_limite = (datetime.now() - timedelta(days=years_back * 365)).strftime("%Y-%m-%d")

    params = {
        'query': query,
        'rows': max_results,
        'offset': offset,
        'filter': f"from-pub-date:{fecha_limite}"
    }
    if EMAIL:
        # Identify the client to Crossref with the configured contact address.
        params['mailto'] = EMAIL

    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error en Crossref: {e}")
        return []

    message = data.get("message") if isinstance(data, dict) else None
    items = (message or {}).get("items") or []

    papers = []
    for item in items:
        try:
            paper = _parse_crossref_item(item)
        except (AttributeError, IndexError, KeyError, TypeError) as e:
            # One malformed record must not discard the rest of the page.
            print(f"❌ Error en Crossref: {e}")
            continue
        if paper is not None:
            papers.append(paper)
    return papers

def get_open_access_pdf(doi: str, email: str = EMAIL) -> Optional[str]:
    """Look up an open-access PDF URL for ``doi`` through Unpaywall.

    Args:
        doi: DOI of the work to look up.
        email: Contact address sent as the ``email`` query parameter.

    Returns:
        The ``url_for_pdf`` of the best open-access location, or None when the
        work is not reported as open access, no PDF URL is given, or the
        request fails.
    """
    url = f"https://api.unpaywall.org/v2/{doi}"
    try:
        # Bound the wait so one slow lookup cannot stall the whole run.
        response = requests.get(url, params={"email": email}, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"❌ Error en Unpaywall: {e}")
        return None

    if not isinstance(data, dict) or not data.get("is_oa"):
        return None

    # The key may be present with a null value, so a `.get` default is not enough.
    best_location = data.get("best_oa_location") or {}
    return best_location.get("url_for_pdf")

def extract_text_from_pdf(pdf_url: str) -> Optional[str]:
    """Download the PDF at ``pdf_url`` and return its text.

    Returns None when the response is not served as ``application/pdf`` or
    when downloading or parsing raises; any such error is printed.
    """
    try:
        resp = requests.get(pdf_url, timeout=10)
        resp.raise_for_status()

        content_type = resp.headers.get("Content-Type", "")
        if "application/pdf" in content_type:
            with BytesIO(resp.content) as buffer:
                page_texts = []
                for page in PdfReader(buffer).pages:
                    # Pages with no extractable text contribute an empty string.
                    page_texts.append(page.extract_text() or "")
                return "\n".join(page_texts).strip()
        return None
    except Exception as err:
        print(f"❌ Error leyendo PDF: {err}")
        return None

def find_direct_pdf_link(url: str) -> Optional[str]:
    """Return the first link on the page at ``url`` whose href contains ".pdf".

    The href is resolved against ``url`` so relative links become absolute.
    Returns None when no such link exists or the page cannot be fetched.
    """
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()

        anchors = BeautifulSoup(page.text, "html.parser").find_all("a", href=True)
        first_pdf_href = next(
            (a["href"] for a in anchors if ".pdf" in a["href"].lower()),
            None,
        )
        if first_pdf_href is not None:
            return requests.compat.urljoin(url, first_pdf_href)
    except Exception as err:
        print(f"❌ Error buscando PDF manualmente: {err}")
    return None

# Main queries.
# NOTE(review): this assignment replaces any QUERIES list defined earlier in
# the notebook; remove it to iterate over that list instead.
QUERIES = ["machine learning", "climate change", "quantum computing"]

# Rows requested from Crossref per page.
PAGE_SIZE = 100

for query in QUERIES:
    # Stop before announcing further queries once the global cap is reached.
    if total_collected >= TOTAL_MAX:
        break

    print(f"\n🔍 Consultando: {query}")
    for offset in range(0, MAX_PER_QUERY, PAGE_SIZE):
        if total_collected >= TOTAL_MAX:
            break

        # Shrink the last page so a query never requests more than
        # MAX_PER_QUERY rows in total (e.g. 100 + 100 + 50 for 250).
        rows = min(PAGE_SIZE, MAX_PER_QUERY - offset)
        papers = fetch_crossref_papers(query=query, max_results=rows, years_back=YEARS_BACK, offset=offset)

        with open(VISITED_DOIS_FILE, "a") as f_dois:
            for paper in papers:
                if total_collected >= TOTAL_MAX:
                    break

                doi = paper["doi"]
                if doi in existing_dois:
                    continue

                # Remember the DOI for this run even if no text is obtained.
                existing_dois.add(doi)
                landing_url = paper["url"]
                pdf_url = get_open_access_pdf(doi)

                # Fall back to scanning the landing page for a PDF link.
                if not pdf_url:
                    pdf_url = find_direct_pdf_link(landing_url)

                content = extract_text_from_pdf(pdf_url) if pdf_url else None
                paper["contenido"] = content

                if content:
                    results.append(paper)
                    f_dois.write(doi + "\n")
                    # Flush so the DOI file stays in step with the CSV, which
                    # is written per paper, if the run is interrupted.
                    f_dois.flush()
                    total_collected += 1
                    print(f"✅ ({total_collected}) {paper['titulo'][:60]}...")
                    pd.DataFrame([paper]).to_csv(
                        OUTPUT_FILE,
                        mode="a",
                        # Write the header only when creating the file.
                        header=not os.path.exists(OUTPUT_FILE),
                        index=False,
                        encoding='utf-8',
                        errors='ignore'
                    )
                else:
                    print(f"⚠️ Sin contenido: {paper['titulo'][:60]}")

                # Pause between papers to avoid hammering the remote services.
                time.sleep(1)

print(f"\n✅ Total de papers útiles recopilados: {total_collected}")

Comprobamos que no existan artículos extraídos sin contenido.

In [None]:
import os
import pandas as pd

OUTPUT_FILE = "papers_guardados.csv"

# Rewrite the CSV in place, keeping only rows that have extracted text.
if os.path.exists(OUTPUT_FILE):
    df_existente = pd.read_csv(OUTPUT_FILE)

    content_col = df_existente["contenido"]
    # A row is kept when its content is neither missing nor an empty string.
    has_text = content_col.notna() & content_col.ne("")
    df_limpio = df_existente.loc[has_text]
    df_limpio.to_csv(OUTPUT_FILE, index=False)

    print(f"Archivo limpiado guardado.: {len(df_limpio)}")
