Consultas utilizadas para la API de extracción de artículos científicos 

In [None]:
# Search terms for the article crawl. The main loop further down iterates this
# list in order and passes each entry as the `query` argument of
# fetch_crossref_papers, so the order here is the order topics are crawled in.
QUERIES = [
    "machine learning",
    "artificial intelligence",
    "climate change",
    "renewable energy",
    "genomics",
    "CRISPR",
    "quantum computing",
    "robotics",
    "internet of things",
    "blockchain",
    "natural language processing",
    "computer vision",
    "cybersecurity",
    "human-computer interaction",
    "biomedical engineering",
    "neuroscience",
    "materials science",
    "nanotechnology",
    "agriculture technology",
    "autonomous vehicles",
    "edge computing",
    "smart cities",
    "5G networks",
    "digital twins",
    "data privacy",
    "carbon capture",
    "fusion energy",
    "e-waste management",
    "sustainable development",
    "bioinformatics",
    "ecosystem restoration",
    "renewable hydrogen",
    "space exploration",
    "satellite data analysis",
    "mental health technologies",
    "disease modeling",
    "covid-19 vaccine",
    "epigenetics",
    "artificial general intelligence",
    "solar energy forecasting",
    "digital pathology",
    "quantum cryptography",
    "explainable AI",
    "precision agriculture",
    "computational fluid dynamics",
    "digital phenotyping",
    "neuroprosthetics",
    "climate resilient crops",
    "bioinspired robotics",
    "computational neuroscience",
    "digital therapeutics",
    "computational chemistry",
    "synthetic biology",
    "ocean acidification",
    "wearable biosensors",
    "environmental DNA",
    "federated learning",
    "molecular diagnostics",
    "human microbiome",
    "plastic degradation",
    "cyber-physical systems",
    "ecoacoustics",
    "quantum sensors",
    "computational linguistics",
    "geomatics",
    "urban heat islands",
    "space weather modeling",
    "digital biomarkers",
    "smart manufacturing",
    "metaverse technologies",
    "livestock genomics",
    "AI in healthcare",
    "bioelectronic medicine",
    "AI for education",
    "digital agriculture",
    "energy harvesting materials",
    "self-healing materials",
    "AI-driven drug discovery",
    "extreme weather prediction",
    "quantum machine learning"
]


Utilizamos la API de Crossref para extraer información de artículos científicos y sus DOIs.

In [None]:
import os
from datetime import datetime, timedelta
import requests
import pandas as pd
from io import BytesIO
from PyPDF2 import PdfReader
from bs4 import BeautifulSoup
import time

# Contact e-mail read from the environment; used as the default `email`
# argument of get_open_access_pdf.
EMAIL = os.getenv("email")

# Crawl limits
MAX_PER_QUERY = 250
TOTAL_MAX = 2000
YEARS_BACK = 5

# Files used to persist progress
VISITED_DOIS_FILE = "dois_visitados.txt"
OUTPUT_FILE = "papers_guardados.csv"

# DOIs recorded by a previous run (one per line); empty when there is no file yet
existing_dois = set()
if os.path.exists(VISITED_DOIS_FILE):
    with open(VISITED_DOIS_FILE, "r") as visited_file:
        existing_dois = set(visited_file.read().splitlines())

# Papers already saved to the CSV: keep the rows and their DOIs in memory
if not os.path.exists(OUTPUT_FILE):
    df_existente, dois_guardados, results = pd.DataFrame(), set(), []
else:
    df_existente = pd.read_csv(OUTPUT_FILE)
    dois_guardados = set(df_existente["doi"])
    results = df_existente.to_dict(orient="records")

# Running count of saved papers, starting from what is already on disk
total_collected = len(results)


def _crossref_item_to_paper(item):
    """Flatten one Crossref work record into the dict stored for each paper.

    Missing, null or empty fields fall back to empty values instead of raising.
    """
    # `title` is a list in Crossref records; it can be absent or empty
    titles = item.get("title") or [""]
    doi = item.get("DOI")

    authors = item.get("author") or []
    authors_str = ", ".join(f"{a.get('given', '')} {a.get('family', '')}" for a in authors)

    # Prefer the print date and fall back to the online date
    published_info = item.get("published-print") or item.get("published-online") or {}
    date_parts = published_info.get("date-parts") or []
    published = "-".join(str(part) for part in date_parts[0] if part is not None) if date_parts else ""

    abstract = item.get("abstract") or ""
    if abstract:
        # Abstracts arrive as markup; keep only the text
        abstract = BeautifulSoup(abstract, "html.parser").get_text()

    return {
        "title": titles[0],
        "doi": doi,
        "landing_page": f"https://doi.org/{doi}",
        "authors": authors_str,
        "published": published,
        "abstract": abstract,
        "language": item.get("language", "en"),
    }


def fetch_crossref_papers(query="machine learning", max_results=5, years_back=5, offset=0,dois=None):
    """Search Crossref for recent English-language works matching a query.

    Args:
        query (str): Free-text search query.
        max_results (int): Maximum number of records requested for this page.
        years_back (int): Only works published within this many years
            (counted as 365-day years) are requested.
        offset (int): Number of results to skip (pagination).
        dois (set | list | None): Collection of DOIs already visited. Works whose
            DOI is in it are skipped, and every returned DOI is added to it.
            When None, the module-level ``existing_dois`` set is used.

    Returns:
        list[dict]: One dict per new paper with the keys ``title``, ``doi``,
        ``landing_page``, ``authors``, ``published``, ``abstract`` and
        ``language``. Empty when the request fails.
    """
    # Honour the caller-supplied registry; fall back to the module-level one
    visited = existing_dois if dois is None else dois
    mark_visited = visited.add if hasattr(visited, "add") else visited.append

    url = "https://api.crossref.org/works"

    fecha_limite = (datetime.now() - timedelta(days=years_back * 365)).strftime("%Y-%m-%d")

    params = {
        'query': query,
        'rows': max_results,
        'offset': offset,
        'filter': f"from-pub-date:{fecha_limite}",
    }

    # Best-effort boundary: a failed page yields no papers instead of aborting the crawl
    try:
        # A timeout keeps a stalled connection from hanging the crawl
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        items = (response.json().get("message") or {}).get("items") or []
    except Exception as e:
        print(f"❌ Error en Crossref: {e}")
        return []

    papers = []
    for item in items:
        # Records without a language tag are assumed to be English
        if item.get("language", "en") != "en":
            continue

        doi = item.get("DOI")
        if not doi or doi in visited:
            continue

        # Isolate failures per record so one malformed item cannot discard the page
        try:
            paper = _crossref_item_to_paper(item)
        except Exception as e:
            print(f"❌ Error en Crossref: {e}")
            continue

        mark_visited(doi)
        papers.append(paper)
    return papers

def get_open_access_pdf(doi, email=EMAIL):
    """Look up an open-access PDF URL for a DOI through the Unpaywall API.

    Args:
        doi (str): DOI of the article.
        email (str): Contact e-mail sent as the ``email`` query parameter.

    Returns:
        str | None: URL of a PDF when Unpaywall reports the work as open access
        and lists a PDF location; otherwise None (also on request errors).
    """
    url = f"https://api.unpaywall.org/v2/{doi}"
    params = {"email": email}
    # Best-effort boundary: any lookup failure just means "no PDF found"
    try:
        # A timeout keeps a stalled connection from hanging the crawl
        data = requests.get(url, params=params, timeout=10).json()
    except Exception as e:
        print(f"❌ Error en Unpaywall: {e}")
        return None

    if not isinstance(data, dict) or not data.get("is_oa"):
        return None

    # `best_oa_location` may be null, so guard before reading from it
    best_location = data.get("best_oa_location") or {}
    if best_location.get("url_for_pdf"):
        return best_location["url_for_pdf"]

    # The best location may only have a landing page; try the other OA locations
    for location in data.get("oa_locations") or []:
        if location and location.get("url_for_pdf"):
            return location["url_for_pdf"]
    return None

def find_direct_pdf_link(url):
    """Try to find a direct PDF link in a web page.

    Looks first for a ``citation_pdf_url`` meta tag, then for the first anchor
    whose href contains ``.pdf``.

    Args:
        url (str): URL of the page to inspect (redirects are followed).

    Returns:
        str | None: Absolute URL of the PDF, or None when the page is not HTML,
        no candidate link is found, or the request fails.
    """
    # Best-effort boundary: arbitrary publisher pages can fail in many ways
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        if "text/html" not in response.headers.get("Content-Type", ""):
            return None

        # Relative links must be resolved against the page actually served:
        # doi.org URLs redirect to the publisher's site.
        base_url = response.url or url
        soup = BeautifulSoup(response.text, "html.parser")

        meta = soup.find("meta", attrs={"name": "citation_pdf_url"})
        if meta is not None and meta.get("content"):
            return requests.compat.urljoin(base_url, meta["content"])

        for link in soup.find_all("a", href=True):
            if ".pdf" in link["href"].lower():
                return requests.compat.urljoin(base_url, link["href"])
    except Exception as e:
        print(f"❌ PDF manual falló: {e}")
    return None

def extract_text_from_pdf(pdf_url):
    """Download a PDF and return the text of all its pages.

    Args:
        pdf_url (str): URL of the PDF file.

    Returns:
        str | None: Page texts joined by newlines and stripped (may be an empty
        string when no page yields text). None when the response is not a PDF
        or when downloading/parsing fails.
    """
    # Best-effort boundary: download and PDF parsing can fail in many ways
    try:
        response = requests.get(pdf_url, timeout=10)
        response.raise_for_status()

        payload = response.content
        content_type = response.headers.get("Content-Type", "").lower()
        # Some servers send PDFs with a generic Content-Type, so also accept a
        # payload that carries the %PDF- header near its start.
        if "application/pdf" not in content_type and b"%PDF-" not in payload[:1024]:
            return None

        with BytesIO(payload) as f:
            reader = PdfReader(f)
            # Pages without extractable text are dropped before joining
            return "\n".join(filter(None, (p.extract_text() for p in reader.pages))).strip()
    except Exception as e:
        print(f"❌ Error leyendo PDF: {e}")
        return None

# Crawl every query page by page until TOTAL_MAX papers with text are stored.
page_size = 100
for query in QUERIES:
    # Stop the whole crawl, not just the current query, once the cap is reached
    if total_collected >= TOTAL_MAX:
        break

    print(f"\n🔍 Consultando: {query}")
    for offset in range(0, MAX_PER_QUERY, page_size):
        if total_collected >= TOTAL_MAX:
            break

        # Shrink the last page so no more than MAX_PER_QUERY results are requested
        rows = min(page_size, MAX_PER_QUERY - offset)
        papers = fetch_crossref_papers(query=query, max_results=rows, years_back=YEARS_BACK, offset=offset, dois=existing_dois)

        with open(VISITED_DOIS_FILE, "a") as f_dois:
            for paper in papers:
                doi = paper["doi"]
                # Already stored in the CSV by a previous run
                if doi in dois_guardados:
                    continue
                existing_dois.add(doi)
                landing_url = paper["landing_page"]

                # Ask Unpaywall first; fall back to scraping the landing page
                pdf_url = get_open_access_pdf(doi)
                if not pdf_url:
                    pdf_url = find_direct_pdf_link(landing_url)

                paper["pdf_url"] = pdf_url
                content = extract_text_from_pdf(pdf_url) if pdf_url else None
                paper["contenido"] = content

                if content:
                    results.append(paper)
                    dois_guardados.add(doi)
                    # NOTE: only DOIs that produced text are written to the file,
                    # so papers without content are retried on the next run.
                    f_dois.write(doi + "\n")
                    total_collected += 1

                    print(f"✅ ({total_collected}) {paper['title'][:60]}...")

                    # Append each paper immediately so progress survives an interruption;
                    # the header is written only when the file is first created.
                    pd.DataFrame([paper]).to_csv(OUTPUT_FILE, mode="a", header=not os.path.exists(OUTPUT_FILE), index=False, encoding='utf-8', errors='ignore')
                else:
                    print(f"⚠️ Sin contenido: {paper['title'][:60]}")

                if total_collected >= TOTAL_MAX:
                    break

                time.sleep(1)  # Throttle requests to avoid being banned

print(f"\n✅ Total de papers útiles recopilados: {total_collected}")


Comprobamos que no existan artículos extraídos sin contenido.

In [None]:
import os
import pandas as pd

OUTPUT_FILE = "papers_guardados.csv"

# Rewrite the CSV in place, keeping only rows whose `contenido` is neither
# missing nor an empty string.
if os.path.exists(OUTPUT_FILE):
    df_existente = pd.read_csv(OUTPUT_FILE)

    contenido = df_existente["contenido"]
    tiene_contenido = contenido.notna() & contenido.ne("")
    df_limpio = df_existente.loc[tiene_contenido]
    df_limpio.to_csv(OUTPUT_FILE, index=False)

    print(f"Archivo limpiado guardado.: {len(df_limpio)}")
