In [3]:
import os
import sys
import subprocess

# ✅ Create the virtual environment (first run only) and install the packages
VENV_DIR = "nlp4pubmed"
if not os.path.exists(VENV_DIR):
    print("🔧 가상환경 생성 중...")
    # check=True raises CalledProcessError on a non-zero exit status, so the
    # success message below is only printed when the venv was really created.
    subprocess.run([sys.executable, "-m", "venv", VENV_DIR], check=True)
    print("✅ 완료")

# The venv keeps its executables in "Scripts" on Windows and "bin" elsewhere.
PIP = os.path.join(VENV_DIR, "Scripts" if os.name == "nt" else "bin", "pip")
# NOTE(review): this installs into VENV_DIR, which is not necessarily the
# environment the notebook kernel imports from — confirm the kernel uses it.
# Left as the last expression so the CompletedProcess is displayed by the cell.
subprocess.run([PIP, "install", "biopython", "pandas", "beautifulsoup4", "lxml"], check=True)






[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


CompletedProcess(args=['nlp4pubmed/bin/pip', 'install', 'biopython', 'pandas', 'beautifulsoup4', 'lxml'], returncode=0)

In [4]:
pip install bs4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
# ✅ 본 기능 시작
import json
import pandas as pd
from Bio import Entrez
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

# 🔐 API settings
# config.json must contain "email"; "api_key" may be omitted or left empty.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
Entrez.email = config["email"]
# A missing or empty key becomes None instead of raising KeyError.
Entrez.api_key = config.get("api_key") or None


In [7]:

# ✅ Fetch the full text
def fetch_full_text(pmid: str) -> dict:
    """Fetch the PMC full text linked to one PubMed article, best-effort.

    Looks up the ``pubmed_pmc`` link for ``pmid`` with ``Entrez.elink``,
    downloads the linked record with ``Entrez.efetch`` and joins the text of
    every ``<p>`` element found under ``<body>`` with newlines.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        A dict with the keys ``"pmid"``, ``"full_text"`` and
        ``"has_full_text"``. ``"full_text"`` stays ``None`` and
        ``"has_full_text"`` stays ``False`` when there is no PMC link, when no
        paragraph text is found, or when any step fails.

    This function never raises for ordinary errors: failures are reported on
    stderr and the default record is returned, so one bad PMID cannot abort a
    batch run.
    """
    result = {"pmid": pmid, "full_text": None, "has_full_text": False}
    try:
        link = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc")
        try:
            link_result = Entrez.read(link)
        finally:
            # Close the handle even when parsing the response fails.
            link.close()

        # An empty link set is the normal "not in PMC" case, not an error.
        link_sets = link_result[0]["LinkSetDb"] if link_result else []
        if not link_sets or not link_sets[0]["Link"]:
            return result
        pmcid = link_sets[0]["Link"][0]["Id"]

        fetch = Entrez.efetch(db="pmc", id=pmcid, rettype="full", retmode="xml")
        try:
            # The response is requested as XML, so parse it with the XML
            # parser; an HTML parser applies HTML tree rules (implied
            # <html>/<body>), so "body" would not reliably be the article body.
            soup = BeautifulSoup(fetch.read(), "xml")
        finally:
            fetch.close()

        body = soup.find("body")
        if body:
            paras = body.find_all("p")
            full_text = "\n".join(p.get_text(strip=True) for p in paras)
            # Only flag success when some text was actually extracted.
            if full_text:
                result["full_text"] = full_text
                result["has_full_text"] = True
    except Exception as exc:
        # Best-effort: keep the default record, but do not hide the reason.
        print(f"⚠️ PMID {pmid}: full text 수집 실패 ({exc!r})", file=sys.stderr)
    return result

# ✅ Get the list of PMIDs from the user
def load_pmids() -> list[str]:
    """Interactively collect PMIDs, either typed in or read from a file.

    The user first chooses between typing a comma-separated list (``1``) and
    loading a file (``2``). A ``.txt`` file is read one PMID per line; for a
    ``.csv`` file the user is also asked which column holds the PMIDs.

    Returns:
        The PMIDs as stripped, non-empty strings. An empty list is returned
        (after printing a message) for an invalid choice, an unsupported file
        extension, a missing file or a missing CSV column.
    """
    option = input("📥 PMIDs 직접 입력(1) 또는 파일 불러오기(2)? ").strip()

    if option == "1":
        raw = input("🔢 쉼표로 구분된 PMID들을 입력하세요: ")
        return [pmid.strip() for pmid in raw.split(",") if pmid.strip()]

    if option != "2":
        print("❌ 잘못된 선택입니다.")
        return []

    filepath = input("📄 파일 경로를 입력하세요 (.txt or .csv): ").strip()
    # Compare extensions case-insensitively so "IDS.TXT" / "ids.CSV" also work.
    lowered = filepath.lower()
    try:
        if lowered.endswith(".txt"):
            # utf-8-sig drops a leading BOM so it cannot end up in the first PMID.
            with open(filepath, "r", encoding="utf-8-sig") as f:
                return [line.strip() for line in f if line.strip()]
        if lowered.endswith(".csv"):
            col = input("🧾 CSV 파일에서 어떤 열에 pmid가 있나요? ").strip()
            # dtype=str keeps identifiers as text: no "123.0" from float
            # coercion and no literal "nan" for blank cells.
            df = pd.read_csv(filepath, dtype=str)
            if col not in df.columns:
                print(f"❌ '{col}' 열을 찾을 수 없습니다.")
                return []
            return [value.strip() for value in df[col].dropna() if value.strip()]
    except FileNotFoundError:
        print(f"❌ 파일을 찾을 수 없습니다: {filepath}")
        return []

    print("❌ 지원하지 않는 파일 형식입니다.")
    return []

# ✅ Save
def save_results(results: list[dict]):
    """Write the collected records to CSV and JSON under a user-chosen name.

    Prompts for a base file name (without extension), creates
    ``Database/fulltext_by_pmid`` if needed and writes ``<name>.csv``
    (UTF-8 with BOM) and ``<name>.json`` there.

    Args:
        results: Records to save, one dict per article.

    The entered name is sanitised before use: whitespace, path separators and
    characters that are reserved in file names are replaced with ``_``, the
    name is cut to 30 characters, and an empty name falls back to
    ``fulltext_results``.
    """
    df = pd.DataFrame(results)
    out_dir = "Database/fulltext_by_pmid"
    os.makedirs(out_dir, exist_ok=True)
    filename = input("📁 저장할 파일 이름을 입력하세요 (확장자 없이): ").strip()
    # Replacing separators keeps the output inside out_dir even for input
    # such as "../name".
    unsafe = '\\/:*?"<>|'
    base = "".join("_" if ch in unsafe or ch.isspace() else ch for ch in filename)[:30]
    if not base:
        base = "fulltext_results"
    df.to_csv(f"{out_dir}/{base}.csv", index=False, encoding="utf-8-sig")
    df.to_json(f"{out_dir}/{base}.json", force_ascii=False, indent=2)
    print("✅ 저장 완료!")

# ✅ Run: load the PMIDs, fetch every full text in parallel, save the results
if __name__ == "__main__":
    pmids = load_pmids()
    if not pmids:
        print("❌ PMID가 없습니다. 종료합니다.")
        # Stops execution here when there is nothing to fetch.
        sys.exit()

    print(f"🔍 {len(pmids)}개의 논문에서 full text 수집 시작...")

    results = []
    # NOTE(review): 10 workers each issue Entrez requests concurrently —
    # confirm this stays within the request rate NCBI allows for this account.
    with ThreadPoolExecutor(max_workers=10) as executor:
        # executor.map yields results in the order of `pmids`, so the saved
        # rows follow the input order instead of the completion order.
        for result in executor.map(fetch_full_text, pmids):
            if not result["full_text"]:
                # Placeholder text for records without retrievable full text.
                result["full_text"] = "❌ Full text not available"
            results.append(result)

    save_results(results)


🔍 1개의 논문에서 full text 수집 시작...
✅ 저장 완료!
