In [1]:
import os
import time
import math
import requests
import pandas as pd
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def export_clinical_trials_texts(
    search_term="cancer",
    csv_path="clinical_trials_cancer.csv",
    txt_dir="studies_txt",
    page_size=1000,
    batch_size=200,
    checkpoint_every=10,
    user_agent="Mozilla/5.0 (compatible; ctg-bot/1.0)"
):
    """
    Download studies from ClinicalTrials.gov matching a search term and
    save them as one CSV file plus one TXT file per study.

    The export runs in four stages:
      1. page through the API collecting every matching NCT ID;
      2. fetch the detail fields for those IDs in batches;
      3. write the collected rows to ``csv_path`` (with periodic
         checkpoints while fetching);
      4. write the merged text fields of each row to ``txt_dir``.

    Args:
        search_term: value sent as the ``query.term`` API parameter.
        csv_path: destination of the CSV file (also used for checkpoints).
        txt_dir: directory for the per-study TXT files; created if missing.
        page_size: ``pageSize`` used while listing NCT IDs; must be >= 1.
        batch_size: number of NCT IDs per detail request; must be >= 1.
        checkpoint_every: save the CSV every this many batches; a falsy
            value disables checkpoints.
        user_agent: value of the ``User-Agent`` request header.

    Returns:
        pandas.DataFrame with one row per NCT ID and the columns written
        to the CSV.

    Raises:
        ValueError: if ``page_size`` or ``batch_size`` is less than 1.
        RuntimeError: if the NCT ID listing returns a non-200 status.
        requests.RequestException: if the NCT ID listing request itself
            fails. Detail-batch failures are not raised; the affected
            rows are filled with blanks instead.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0)
    # or a silently empty export (negative values) further down.
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size!r}")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    API_BASE = "https://clinicaltrials.gov/api/v2/studies"
    HEADERS = {"User-Agent": user_agent, "Accept": "application/json"}
    # Single source of truth for the CSV layout (checkpoints and final file).
    COLUMNS = [
        "NCTId", "Study Title", "Brief Summary", "Conditions", "Interventions",
        "Primary Outcome Measures", "Secondary Outcome Measures", "Other Outcome Measures",
        "Researcher Description", "Results URL"
    ]

    # ---- session with retries ----
    def make_session():
        """Build a Session that retries GETs on 429/5xx with backoff."""
        s = requests.Session()
        retry = Retry(
            total=5, backoff_factor=0.5,
            status_forcelist=(429, 500, 502, 503, 504),
            allowed_methods=frozenset(["GET"]),
        )
        s.mount("https://", HTTPAdapter(max_retries=retry))
        s.headers.update(HEADERS)
        return s

    # ---- helpers ----
    def join_semicolon(values):
        """Join the truthy items of ``values`` with "; " ("" if empty)."""
        if not values:
            return ""
        return "; ".join(str(v).strip() for v in values if v)

    def outcome_measures(outcomes):
        """Join the ``measure`` of every outcome dict with "; "."""
        if not outcomes:
            return ""
        return "; ".join(
            o.get("measure") for o in outcomes
            if isinstance(o, dict) and o.get("measure")
        )

    def interventions_fmt(interventions):
        """Format interventions as "TYPE: name" (or just "name"), "; "-joined."""
        if not interventions:
            return ""
        items = []
        for iv in interventions:
            # Same guard as outcome_measures: skip entries that are not
            # dicts instead of crashing on ``.get``.
            if not isinstance(iv, dict):
                continue
            name = iv.get("name")
            typ = iv.get("type")
            if name and typ:
                items.append(f"{typ}: {name}")
            elif name:
                items.append(name)
        return join_semicolon(items)

    def blank_rows(nct_batch):
        """Placeholder rows (ID only) for a batch whose details could not be fetched."""
        return [{
            "NCTId": n,
            "Study Title": "", "Brief Summary": "", "Conditions": "", "Interventions": "",
            "Primary Outcome Measures": "", "Secondary Outcome Measures": "", "Other Outcome Measures": "",
            "Researcher Description": "", "Results URL": None
        } for n in nct_batch]

    # ---- 1) all NCT IDs ----
    def fetch_all_nct_ids(session):
        """Page through the search results and return the sorted unique NCT IDs."""
        params = {
            "format": "json",
            "pageSize": page_size,
            "fields": "protocolSection.identificationModule.nctId",
            "query.term": search_term,
            "countTotal": "true",
        }
        ncts, page_token, page = [], None, 0
        while True:
            # The first request must not carry a pageToken; later ones use
            # the token returned by the previous page.
            if page_token:
                params["pageToken"] = page_token
            else:
                params.pop("pageToken", None)
            r = session.get(API_BASE, params=params, timeout=60)
            if r.status_code != 200:
                raise RuntimeError(f"NCT IDs error {r.status_code}: {r.text[:500]}")
            data = r.json()
            studies = data.get("studies", []) or []
            page += 1
            print(f"[nct_ids] page {page}, items: {len(studies)}")
            for s in studies:
                nct = (s.get("protocolSection", {})
                        .get("identificationModule", {})
                        .get("nctId"))
                if nct:
                    ncts.append(nct)
            page_token = data.get("nextPageToken")
            if not page_token:
                break
            time.sleep(0.1)
        unique_sorted = sorted(set(ncts))
        print(f"[nct_ids] total unique: {len(unique_sorted)}")
        return unique_sorted

    # ---- 2) details per batch ----
    DETAIL_FIELDS = ",".join([
        "protocolSection.identificationModule.nctId",
        "protocolSection.identificationModule.briefTitle",
        "protocolSection.identificationModule.officialTitle",
        "protocolSection.descriptionModule.briefSummary",
        "protocolSection.descriptionModule.detailedDescription",
        "protocolSection.conditionsModule.conditions",
        "protocolSection.armsInterventionsModule.interventions",
        "protocolSection.outcomesModule.primaryOutcomes",
        "protocolSection.outcomesModule.secondaryOutcomes",
        "protocolSection.outcomesModule.otherOutcomes",
        "hasResults",
    ])

    def fetch_details_for_batch(session, nct_batch):
        """Return one row dict per ID in ``nct_batch`` (blank rows on failure)."""
        params = {
            "format": "json",
            "pageSize": len(nct_batch),
            "query.id": ",".join(nct_batch),
            "fields": DETAIL_FIELDS,
        }
        try:
            r = session.get(API_BASE, params=params, timeout=60)
        except requests.RequestException as exc:
            # Exhausted retries, timeouts and connection errors surface as
            # exceptions rather than as a response; treat them like any
            # other failed batch so one bad batch does not abort the run.
            print(f"[details] request failed ({exc}), filling blanks for {len(nct_batch)} ids")
            return blank_rows(nct_batch)
        if r.status_code != 200:
            print(f"[details] error {r.status_code}, filling blanks for {len(nct_batch)} ids")
            return blank_rows(nct_batch)

        data = r.json()
        studies = data.get("studies", []) or []
        by_id = {(
            s.get("protocolSection", {})
             .get("identificationModule", {})
             .get("nctId")
        ): s for s in studies if s.get("protocolSection")}

        rows = []
        # Iterate over the requested IDs (not the response) so every ID
        # gets a row even when the API returned nothing for it.
        for nct in nct_batch:
            s = by_id.get(nct, {})
            ps = s.get("protocolSection", {}) if s else {}

            ident = ps.get("identificationModule", {}) or {}
            descm = ps.get("descriptionModule", {}) or {}
            condm = ps.get("conditionsModule", {}) or {}
            aim = ps.get("armsInterventionsModule", {}) or {}
            outm = ps.get("outcomesModule", {}) or {}

            title = ident.get("briefTitle") or ident.get("officialTitle") or ""
            brief = (descm.get("briefSummary") or "").strip()
            detailed = (descm.get("detailedDescription") or "").strip()
            conditions = join_semicolon(condm.get("conditions"))
            interventions = interventions_fmt(aim.get("interventions"))
            primary = outcome_measures(outm.get("primaryOutcomes"))
            secondary = outcome_measures(outm.get("secondaryOutcomes"))
            other = outcome_measures(outm.get("otherOutcomes"))
            has_results = bool(s.get("hasResults"))
            results_url = f"https://clinicaltrials.gov/study/{nct}?tab=results" if has_results else None

            rows.append({
                "NCTId": nct,
                "Study Title": title,
                "Brief Summary": brief,
                "Conditions": conditions,
                "Interventions": interventions,
                "Primary Outcome Measures": primary,
                "Secondary Outcome Measures": secondary,
                "Other Outcome Measures": other,
                "Researcher Description": detailed,
                "Results URL": results_url,
            })
        return rows

    # ---- 3) main loop ----
    all_rows = []
    # The context manager closes the session's connections even if a
    # request raises part-way through.
    with make_session() as session:
        nct_ids = fetch_all_nct_ids(session)

        batches = math.ceil(len(nct_ids) / batch_size)
        for i in range(batches):
            batch = nct_ids[i*batch_size:(i+1)*batch_size]
            print(f"[details] batch {i+1}/{batches} size {len(batch)}")
            rows = fetch_details_for_batch(session, batch)
            all_rows.extend(rows)
            if checkpoint_every and ((i + 1) % checkpoint_every == 0):
                pd.DataFrame(all_rows, columns=COLUMNS).to_csv(csv_path, index=False, encoding="utf-8-sig")
                print(f"[checkpoint] saved {len(all_rows)} rows -> {csv_path}")
            time.sleep(0.1)

    df = pd.DataFrame(all_rows, columns=COLUMNS)
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"[done] saved {csv_path}: {len(df)} rows")

    # ---- 4) TXT export ----
    os.makedirs(txt_dir, exist_ok=True)

    def merge_fields(row):
        """Join the non-blank text fields of a row, one per line."""
        fields = [
            row.get("Study Title", ""),
            row.get("Brief Summary", ""),
            row.get("Conditions", ""),
            row.get("Interventions", ""),
            row.get("Primary Outcome Measures", ""),
            row.get("Secondary Outcome Measures", ""),
            row.get("Other Outcome Measures", ""),
            row.get("Researcher Description", ""),
        ]
        return "\n".join(str(f) for f in fields if str(f).strip())

    written = 0
    for idx, row in df.iterrows():
        text = merge_fields(row)
        # Rows with no text at all (e.g. blank-filled batches) get no file.
        if text.strip():
            # File name is the 1-based row position, zero-padded to 5 digits.
            fname = f"{idx+1:05d}.txt"
            with open(os.path.join(txt_dir, fname), "w", encoding="utf-8") as f:
                f.write(text)
            written += 1

    # Report the files actually written, not the number of rows.
    print(f"[texts] saved {written} txt files -> {txt_dir}")
    return df


In [2]:
# Run the export with all default arguments (search term "cancer",
# CSV -> clinical_trials_cancer.csv, TXT files -> ./studies_txt).
export_clinical_trials_texts()

[nct_ids] page 1, items: 1000
[nct_ids] page 2, items: 1000
[nct_ids] page 3, items: 1000
[nct_ids] page 4, items: 1000
[nct_ids] page 5, items: 1000
[nct_ids] page 6, items: 1000
[nct_ids] page 7, items: 1000
[nct_ids] page 8, items: 1000
[nct_ids] page 9, items: 1000
[nct_ids] page 10, items: 1000
[nct_ids] page 11, items: 1000
[nct_ids] page 12, items: 1000
[nct_ids] page 13, items: 1000
[nct_ids] page 14, items: 1000
[nct_ids] page 15, items: 1000
[nct_ids] page 16, items: 1000
[nct_ids] page 17, items: 1000
[nct_ids] page 18, items: 1000
[nct_ids] page 19, items: 1000
[nct_ids] page 20, items: 1000
[nct_ids] page 21, items: 1000
[nct_ids] page 22, items: 1000
[nct_ids] page 23, items: 1000
[nct_ids] page 24, items: 1000
[nct_ids] page 25, items: 1000
[nct_ids] page 26, items: 1000
[nct_ids] page 27, items: 1000
[nct_ids] page 28, items: 1000
[nct_ids] page 29, items: 1000
[nct_ids] page 30, items: 1000
[nct_ids] page 31, items: 1000
[nct_ids] page 32, items: 1000
[nct_ids] page 33

Unnamed: 0,NCTId,Study Title,Brief Summary,Conditions,Interventions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Researcher Description,Results URL
0,NCT00000105,Vaccination With Tetanus and KLH to Assess Imm...,The purpose of this study is to learn how the ...,Cancer,BIOLOGICAL: Intracel KLH Vaccine; BIOLOGICAL: ...,To assess whether patients can mediate an appr...,Tetanus Response,,Patients will receive each vaccine once only c...,
1,NCT00000106,41.8 Degree Centigrade Whole Body Hyperthermia...,Recently a non-toxic system for whole body hyp...,Rheumatic Diseases,DEVICE: Whole body hyperthermia unit,,,,,
2,NCT00000124,Collaborative Ocular Melanoma Study (COMS),To evaluate therapeutic interventions for pati...,Choroid Neoplasms; Uveitis,PROCEDURE: Brachytherapy; PROCEDURE: Eye Removal,,,,"For more than 100 years, removal of the eye (e...",
3,NCT00000134,Studies of the Ocular Complications of AIDS (S...,To compare the relative merits of three therap...,HIV Infections; Acquired Immunodeficiency Synd...,DRUG: Ganciclovir; DRUG: Foscarnet,Morbidity,,,CMV retinitis is the most common intraocular i...,https://clinicaltrials.gov/study/NCT00000134?t...
4,NCT00000136,Studies of the Ocular Complications of AIDS (S...,To evaluate the relative safety and efficacy o...,HIV Infections; Cytomegalovirus Retinitis,DRUG: Foscarnet; DRUG: Ganciclovir,Mortality,,,CMV retinitis is the most common intraocular i...,https://clinicaltrials.gov/study/NCT00000136?t...
...,...,...,...,...,...,...,...,...,...,...
132741,NCT07150793,A Study of Real-world Treatment and Burden of ...,The main goal of this study was to describe tr...,"Myeloid Leukemia, Chronic",,Number of Patients by Line of Treatment (LoT) ...,Annual Incidence of Inpatient and Outpatient V...,,,
132742,NCT07150806,RYZ101 for the Treatment of Progressive or Rec...,"This phase I/II tests the safety, side effects...",Recurrent Meningioma,DRUG: Actinium Ac 225 DOTATATE RYZ101; PROCEDU...,Progression free survival (PFS),Objective response rate (ORR); Overall surviva...,,PRIMARY OBJECTIVE:\n\nI. To estimate the effic...,
132743,NCT07150884,Lactase-Assisted Control Trial On Weight GAin ...,"Lactose, a disaccharide that includes the mono...",Premature - Weight 1000g-2499g or Gestation of...,DIETARY_SUPPLEMENT: Lactase,Weigh Gain,Body length; Head circumference,Biochemistry; Feeding intolerance,"Lactose, a disaccharide that includes the mono...",
132744,NCT07150949,Long-course Chemoradiotherapy or Short-course ...,"SERRAC is a prospective, multicentre, randomiz...",Locally Advanced Rectal Cancer; Neoadjuvant Th...,RADIATION: Long-course radiotherapy; DRUG: Oxa...,Complete response (CR) rate,Grade 3-4 adverse effects rate; 3 year anal pr...,,,
