In [16]:
import pandas as pd
# Read the combined PubMed export into a DataFrame, then preview its first rows.
pubmed = pd.read_csv(filepath_or_buffer='data/all_pubmed.csv')
pubmed.head(n=5)

Unnamed: 0,pubmed_id,title,keywords,journal,abstract,methods,results,conclusions,publication_date,category
0,40073121,Targeting the NPY/NPY1R signaling axis in muta...,,Science advances,Pancreatic cancer (PC) is a highly metastatic ...,,,,2025-03-12,Pubmed_Pancreatic_Cancer.csv
1,40069621,The value of preoperative RDW for post-pancrea...,"Pancreatic ductal adenocarcinoma, Post-pancrea...",BMC cancer,Pancreatic ductal adenocarcinoma (PDAC) is a h...,,A total of 2268 patients were analyzed. We fou...,The preoperative RDW may be a useful marker fo...,2025-03-12,Pubmed_Pancreatic_Cancer.csv
2,40069616,"Protocol of the IMPACT study: randomized, mult...","Atezolizumab, Bevacizumab, Conversion, Hepatoc...",BMC cancer,Atezolizumab plus bevacizumab is recommended a...,,,,2025-03-12,Pubmed_Pancreatic_Cancer.csv
3,40066089,Association between human leukocyte antigen E ...,"HLA-E, cancer, human leukocyte antigen, immuno...",Frontiers in oncology,Immunotherapy has gained momentum with the dis...,,"After screening 657 articles, 11 studies were ...",This systematic review highlights that HLA-E e...,2025-03-11,Pubmed_Pancreatic_Cancer.csv
4,40065459,Oncological and Survival Endpoints in Cancer C...,"adverse events, cachexia, cancer, clinical tri...","Journal of cachexia, sarcopenia and muscle","In patients receiving anti-cancer treatment, c...",,"Fifty-seven trials were eligible, totalling 97...","In CC trials, oncological endpoints were mostl...",2025-03-11,Pubmed_Pancreatic_Cancer.csv


In [17]:
import pandas as pd
import re
import json

# Function to extract NCT IDs safely
def extract_nct_ids_from_abstract(abstract):
    """
    Extract the distinct NCT IDs mentioned in an abstract.

    Parameters
    ----------
    abstract : object
        Abstract text. Any value that is not a ``str`` (for example ``None``
        or a float NaN standing in for a missing field) is treated as
        having no text.

    Returns
    -------
    list of str
        Every match of ``NCT`` followed by one or more digits, in order of
        first appearance. Repeated mentions of the same ID are collapsed to
        a single entry so one article never lists a trial twice.
    """
    if not isinstance(abstract, str):  # Non-text input cannot be searched
        return []
    # dict.fromkeys drops repeats while preserving first-seen order
    # (dicts keep insertion order), unlike a plain set().
    return list(dict.fromkeys(re.findall(r'NCT\d+', abstract)))

# Function to link PubMed articles to Clinical Trials
def link_pubmed_to_trials(pubmed_df):
    """
    Build PubMed-to-trial links from the NCT IDs found in each abstract.

    Walks ``pubmed_df`` row by row, reads the ``pubmed_id`` column and the
    ``abstract`` column (a missing ``abstract`` entry is treated as an empty
    string), and keeps only the rows whose abstract yields at least one
    NCT ID.

    Returns a list of dicts shaped like
    ``{"PubMed_ID": <pubmed_id>, "NCT_IDs": [<nct id>, ...]}``.
    """
    # Pair every article id with the IDs extracted from its abstract;
    # pubmed_id is read first for each row, before extraction runs.
    id_and_trials = (
        (record['pubmed_id'], extract_nct_ids_from_abstract(record.get('abstract', '')))
        for _, record in pubmed_df.iterrows()
    )

    # Rows without any extracted NCT ID are dropped from the result.
    return [
        {"PubMed_ID": article_id, "NCT_IDs": trial_ids}
        for article_id, trial_ids in id_and_trials
        if trial_ids
    ]

# Build the PubMed -> NCT links for every article in the loaded frame
linked_data = link_pubmed_to_trials(pubmed)

# Write the links to disk as JSON indented by four spaces
output_file = "data/linked_pubmed_nct_ids.json"
with open(output_file, mode='w') as sink:
    json.dump(linked_data, sink, indent=4)

print(f"Data saved in {output_file}")


Data saved in data/linked_pubmed_nct_ids.json


In [13]:
import pandas as pd
import json

# Load the processed clinical-trial table for the common diseases
all_disease = pd.read_csv('data/CT_all_common_disease_processed.csv')
print(all_disease.head())

# Read the PubMed -> NCT links from the same path the extraction cell writes
# to ("data/linked_pubmed_nct_ids.json"), so a fresh top-to-bottom run does
# not pick up a missing or stale copy from the working directory. The
# context manager closes the file handle deterministically.
with open('data/linked_pubmed_nct_ids.json') as links_file:
    pubmed_links = json.load(links_file)
print(pubmed_links[:5])

# Create a mapping of NCT ID to PubMed ID.
# NOTE: if several articles mention the same trial, the last one listed in
# pubmed_links wins, because each assignment overwrites the previous entry.
nct_to_pubmed = {}
for entry in pubmed_links:
    pubmed_id = str(entry["PubMed_ID"])  # Ensure string format
    for nct_id in entry["NCT_IDs"]:
        nct_to_pubmed[nct_id] = pubmed_id

# Map NCT ID to PubMed ID; trials without a linked article get ""
all_disease["Associated Article ID"] = all_disease["NCT ID"].map(nct_to_pubmed).fillna("")

# Create binary column indicating association
all_disease["Associated Article?"] = all_disease["Associated Article ID"].apply(lambda x: "YES" if x else "NO")

# Display result
print(all_disease)

# Count with a boolean mask rather than value_counts()[...]: indexing
# value_counts() raises KeyError when a category ("YES" or "NO") never
# occurs, whereas summing the mask simply reports 0.
has_article = all_disease["Associated Article?"].eq("YES")
print("Count of all common diseases:", len(all_disease))
print("Number of trials with associated articles:", int(has_article.sum()))
print("Number of trials not having any associated articles:", int((~has_article).sum()))



        NCT ID  Acronym         Overall Status  Start Date  \
0  NCT03116126    NorAD  ACTIVE_NOT_RECRUITING  2019-01-04   
1  NCT04137926  Unknown                UNKNOWN  2020-03-01   
2  NCT02537626  Unknown              COMPLETED  2018-03-15   
3  NCT05531526  Unknown             RECRUITING  2022-12-23   
4  NCT00297362  Unknown              COMPLETED     2004-06   

            Conditions                      Interventions  \
0    Alzheimer Disease                Guanfacine, Placebo   
1  Alzheimer's Disease                    MicRNAs battery   
2  Alzheimer's Disease  Erchonia ALS Laser, Placebo Laser   
3    Alzheimer Disease                    AR1001, Placebo   
4    Alzheimer Disease           Galantamine hydrobromide   

                                           Locations Primary Completion Date  \
0                            London - United Kingdom              2024-08-05   
1                                   Shanghai - China              2022-08-30   
2             Zapopa

In [14]:
import pandas as pd
import json

# Load the processed clinical-trial table for the rare diseases
all_disease = pd.read_csv('data/CT_all_rare_disease_processed.csv')
print(all_disease.head())

# Read the PubMed -> NCT links from the same path the extraction cell writes
# to ("data/linked_pubmed_nct_ids.json"), so a fresh top-to-bottom run does
# not pick up a missing or stale copy from the working directory. The
# context manager closes the file handle deterministically.
with open('data/linked_pubmed_nct_ids.json') as links_file:
    pubmed_links = json.load(links_file)
print(pubmed_links[:5])

# Create a mapping of NCT ID to PubMed ID.
# NOTE: if several articles mention the same trial, the last one listed in
# pubmed_links wins, because each assignment overwrites the previous entry.
nct_to_pubmed = {}
for entry in pubmed_links:
    pubmed_id = str(entry["PubMed_ID"])  # Ensure string format
    for nct_id in entry["NCT_IDs"]:
        nct_to_pubmed[nct_id] = pubmed_id

# Map NCT ID to PubMed ID; trials without a linked article get ""
all_disease["Associated Article ID"] = all_disease["NCT ID"].map(nct_to_pubmed).fillna("")

# Create binary column indicating association
all_disease["Associated Article?"] = all_disease["Associated Article ID"].apply(lambda x: "YES" if x else "NO")

# Display result
print(all_disease)

# Count with a boolean mask rather than value_counts()[...]: indexing
# value_counts() raises KeyError when a category ("YES" or "NO") never
# occurs, whereas summing the mask simply reports 0.
has_article = all_disease["Associated Article?"].eq("YES")
print("Count of all rare diseases:", len(all_disease))
print("Number of trials with associated articles:", int(has_article.sum()))
print("Number of trials not having any associated articles:", int((~has_article).sum()))



        NCT ID   Acronym Overall Status    Start Date  \
0  NCT00005926   Unknown      COMPLETED       2000-06   
1  NCT01094626   Unknown      WITHDRAWN       2010-04   
2  NCT00253526   Unknown      WITHDRAWN  Unknown Date   
3  NCT00003426   Unknown      COMPLETED       1998-04   
4  NCT03469726  DIA-PANC        UNKNOWN    2017-12-22   

                                          Conditions  \
0             Pancreatic Cancer, Pancreatic Neoplasm   
1  Pancreatic Cancer, Intraductal Papillary Mucin...   
2  Adenocarcinoma of the Pancreas, Recurrent Panc...   
3                                  Pancreatic Cancer   
4                               Pancreatic Neoplasms   

                                       Interventions  \
0          Gemcitabine, Herceptin, Radiation therapy   
1                                           Secretin   
2  bevacizumab, gemcitabine hydrochloride, adjuva...   
3       gemcitabine hydrochloride, radiation therapy   
4           Contrast-enhanced Diffusion-