Data Collection for Clinical Trials

In [None]:
def remove_description_module(data):
    """Strip the ``descriptionModule`` entry from every study record, in place.

    ``data`` is expected to be a list of pages, where each page is a list of
    study dicts that may carry a ``protocolSection`` dict. Anything that does
    not match that shape (a non-list page, a non-dict study, a study whose
    ``protocolSection`` is missing or is not a dict) is left untouched
    instead of raising.

    Args:
        data: List of lists of study dicts. The nested dicts are modified
            in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    for page in data:
        # Pages that are not lists are skipped rather than treated as errors.
        if not isinstance(page, list):
            continue

        for study in page:
            if not isinstance(study, dict):
                continue

            protocol = study.get("protocolSection")
            # Check the type, not just the key: a null or otherwise non-dict
            # protocolSection would make a membership test or delete raise.
            if isinstance(protocol, dict):
                protocol.pop("descriptionModule", None)

    return data

In [None]:
import json
import os
import time

import pandas as pd
import requests

diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy",
    "alzheimer's",
    "influenza",
    "hepatitis",
    "malaria",
    "breast-cancer",
]

# Tunables for the download loop.
MAX_PAGES = 50                 # hard cap on pages fetched per disease
PAGE_DELAY_SECONDS = 10        # pause between consecutive page requests
REQUEST_TIMEOUT_SECONDS = 30   # without a timeout a stalled connection hangs forever

# The output files below are written into this directory; create it up front
# so a missing folder does not discard a completed download.
os.makedirs("data", exist_ok=True)

for disease_to_consider in diseases:

    # Query parameters are rebuilt per disease so that a pageToken left over
    # from the previous disease is never sent with a new query.
    base_url = "https://clinicaltrials.gov/api/v2/studies"
    params = {
        "query.titles": disease_to_consider,
        "pageSize": 100
    }

    # One entry per fetched page; each entry is that page's list of studies.
    all_data = []

    # Follow nextPageToken until the API stops returning one, a request fails,
    # or the page cap is reached.
    for _ in range(MAX_PAGES):
        # Debug aid only: this string is not URL-encoded; requests does the
        # real encoding of `params` when the call is made.
        print("Fetching data from:", base_url + '?' + '&'.join([f"{k}={v}" for k, v in params.items()]))

        try:
            response = requests.get(base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)
        except requests.RequestException as exc:
            # Keep the pages collected so far and move on to the next disease
            # instead of aborting the whole run.
            print("Failed to fetch data. Error:", exc)
            break

        if response.status_code != 200:
            print("Failed to fetch data. Status code:", response.status_code)
            break

        data = response.json()
        studies = data.get('studies') or []  # a missing or null list becomes an empty page
        all_data.append(studies)

        nextPageToken = data.get('nextPageToken')
        if not nextPageToken:
            break  # last page reached
        params['pageToken'] = nextPageToken

        print(len(all_data))
        time.sleep(PAGE_DELAY_SECONDS)

    # Cleaning the collected data
    all_data = remove_description_module(all_data)

    # Print a summary rather than the full payload, which can hold thousands
    # of study records.
    print(f"{disease_to_consider}: {sum(len(page) for page in all_data)} studies in {len(all_data)} pages")

    with open("data/CT_"+disease_to_consider+".json", "w") as json_file:
        json.dump(all_data, json_file)

Data Collection from PubMed

In [None]:
import os

import pandas as pd
from pymed import PubMed

diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy",
    "influenza",
    "breast-cancer",
    "hepatitis",
    "malaria",
    "alzheimer"
]
email = 'xxxxxxxx@gmail.com'
pubmed = PubMed(tool="PubMedSearcher", email=email)
print("\nPubMed Clinical Trials Articles:")

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("data", exist_ok=True)

for disease in diseases:
    disease_to_consider = disease

    # Build the query from the current disease. Previously the term was the
    # fixed string "alzheimer's+clinical+trials", so every output file held
    # the same Alzheimer's results regardless of the disease in its name.
    search_term = "+".join(disease.split() + ["clinical", "trials"])
    results = pubmed.query(search_term, max_results=10000)

    # One flat record per article. Only the fields below are exported; the
    # methods, results, conclusions, copyrights, doi and authors fields are
    # intentionally left out of the CSV.
    articles_data = []

    for article in results:
        article_dict = article.toDict()

        # Drop empty/None entries so the join below cannot raise TypeError.
        keywords = [kw for kw in (article_dict.get("keywords") or []) if kw]

        articles_data.append({
            # Keep only the first line of the id; `or ""` also covers a key
            # that is present but set to None.
            "pubmed_id": (article_dict.get("pubmed_id") or "").partition('\n')[0],
            "title": article_dict.get("title", None),
            "keywords": ", ".join(keywords) if keywords else None,
            "journal": article_dict.get("journal", None),
            "abstract": article_dict.get("abstract", None),
            "publication_date": article_dict.get("publication_date", None),
        })

    df_articles = pd.DataFrame(articles_data)
    print(f"{disease_to_consider}: {len(df_articles)} articles")
    df_articles.to_csv("data/Pubmed_"+disease_to_consider+".csv", index=False)