In [6]:
import json
import pandas as pd
import os

# Function to read a JSON file
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document at ``file_path``.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Disease cohorts to process; each entry is read from "data/<name>.json".
diseases = [
    "CT_pancreatic cancer",
    "CT_chagas disease",
    "CT_endometriosis",
    "CT_drug resistant tuberculosis",
    "CT_duchenne muscular dystrophy",
]

# One DataFrame per disease, concatenated once after the loop instead of
# growing a frame with pd.concat on every iteration.
disease_frames = []

for disease_to_consider in diseases:
    file_path = "data/" + disease_to_consider + ".json"
    studies = read_json_file(file_path)
    data_list = []

    # Label clean-up does not depend on the study, so do it once per disease
    # rather than once per record.
    disease_name = disease_to_consider.replace("_clinical_trials", "")
    disease_label = disease_name.replace(" ", "_")

    # The loaded JSON is iterated as a list of batches, each a list of studies.
    for batch_study in studies:
        for study in batch_study:
            protocol = study['protocolSection']
            identification = protocol['identificationModule']
            status = protocol['statusModule']
            design = protocol['designModule']
            lead_sponsor = protocol["sponsorCollaboratorsModule"]["leadSponsor"]

            # Best-effort: a missing or malformed conditions module falls back
            # to the placeholder. Only data-shape errors are caught, so
            # KeyboardInterrupt and real bugs still surface.
            try:
                conditions = ', '.join(
                    protocol['conditionsModule'].get('conditions', ['No conditions listed'])
                )
            except (KeyError, TypeError, AttributeError):
                conditions = "No conditions listed"

            # Interventions: comma-separated names, or a placeholder when none.
            interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
            if interventions_list:
                interventions = ', '.join(
                    intervention.get('name', 'No intervention name listed')
                    for intervention in interventions_list
                )
            else:
                interventions = "No interventions listed"

            # Locations: comma-separated "city - country" pairs.
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            if locations_list:
                locations = ', '.join(
                    f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                    for location in locations_list
                )
            else:
                locations = "No locations listed"

            # Keep only the last listed phase; EARLY_PHASE1 is folded into PHASE1.
            phase_values = design.get('phases', ['Not Available'])
            phases = phase_values[-1].strip() if phase_values else ""
            if phases == "EARLY_PHASE1":
                phases = "PHASE1"

            data_list.append({
                "NCT ID": identification.get('nctId', 'Unknown'),
                "Acronym": identification.get('acronym', 'Unknown'),
                "Overall Status": status.get('overallStatus', 'Unknown'),
                "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
                "Conditions": conditions,
                "Interventions": interventions,
                "Locations": locations,
                "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
                "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
                "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
                "Study Type": design.get('studyType', 'Unknown'),
                "Phases": phases,
                "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
                "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
                "Disease": disease_label,
            })

    print(f"Processed {len(data_list)} records for {disease_name}")
    df = pd.DataFrame(data_list)
    disease_frames.append(df)

# Single concat over all per-disease frames; fall back to an empty frame so
# the CSV is still written when nothing was processed.
master_df = pd.concat(disease_frames, ignore_index=True) if disease_frames else pd.DataFrame()

os.makedirs("data/", exist_ok=True)
# NOTE: the "procesed" spelling is kept as-is so anything that already reads
# this file by name keeps working.
master_df.to_csv("data/CT_all_rare_disease_procesed.csv", index=False)
print(len(master_df))

Processed 2580 records for CT_pancreatic cancer
Processed 55 records for CT_chagas disease
Processed 589 records for CT_endometriosis
Processed 60 records for CT_drug resistant tuberculosis
Processed 364 records for CT_duchenne muscular dystrophy
3648


In [3]:
import json
import pandas as pd
import os

# Function to read a JSON file
def read_json_file(file_path):
    """Load a JSON file from disk.

    The file at ``file_path`` is opened as UTF-8 text and decoded with
    ``json.load``; the decoded object is returned unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        parsed = json.load(json_file)
        return parsed


# Disease cohorts to process; each entry is read from "data/<name>.json".
diseases = [
    "CT_alzheimer's",
    "CT_influenza",
    "CT_breast-cancer",
    "CT_hepatitis",
    "CT_malaria",
]

# One DataFrame per disease, concatenated once after the loop instead of
# growing a frame with pd.concat on every iteration.
disease_frames = []

for disease_to_consider in diseases:
    file_path = "data/" + disease_to_consider + ".json"
    studies = read_json_file(file_path)
    data_list = []

    # Label clean-up does not depend on the study, so do it once per disease
    # rather than once per record.
    disease_name = disease_to_consider.replace("_clinical_trials", "")
    disease_label = disease_name.replace(" ", "_")

    # The loaded JSON is iterated as a list of batches, each a list of studies.
    for batch_study in studies:
        for study in batch_study:
            protocol = study['protocolSection']
            identification = protocol['identificationModule']
            status = protocol['statusModule']
            design = protocol['designModule']
            lead_sponsor = protocol["sponsorCollaboratorsModule"]["leadSponsor"]

            # Best-effort: a missing or malformed conditions module falls back
            # to the placeholder. Only data-shape errors are caught, so
            # KeyboardInterrupt and real bugs still surface.
            try:
                conditions = ', '.join(
                    protocol['conditionsModule'].get('conditions', ['No conditions listed'])
                )
            except (KeyError, TypeError, AttributeError):
                conditions = "No conditions listed"

            # Interventions: comma-separated names, or a placeholder when none.
            interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
            if interventions_list:
                interventions = ', '.join(
                    intervention.get('name', 'No intervention name listed')
                    for intervention in interventions_list
                )
            else:
                interventions = "No interventions listed"

            # Locations: comma-separated "city - country" pairs.
            locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
            if locations_list:
                locations = ', '.join(
                    f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
                    for location in locations_list
                )
            else:
                locations = "No locations listed"

            # Keep only the last listed phase; EARLY_PHASE1 is folded into PHASE1.
            phase_values = design.get('phases', ['Not Available'])
            phases = phase_values[-1].strip() if phase_values else ""
            if phases == "EARLY_PHASE1":
                phases = "PHASE1"

            data_list.append({
                "NCT ID": identification.get('nctId', 'Unknown'),
                "Acronym": identification.get('acronym', 'Unknown'),
                "Overall Status": status.get('overallStatus', 'Unknown'),
                "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
                "Conditions": conditions,
                "Interventions": interventions,
                "Locations": locations,
                "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
                "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
                "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
                "Study Type": design.get('studyType', 'Unknown'),
                "Phases": phases,
                "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
                "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
                "Disease": disease_label,
            })

    print(f"Processed {len(data_list)} records for {disease_name}")
    df = pd.DataFrame(data_list)
    disease_frames.append(df)

# Single concat over all per-disease frames; fall back to an empty frame so
# the CSV is still written when nothing was processed.
master_df = pd.concat(disease_frames, ignore_index=True) if disease_frames else pd.DataFrame()

os.makedirs("data/", exist_ok=True)
master_df.to_csv("data/CT_all_common_disease_processed.csv", index=False)
print(len(master_df))

Processed 2169 records for CT_alzheimer's
Processed 2303 records for CT_influenza
Processed 5000 records for CT_breast-cancer
Processed 3517 records for CT_hepatitis
Processed 1129 records for CT_malaria
14118


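## Cleaning the PubMed Data

The cell below removes the `copyrights`, `doi`, and `authors` columns from every `Pubmed_*.csv` file in `data/`, adds a `category` column holding the source filename, saves each file back in place, and writes the combined rows to `data/all_pubmed.csv`.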

In [2]:
import os
import pandas as pd

# Directory containing the PubMed CSV files
directory = "data/"  # Change this to your actual directory

# Columns stripped from every PubMed CSV before it is saved back.
columns_to_remove = ["copyrights", "doi", "authors"]

# Raise instead of calling exit(): an exception stops the cell with a clear
# traceback, whereas exit() targets the interpreter/kernel session itself.
if not os.path.exists(directory):
    raise FileNotFoundError(f"Directory '{directory}' does not exist.")

# One cleaned DataFrame per input file, concatenated once after the loop.
pubmed_frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined CSV reproducible between runs.
for filename in sorted(os.listdir(directory)):
    # Only the per-topic exports named "Pubmed_*.csv" are processed.
    if not (filename.startswith("Pubmed_") and filename.endswith(".csv")):
        continue

    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath)

    # errors="ignore" skips columns that are already absent, so re-running
    # the cell on previously cleaned files is safe.
    df = df.drop(columns=columns_to_remove, errors="ignore")

    # Tag every row with its source file, then overwrite the file in place.
    df['category'] = filename
    df.to_csv(filepath, index=False)
    print("Number of instances in "+filename, len(df))
    pubmed_frames.append(df)

# Fall back to an empty frame so the combined CSV is still written when no
# matching files were found.
master_df = pd.concat(pubmed_frames, ignore_index=True) if pubmed_frames else pd.DataFrame()

# Write next to the inputs; with the default directory this is
# "data/all_pubmed.csv". The lowercase name does not match the "Pubmed_"
# prefix, so it is not picked up as an input on a re-run.
master_df.to_csv(os.path.join(directory, 'all_pubmed.csv'), index=False)
print("all_pubmed",len(master_df))

Number of instances in Pubmed_Pancreatic_Cancer.csv 9831
Number of instances in Pubmed_influenza.csv 8905
Number of instances in Pubmed_Chagas_Disease.csv 680
Number of instances in Pubmed_malaria.csv 6855
Number of instances in Pubmed_Endometriosis.csv 2839
Number of instances in Pubmed_breast-cancer.csv 171
Number of instances in Pubmed_Duchenne_Muscular_Dystrophy.csv 1423
Number of instances in Pubmed_hepatitis.csv 7087
Number of instances in Pubmed_alzheimer.csv 60
Number of instances in Pubmed_Drug_Resistant_Tuberculosis.csv 1274
all_pubmed 39125
