In [9]:
import json
import pandas as pd
import os

# Function to read a JSON file
def read_json_file(file_path):
    """Parse the UTF-8 encoded JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

diseases = [
    "pancreatic_cancer_clinical_trials",
    "chagas_disease_clinical_trials",
    "endometriosis_clinical_trials",
    "drug_resistant_tuberculosis_clinical_trials",
    "duchenne_muscular_dystrophy_clinical_trials",
]


def _study_to_record(study, disease_label):
    """Flatten one study dict into the flat row that is written to the CSV.

    Every module and field is read with ``.get`` and a placeholder default, so
    a study that lacks a module yields placeholder strings instead of raising
    ``KeyError``.

    Args:
        study: One study dict; all fields are read from its ``protocolSection``
            (presumably a ClinicalTrials.gov record -- confirm against the
            code that downloaded the JSON).
        disease_label: Value stored in the row's ``"Disease"`` column.

    Returns:
        dict mapping CSV column name to a string value.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})
    lead_sponsor = protocol.get('sponsorCollaboratorsModule', {}).get('leadSponsor', {})

    try:
        conditions = ', '.join(
            protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
        )
    except (TypeError, AttributeError):
        # The conditions entry was not a list of strings; keep the row anyway.
        conditions = "No conditions listed"

    # Extract interventions safely
    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    if interventions_list:
        interventions = ', '.join(
            intervention.get('name', 'No intervention name listed')
            for intervention in interventions_list
        )
    else:
        interventions = "No interventions listed"

    # Extract locations safely
    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
    if locations_list:
        locations = ', '.join(
            f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
            for location in locations_list
        )
    else:
        locations = "No locations listed"

    # Only the last listed phase is kept, and EARLY_PHASE1 is folded into PHASE1.
    phases = ', '.join(design.get('phases', ['Not Available']))
    phases = phases.split(",")[-1].strip()
    if phases == "EARLY_PHASE1":
        phases = "PHASE1"

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": status.get('startDateStruct', {}).get('date', 'Unknown Date'),
        "Conditions": conditions,
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": status.get('primaryCompletionDateStruct', {}).get('date', 'Unknown Date'),
        "Study First Post Date": status.get('studyFirstPostDateStruct', {}).get('date', 'Unknown Date'),
        "Last Update Post Date": status.get('lastUpdatePostDateStruct', {}).get('date', 'Unknown Date'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": phases,
        "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
        "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
        "Disease": disease_label,
    }


# Rows from every disease. The combined frame is built from this list once at
# the end instead of re-concatenating a growing DataFrame on each iteration.
all_records = []
df = pd.DataFrame()

for disease_to_consider in diseases:
    file_path = "data/" + disease_to_consider + ".json"
    studies = read_json_file(file_path)

    # Derived once per file; the loop variable itself is left untouched.
    disease_name = disease_to_consider.replace("_clinical_trials", "")
    disease_label = disease_name.replace(" ", "_")

    # Each file holds a list of batches, and each batch is a list of studies.
    data_list = [
        _study_to_record(study, disease_label)
        for batch_study in studies
        for study in batch_study
    ]

    print(disease_name)
    df = pd.DataFrame(data_list)
    print(len(df))
    all_records.extend(data_list)

master_df = pd.DataFrame(all_records)

os.makedirs("data/", exist_ok=True)
# NOTE: the file name's spelling ("procesed") is kept as-is because other code
# may already read this exact path.
master_df.to_csv("data/all_disease_procesed.csv", index=False)
print(len(master_df))

pancreatic_cancer
2572
chagas_disease
55
endometriosis
588
drug_resistant_tuberculosis
60
duchenne_muscular_dystrophy
361
3636


In [5]:
import json
import pandas as pd
import os

# Function to read a JSON file
def read_json_file(file_path):
    """Load a UTF-8 JSON file and hand back the decoded Python object.

    Args:
        file_path: Location of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for that file.
    """
    with open(file_path, mode="r", encoding="utf-8") as json_file:
        return json.load(json_file)

# List of diseases to consider
diseases = [
    "alzheimer's",
    "influenza",
    "breast cancer",
    "hepatitis",
    "malaria",
]

# Create the output directory once, before anything is written into it.
os.makedirs("data/", exist_ok=True)

# Rows from every disease. The combined frame is built from this list once at
# the end instead of re-concatenating a growing DataFrame on each iteration.
all_records = []

for disease_to_consider in diseases:
    file_path = f"data/{disease_to_consider}.json"

    # Check if file exists before proceeding
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        continue

    studies = read_json_file(file_path)
    data_list = []

    # Same label for every study in this file, so compute it once.
    disease_name = disease_to_consider.replace("_clinical_trials", "").replace(" ", "_")

    for entry in studies:
        # A top-level entry is either a single study dict or a batch (list) of
        # study dicts; treat both the same way.
        batch = entry if isinstance(entry, list) else [entry]

        for study in batch:
            if not isinstance(study, dict):
                print(f"Skipping invalid study format in {disease_to_consider}")
                continue

            try:
                protocol = study.get("protocolSection", {})
                identification = protocol.get("identificationModule", {})
                status = protocol.get("statusModule", {})
                conditions_module = protocol.get("conditionsModule", {})
                design = protocol.get("designModule", {})
                interventions_module = protocol.get("armsInterventionsModule", {})
                locations_module = protocol.get("contactsLocationsModule", {})
                sponsor_module = protocol.get("sponsorCollaboratorsModule", {}).get("leadSponsor", {})

                nctId = identification.get("nctId", "Unknown")
                overallStatus = status.get("overallStatus", "Unknown")
                startDate = status.get("startDateStruct", {}).get("date", "Unknown Date")
                conditions = ', '.join(conditions_module.get("conditions", ["No conditions listed"]))
                acronym = identification.get("acronym", "Unknown")

                # Extract interventions safely
                interventions_list = interventions_module.get("interventions", [])
                interventions = ', '.join([intervention.get("name", "No intervention name listed") for intervention in interventions_list]) if interventions_list else "No interventions listed"

                # Extract locations safely
                locations_list = locations_module.get("locations", [])
                locations = ', '.join([f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}" for location in locations_list]) if locations_list else "No locations listed"

                primaryCompletionDate = status.get("primaryCompletionDateStruct", {}).get("date", "Unknown Date")
                studyFirstPostDate = status.get("studyFirstPostDateStruct", {}).get("date", "Unknown Date")
                lastUpdatePostDate = status.get("lastUpdatePostDateStruct", {}).get("date", "Unknown Date")
                studyType = design.get("studyType", "Unknown")

                # Only the last listed phase is kept; EARLY_PHASE1 is folded into PHASE1.
                phases = ', '.join(design.get("phases", ["Not Available"]))
                phases = phases.split(",")[-1].strip()
                if phases == "EARLY_PHASE1":
                    phases = "PHASE1"

                lead_sponsor_name = sponsor_module.get("name", "Unknown Sponsor")
                lead_sponsor_type = sponsor_module.get("class", "Unknown Sponsor Type")

                data_list.append({
                    "NCT ID": nctId,
                    "Acronym": acronym,
                    "Overall Status": overallStatus,
                    "Start Date": startDate,
                    "Conditions": conditions,
                    "Interventions": interventions,
                    "Locations": locations,
                    "Primary Completion Date": primaryCompletionDate,
                    "Study First Post Date": studyFirstPostDate,
                    "Last Update Post Date": lastUpdatePostDate,
                    "Study Type": studyType,
                    "Phases": phases,
                    "Sponsor": lead_sponsor_name,
                    "Sponsor Type": lead_sponsor_type,
                    "Disease": disease_name
                })

            except (KeyError, AttributeError, TypeError) as e:
                # Every lookup above uses .get, so what can actually go wrong is
                # a module that is not a dict or a list holding non-strings.
                print(f"Skipping a study due to malformed data: {e!r}")

    df = pd.DataFrame(data_list)
    print(f"Processed {len(df)} records for {disease_to_consider}")

    df.to_csv(f"data/{disease_to_consider.replace(' ', '_')}_processed.csv", index=False)
    all_records.extend(data_list)

# Save the combined dataset
master_df = pd.DataFrame(all_records)
master_df.to_csv("data/all_disease_processed.csv", index=False)
print(f"Total records saved: {len(master_df)}")


Processed 2169 records for alzheimer's
Processed 2303 records for influenza
Processed 5000 records for breast cancer
Processed 3517 records for hepatitis
Processed 1129 records for malaria
Total records saved: 14118


# Cleaning the collected JSON files
### Clinical Trials
Removed the `descriptionModule` key from each study's `protocolSection`, as it is not needed for the analysis.

In [None]:
import json

# Read the raw study export from the current working directory.
with open('duchenne muscular dystrophy.json', mode='r', encoding='utf-8') as raw_file:
    data = json.load(raw_file)

# Show the parsed JSON content
print(data)

def remove_description_module(data):
    """Delete ``descriptionModule`` from every study's ``protocolSection``.

    ``data`` is modified in place and the same object is returned.

    Args:
        data: A list whose items are either batches (lists of study dicts) or
            study dicts themselves. Items of any other shape, and studies
            whose ``protocolSection`` is missing or not a dict, are left
            untouched.

    Returns:
        The ``data`` object that was passed in, after the removal.
    """
    for outer_item in data:
        # Accept both a batch of studies and a single study at the top level.
        batch = outer_item if isinstance(outer_item, list) else [outer_item]

        for study in batch:
            if not isinstance(study, dict):
                continue

            protocol = study.get("protocolSection")
            if isinstance(protocol, dict):
                # pop with a default is a no-op when the key is absent.
                protocol.pop("descriptionModule", None)

    return data

# Strip the description module from the loaded studies.
data = remove_description_module(data)

# Persist the cleaned studies as JSON.
# (To inspect the result instead: print(json.dumps(data, indent=2)))
with open('data/duchenne_muscular_dystrophy_clinical_trials.json', mode='w', encoding='utf-8') as out_file:
    json.dump(data, out_file)

# diseases = [
#     "pancreatic_cancer_clinical_trials",
#     "chagas_disease_clinical_trials",
#     "endometriosis_clinical_trials",
#     "drug_resistant_tuberculosis_clinical_trials",
#     "duchenne_muscular_dystrophy_clinical_trials",
# ]


## Cleaning the PubMed Data

Removed the `copyrights`, `doi`, and `authors` columns from each PubMed CSV file.

In [6]:
import os
import pandas as pd

# Define the directory containing the CSV files
directory = "data/"  # Change this to your actual directory

# Columns stripped from every CSV when they are present.
columns_to_remove = ["copyrights", "doi", "authors"]

# Ensure the directory exists. An exception is raised rather than calling
# exit(), which would shut down the notebook kernel and discard its state.
if not os.path.exists(directory):
    raise FileNotFoundError(f"Directory '{directory}' does not exist.")

frames = []

# Process the CSV files in the directory. The listing is sorted because
# os.listdir returns names in arbitrary order, which would make the row order
# of the combined file differ from run to run.
for filename in sorted(os.listdir(directory)):
    if filename.startswith("all_"):
        continue  # Skip combined outputs (e.g. all_pubmed.csv) so they are not re-ingested

    if filename.endswith(("_processed.csv", "_procesed.csv")):
        continue  # Skip the clinical-trial CSVs that the earlier cells write into this folder

    if not filename.endswith(".csv"):
        continue  # Only process CSV files

    filepath = os.path.join(directory, filename)

    # Load CSV
    df = pd.read_csv(filepath)

    # Remove the specified columns if they exist. Only existing columns are
    # passed to drop, so no error handling is needed here.
    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns])

    # Tag each row with its source file, then save back to the same file.
    df['category'] = filename
    df.to_csv(filepath, index=False)
    print("Number of instances in "+filename, len(df))
    frames.append(df)

# Combine once at the end; pd.concat rejects an empty list, so fall back to an
# empty frame when no CSV was found.
master_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

master_df.to_csv(os.path.join(directory, 'all_pubmed.csv'), index=False)
print("all_pubmed",len(master_df))

Number of instances in Pubmed_Pancreatic_Cancer.csv 9831
Number of instances in Pubmed_Duchenne_Muscular_Dystrophy.csv 1423
Number of instances in Pubmed_Chagas_Disease.csv 680
Number of instances in Pubmed_Endometriosis.csv 2839
Number of instances in Pubmed_Drug_Resistant_Tuberculosis.csv 1274
all_pubmed 16047
