In [8]:
import json
import pandas as pd

# Function to read a JSON file
def read_json_file(file_path):
    """Parse the JSON document stored at ``file_path`` and return the result.

    The file is opened read-only in text mode and decoded as UTF-8; whatever
    ``json.load`` produces (dict, list, scalar, ...) is returned unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy",
]


def _struct_date(status_module, key):
    """Return the 'date' entry of the date struct stored under ``key``.

    Falls back to 'Unknown Date' when either the struct or its 'date'
    entry is absent.
    """
    return status_module.get(key, {}).get('date', 'Unknown Date')


def _study_to_row(study, disease_label):
    """Flatten one study record into a flat dict (one CSV row).

    Every module under 'protocolSection' is looked up with ``.get`` so that a
    record missing a module yields the placeholder values instead of raising
    ``KeyError`` and aborting the whole export.

    Args:
        study: dict for a single study; fields are read from its
            'protocolSection' entry.
            NOTE(review): presumably ClinicalTrials.gov study records - confirm.
        disease_label: value stored in the "Disease" column.

    Returns:
        dict mapping output column names to string values.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})
    lead_sponsor = protocol.get('sponsorCollaboratorsModule', {}).get('leadSponsor', {})

    # Best-effort: a malformed conditions entry (e.g. None or non-string
    # items) degrades to the placeholder rather than failing the run. Only
    # the errors such data can cause are caught, not everything.
    try:
        conditions = ', '.join(
            protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
        )
    except (AttributeError, TypeError):
        conditions = "No conditions listed"

    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    if interventions_list:
        interventions = ', '.join(
            intervention.get('name', 'No intervention name listed')
            for intervention in interventions_list
        )
    else:
        interventions = "No interventions listed"

    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
    if locations_list:
        locations = ', '.join(
            f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
            for location in locations_list
        )
    else:
        locations = "No locations listed"

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": _struct_date(status, 'startDateStruct'),
        "Conditions": conditions,
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": _struct_date(status, 'primaryCompletionDateStruct'),
        "Study First Post Date": _struct_date(status, 'studyFirstPostDateStruct'),
        "Last Update Post Date": _struct_date(status, 'lastUpdatePostDateStruct'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": ', '.join(design.get('phases', ['Not Available'])),
        "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
        "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
        "Disease": disease_label,
    }


# Per-disease frames are collected here and concatenated once at the end,
# instead of re-copying a growing master frame on every iteration.
frames = []
df = pd.DataFrame()

for disease_to_consider in diseases:
    file_path = "data/" + disease_to_consider + ".json"
    studies = read_json_file(file_path)
    disease_label = disease_to_consider.replace(" ", "_")

    # The file is iterated as a list of batches, each batch a list of studies.
    data_list = [
        _study_to_row(study, disease_label)
        for batch_study in studies
        for study in batch_study
    ]

    df = pd.DataFrame(data_list)
    print(len(df))

    # NOTE: the "procesed" spelling is kept on purpose so existing consumers
    # of these output files keep working.
    df.to_csv("data/" + disease_label + "_procesed.csv", index=False)
    frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame.
master_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
master_df.to_csv("data/all_disease_procesed.csv", index=False)
print(len(master_df))

2572
55
588
60
361
3636
