In [4]:
import json
import pandas as pd

# Function to read a JSON file
def read_json_file(file_path):
    """Return the parsed contents of the UTF-8 encoded JSON file at *file_path*.

    Errors from opening the file or from decoding invalid JSON propagate to
    the caller unchanged.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Each entry is used verbatim to build the "data/<name>.json" input path and
# the matching CSV output path in the processing loop.
diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy",
]


# disease_to_consider = diseases[4]


def _struct_date(status_module, key):
    """Return the 'date' of the date struct stored under *key*, or 'Unknown Date'."""
    return status_module.get(key, {}).get('date', 'Unknown Date')


def _study_to_row(study):
    """Flatten one study record into the dict written as a single CSV row.

    Every module under 'protocolSection' is read with ``.get`` so that a
    study missing a module (e.g. 'designModule' or 'leadSponsor') produces
    placeholder values instead of raising KeyError and aborting the run.

    Args:
        study: dict for one study, expected to carry a 'protocolSection' key.

    Returns:
        dict mapping output column names to string values.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})
    lead_sponsor = protocol.get('sponsorCollaboratorsModule', {}).get('leadSponsor', {})

    try:
        conditions = ', '.join(
            protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
        )
    except (AttributeError, TypeError):
        # Malformed conditions data (null module, non-string entries): keep
        # the original best-effort fallback without swallowing unrelated errors.
        conditions = "No conditions listed"

    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    if interventions_list:
        interventions = ', '.join(
            intervention.get('name', 'No intervention name listed')
            for intervention in interventions_list
        )
    else:
        interventions = "No interventions listed"

    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
    if locations_list:
        locations = ', '.join(
            f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
            for location in locations_list
        )
    else:
        locations = "No locations listed"

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": _struct_date(status, 'startDateStruct'),
        "Conditions": conditions,
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": _struct_date(status, 'primaryCompletionDateStruct'),
        "Study First Post Date": _struct_date(status, 'studyFirstPostDateStruct'),
        "Last Update Post Date": _struct_date(status, 'lastUpdatePostDateStruct'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": ', '.join(design.get('phases', ['Not Available'])),
        "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
        "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
    }


# For every disease: load its JSON dump, flatten each study into one row,
# print the resulting table and its row count, and save it as CSV.
for disease_to_consider in diseases:
    file_path = f"data/{disease_to_consider}.json"
    studies = read_json_file(file_path)
    data_list = []

    # The loaded JSON is iterated as a list of batches, each a list of studies.
    for batch_study in studies:
        for study in batch_study:
            data_list.append(_study_to_row(study))

    df = pd.DataFrame(data_list)

    print(df)
    print(len(df))

    # NOTE(review): the "_procesed" spelling is kept as-is because other code
    # may read these files by that exact name - confirm before renaming.
    df.to_csv(f"data/{disease_to_consider}_procesed.csv", index=False)

           NCT ID    Acronym           Overall Status  Start Date  \
0     NCT02695966       EVIS                COMPLETED     2015-05   
1     NCT03882866    Unknown                  UNKNOWN  2019-03-28   
2     NCT04572165    Unknown  ENROLLING_BY_INVITATION  2021-01-26   
3     NCT03190265    Unknown                COMPLETED  2017-12-14   
4     NCT02822066    Unknown                  UNKNOWN     2016-04   
...           ...        ...                      ...         ...   
2567  NCT00996333    Unknown                COMPLETED     2003-06   
2568  NCT06598033    Unknown               RECRUITING  2024-11-22   
2569  NCT03851133    Unknown    ACTIVE_NOT_RECRUITING  2019-03-04   
2570  NCT06370754  FD-IMPACT       NOT_YET_RECRUITING  2024-04-30   
2571  NCT03257033  TIGeR-PaC               RECRUITING  2018-03-12   

                              Conditions  \
0                      Pancreatic Cancer   
1                      Pancreatic Cancer   
2              Diabetes Mellitus, Type 