In [2]:
import json
import pandas as pd

def read_json_file(file_path):
    """Return the parsed contents of the JSON file at *file_path*.

    The file is opened in text mode and decoded as UTF-8; the result is
    whatever ``json.load`` produces for the document.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

# Datasets this notebook knows about; each one is read from data/<name>.json.
diseases = [
    "pancreatic cancer",
    "chagas disease",
    "endometriosis",
    "drug resistant tuberculosis",
    "duchenne muscular dystrophy",
]

# Index into `diseases` selecting which dataset this run processes.
disease_to_consider = diseases[4]


def _study_to_row(study):
    """Flatten one study record into a dict keyed by output column name.

    Every module under ``protocolSection`` is read with ``.get`` so that a
    study lacking a module produces placeholder values instead of raising
    ``KeyError`` and aborting the whole export.

    Args:
        study: A dict for a single study (presumably a ClinicalTrials.gov
            API record -- TODO confirm against the data source).

    Returns:
        A dict with one entry per CSV column.
    """
    protocol = study.get('protocolSection', {})
    identification = protocol.get('identificationModule', {})
    status = protocol.get('statusModule', {})
    design = protocol.get('designModule', {})
    lead_sponsor = protocol.get('sponsorCollaboratorsModule', {}).get('leadSponsor', {})

    def struct_date(key):
        # Date fields are nested as {"<key>": {"date": "..."}} under statusModule.
        return status.get(key, {}).get('date', 'Unknown Date')

    conditions = ', '.join(
        protocol.get('conditionsModule', {}).get('conditions', ['No conditions listed'])
    )

    interventions_list = protocol.get('armsInterventionsModule', {}).get('interventions', [])
    if interventions_list:
        interventions = ', '.join(
            intervention.get('name', 'No intervention name listed')
            for intervention in interventions_list
        )
    else:
        interventions = "No interventions listed"

    locations_list = protocol.get('contactsLocationsModule', {}).get('locations', [])
    if locations_list:
        locations = ', '.join(
            f"{location.get('city', 'No City')} - {location.get('country', 'No Country')}"
            for location in locations_list
        )
    else:
        locations = "No locations listed"

    return {
        "NCT ID": identification.get('nctId', 'Unknown'),
        "Acronym": identification.get('acronym', 'Unknown'),
        "Overall Status": status.get('overallStatus', 'Unknown'),
        "Start Date": struct_date('startDateStruct'),
        "Conditions": conditions,
        "Interventions": interventions,
        "Locations": locations,
        "Primary Completion Date": struct_date('primaryCompletionDateStruct'),
        "Study First Post Date": struct_date('studyFirstPostDateStruct'),
        "Last Update Post Date": struct_date('lastUpdatePostDateStruct'),
        "Study Type": design.get('studyType', 'Unknown'),
        "Phases": ', '.join(design.get('phases', ['Not Available'])),
        "Sponsor": lead_sponsor.get('name', 'Unknown Sponsor'),
        "Sponsor Type": lead_sponsor.get('class', 'Unknown Sponsor Type'),
    }


# Load the raw studies for the selected disease.
file_path = "data/" + disease_to_consider + ".json"  # Replace with your JSON file path
studies = read_json_file(file_path)
data_list = []

# The file holds a list of batches, each batch being a list of study records.
for batch_study in studies:
    for study in batch_study:
        data_list.append(_study_to_row(study))


df = pd.DataFrame(data_list)

print(df)
print(len(df))

# Debug helpers (disabled):
# with open('dia.json', "w") as json_file:
#     json.dump(studies, json_file, indent=4)

# print(studies[0])

# NOTE(review): "procesed" is misspelled; left unchanged in case other code
# reads this exact filename.
df.to_csv("data/" + disease_to_consider + "_procesed.csv", index=False)

          NCT ID       Acronym         Overall Status  Start Date  \
0    NCT03680365       Unknown              COMPLETED  2018-09-20   
1    NCT02484560       Unknown                UNKNOWN     2015-06   
2    NCT04906460       Unknown  ACTIVE_NOT_RECRUITING  2021-09-28   
3    NCT01847573       Unknown             TERMINATED     2013-05   
4    NCT05257473  GRASP-01-002             RECRUITING  2022-04-13   
..           ...           ...                    ...         ...   
356  NCT00592553       Unknown              COMPLETED  2008-02-29   
357  NCT00102453       Unknown              COMPLETED     2002-03   
358  NCT01350154       Unknown              COMPLETED     2011-11   
359  NCT00016653       Unknown              COMPLETED     2000-06   
360  NCT02653833       Unknown             TERMINATED  2017-11-01   

                                            Conditions  \
0    Duchenne Muscular Dystrophy, Burden, Dependenc...   
1                          Duchenne Muscular Dystrophy 