In [1]:
import pandas as pd
import json
import ast

In [2]:
# Path of the JSON file that load_data_into_df() opens.
# Alternative input, kept for reference (uncomment to switch):
# JSON_FILE = "sample-studies.json"
JSON_FILE = "../../data/clinicaltrialsgov/ctg-studies.json"

In [3]:
def load_data_into_df(json_file=None):
    """Load a studies JSON file and flatten it into a DataFrame.

    Parameters
    ----------
    json_file : str or path-like, optional
        Path of the JSON file to read. When omitted, the module-level
        ``JSON_FILE`` constant is used, so the original zero-argument call
        keeps working unchanged.

    Returns
    -------
    tuple
        ``(data, df)`` where ``data`` is the object returned by ``json.load``
        and ``df`` is the result of ``pd.json_normalize(data)``.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    path = JSON_FILE if json_file is None else json_file

    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale encoding, which can fail on (or silently mangle) non-ASCII text
    # on systems whose default is not UTF-8.
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Nested objects become dot-separated columns; lists stay as cell values.
    df = pd.json_normalize(data)

    return data, df


In [4]:
# json_data: the parsed JSON exactly as loaded; df: the same data flattened by pd.json_normalize.
json_data, df = load_data_into_df()

In [5]:
# Flattened column names that are specific to observational studies.
obsCol = [
    'protocolSection.designModule.patientRegistry',
    'protocolSection.designModule.designInfo.observationalModel',
    'protocolSection.designModule.designInfo.timePerspective',
    'protocolSection.eligibilityModule.studyPopulation',
    'protocolSection.eligibilityModule.samplingMethod',
]

In [6]:
# Dot-separated (flattened) column names to select from the normalized frame.
# The order of this list is the column order of the selection built from it.
requiredColumns = [
    # study identification
    'protocolSection.identificationModule.nctId',
    'protocolSection.identificationModule.officialTitle',
    'protocolSection.identificationModule.acronym',

    # introduction: summaries, conditions and keywords
    'protocolSection.descriptionModule.briefSummary',
    'protocolSection.descriptionModule.detailedDescription',
    'protocolSection.conditionsModule.conditions',
    'protocolSection.conditionsModule.keywords',

    # study design and enrollment
    'protocolSection.designModule.studyType',
    'protocolSection.designModule.phases',
    'protocolSection.designModule.designInfo.allocation',
    'protocolSection.designModule.designInfo.interventionModel',
    'protocolSection.designModule.designInfo.primaryPurpose',
    'protocolSection.designModule.designInfo.maskingInfo.masking',
    'protocolSection.designModule.enrollmentInfo.count',
    'protocolSection.designModule.enrollmentInfo.type',

    # arms and their interventions
    'protocolSection.armsInterventionsModule.armGroups',
    'protocolSection.armsInterventionsModule.interventions',

    # outcomes the study was designed to track
    'protocolSection.outcomesModule.primaryOutcomes',
    'protocolSection.outcomesModule.secondaryOutcomes',

    # eligibility
    # NOTE(review): minimumAge is listed but there is no maximumAge entry — confirm this is intentional.
    'protocolSection.eligibilityModule.eligibilityCriteria',
    'protocolSection.eligibilityModule.healthyVolunteers',
    'protocolSection.eligibilityModule.sex',
    'protocolSection.eligibilityModule.minimumAge',
    'protocolSection.eligibilityModule.stdAges',

    # officials and locations
    'protocolSection.contactsLocationsModule.overallOfficials',
    'protocolSection.contactsLocationsModule.locations',

    # references (presumably PubMed citations — verify) and related links
    'protocolSection.referencesModule.references',
    'protocolSection.referencesModule.seeAlsoLinks',

    # FDA regulation flags (drug / device)
    'protocolSection.oversightModule.isFdaRegulatedDrug', 
    'protocolSection.oversightModule.isFdaRegulatedDevice',

    # participant flow / cohorts
    'resultsSection.participantFlowModule.preAssignmentDetails',
    'resultsSection.participantFlowModule.recruitmentDetails',
    'resultsSection.participantFlowModule.groups',
    'resultsSection.participantFlowModule.periods',

    # baseline/demographic characteristics, e.g. race, age, sex of patients in different groups
    'resultsSection.baselineCharacteristicsModule.groups',
    'resultsSection.baselineCharacteristicsModule.denoms',
    'resultsSection.baselineCharacteristicsModule.measures',
    'resultsSection.baselineCharacteristicsModule.populationDescription',

    # actual (reported) outcomes
    'resultsSection.outcomeMeasuresModule.outcomeMeasures',

    # adverse events: deaths, serious events and other (non-serious) events
    'resultsSection.adverseEventsModule.frequencyThreshold',
    'resultsSection.adverseEventsModule.timeFrame',
    'resultsSection.adverseEventsModule.description',
    'resultsSection.adverseEventsModule.eventGroups',
    'resultsSection.adverseEventsModule.seriousEvents',
    'resultsSection.adverseEventsModule.otherEvents', 
]

In [7]:
# Select the curated columns and export them to CSV.
#
# pd.json_normalize only creates a column for fields that occur in the loaded
# data, so a different or smaller input can lack some entries of
# requiredColumns; `df[requiredColumns]` would then raise KeyError. reindex
# keeps the requested column order and fills absent columns with NaN, and the
# message below makes the gap visible instead of silent.
missing_columns = [col for col in requiredColumns if col not in df.columns]
if missing_columns:
    print(f"Columns absent from the input (exported empty): {missing_columns}")

csv_df = df.reindex(columns=requiredColumns)

# NOTE(review): list/dict cells are written to CSV as their string form, so a
# reader has to parse them back (e.g. ast.literal_eval) — confirm consumers do.
csv_df.to_csv("../../data/clinicaltrialsgov/CLL_343_SelectedFields.csv", index=False)

In [8]:
# Preview the first rows of the selected-column frame.
csv_df.head()

Unnamed: 0,protocolSection.identificationModule.nctId,protocolSection.identificationModule.officialTitle,protocolSection.identificationModule.acronym,protocolSection.descriptionModule.briefSummary,protocolSection.descriptionModule.detailedDescription,protocolSection.conditionsModule.conditions,protocolSection.conditionsModule.keywords,protocolSection.designModule.studyType,protocolSection.designModule.phases,protocolSection.designModule.designInfo.allocation,...,resultsSection.baselineCharacteristicsModule.denoms,resultsSection.baselineCharacteristicsModule.measures,resultsSection.baselineCharacteristicsModule.populationDescription,resultsSection.outcomeMeasuresModule.outcomeMeasures,resultsSection.adverseEventsModule.frequencyThreshold,resultsSection.adverseEventsModule.timeFrame,resultsSection.adverseEventsModule.description,resultsSection.adverseEventsModule.eventGroups,resultsSection.adverseEventsModule.seriousEvents,resultsSection.adverseEventsModule.otherEvents
0,NCT01520922,"A Phase II, Multi-centre Study Investigating t...",,"This is a Phase II, open label, single arm, mu...","This is a Phase II, open label, single arm, mu...","[Chronic Lymphocytic Leukemia (CLL), Leukaemia...",[Relapsed or Refractory Chronic Lymphocytic Le...,INTERVENTIONAL,[PHASE2],,...,"[{'units': 'Participants', 'counts': [{'groupI...","[{'title': 'Age, Continuous', 'paramType': 'ME...",,"[{'type': 'PRIMARY', 'title': 'Number of Parti...",5,From the first dose of study medication to 60 ...,Serious adverse events (SAEs) and and non-seri...,"[{'id': 'EG000', 'title': 'Ofatumumab + Bendam...","[{'term': 'Anaemia', 'organSystem': 'Blood and...","[{'term': 'Anaemia', 'organSystem': 'Blood and..."
1,NCT00802737,"A Single-arm, International, Multi-center Tria...",,The purpose of the trial is to investigate the...,,"[Leukaemia, Lymphocytic, Chronic]","[Maintenance, Retreatment, Chronic lymphocytic...",INTERVENTIONAL,[PHASE4],,...,"[{'units': 'Participants', 'counts': [{'groupI...","[{'title': 'Age, Continuous', 'paramType': 'ME...",,"[{'type': 'PRIMARY', 'title': 'Number of Parti...",5,On-treatment serious adverse events (SAEs) and...,SAEs and non-serious AEs were reported for mem...,"[{'id': 'EG000', 'title': '2000 mg Ofatumumab ...","[{'term': 'Bronchopneumonia', 'organSystem': '...","[{'term': 'Diarrhea', 'organSystem': 'Gastroin..."
2,NCT00410163,"An Open-labeled, Randomized, Two-dose, Paralle...",BIFROST,To investigate the safety and efficacy of two ...,,"[Leukaemia, Lymphocytic, Chronic]","[B-cell, cyclophosphamide, fludarabine, Chroni...",INTERVENTIONAL,[PHASE2],RANDOMIZED,...,"[{'units': 'Participants', 'counts': [{'groupI...","[{'title': 'Age, Continuous', 'paramType': 'ME...",,"[{'type': 'PRIMARY', 'title': 'Number of Parti...",5,,"During the Extended Follow-up Phase, from 2 ye...","[{'id': 'EG000', 'title': 'Ofatumumab 500 mg +...","[{'term': 'Neutropenia', 'organSystem': 'Blood...","[{'term': 'Neutropenia', 'organSystem': 'Blood..."
3,NCT00349349,"A Single-arm, International, Multi-center Tria...",,The purpose of this study is to determine whet...,,"[Leukaemia, Lymphocytic, Chronic]",,INTERVENTIONAL,[PHASE2],NON_RANDOMIZED,...,"[{'units': 'Participants', 'counts': [{'groupI...","[{'title': 'Age, Continuous', 'paramType': 'ME...",,"[{'type': 'PRIMARY', 'title': 'Number of Parti...",5,,,"[{'id': 'EG000', 'title': '2000 mg Ofatumumab ...","[{'term': 'Pneumonia', 'organSystem': 'Infecti...","[{'term': 'Fatigue', 'organSystem': 'General d..."
4,NCT01178086,Rituximab in the Treatment of Chronic Lymphocy...,,This observational study will assess the thera...,,"[Lymphocytic Leukemia, Chronic]",,OBSERVATIONAL,,,...,"[{'units': 'Participants', 'counts': [{'groupI...","[{'title': 'Age, Continuous', 'paramType': 'ME...",Baseline characteristics of EAS has been repor...,"[{'type': 'PRIMARY', 'title': 'Progression-Fre...",5,Baseline up to 24 months,Analysis was performed on safety analysis set.,"[{'id': 'EG000', 'title': 'Unselected Populati...","[{'term': 'Anemia', 'organSystem': 'Blood and ...","[{'term': 'Anemia', 'organSystem': 'Blood and ..."
