In [9]:
import json
import os
import re
import pandas as pd

In [10]:
# Resolve the input file relative to the current working directory
current_directory = os.getcwd()
file_path = os.path.join(current_directory, 'data/ctg-studies_cancer.json')
# file_path = os.path.join(current_directory, 'data/ctg-studies_carcinoma.json')

# Read the file as UTF-8 text and parse the whole document as JSON
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.loads(file.read())

## Filter out trials whose detailed_description, official_title, keywords, intervention_names, or intervention_other_names contain an excluded keyword

In [11]:
# Keywords that mark a trial for removal, normalised to lower case
keywords_to_remove = list(map(str.lower, ['gene', 'virus', 'cell therapy']))

In [12]:
# Normalise text: lower-case it and replace special characters with spaces
def preprocess_text(text):
    """Return ``text`` lower-cased, with each run of non-word characters replaced by one space.

    "Non-word" is the regex non-word class, so letters, digits and
    underscores are kept; punctuation and whitespace runs each become a
    single space (leading/trailing runs included).
    """
    lowered = text.lower()
    return re.sub(r'\W+', ' ', lowered)

# Filter the trial list: keep a trial only if none of its searched text fields
# (detailed description, official title, keywords, intervention names,
# intervention other names) contains any entry of keywords_to_remove.
# NOTE(review): matching is plain substring containment, so 'gene' also hits
# words such as 'general' or 'generic' — confirm this is intended.
filtered_data = []
for item in data:
    # Look every module up with .get(): a study missing a module is treated as
    # having no text in it instead of aborting the whole run with a KeyError.
    protocol_section = item.get('protocolSection', {})
    description_module = protocol_section.get('descriptionModule', {})
    identification_module = protocol_section.get('identificationModule', {})
    conditions_module = protocol_section.get('conditionsModule', {})
    arms_module = protocol_section.get('armsInterventionsModule', {})

    # Check the detailed_description, official_title, and keywords
    detailed_description = preprocess_text(description_module.get('detailedDescription', ""))
    official_title = preprocess_text(identification_module.get('officialTitle', ""))
    keywords = [preprocess_text(keyword) for keyword in conditions_module.get('keywords', [])]

    # Check the interventionNames and otherNames
    interventions = arms_module.get('interventions', [])
    intervention_names = [preprocess_text(intervention.get('name', "")) for intervention in interventions]
    other_names = [preprocess_text(name) for intervention in interventions for name in intervention.get('otherNames', [])]

    # One pass over every searched text replaces five separate any(...) clauses
    searched_texts = [detailed_description, official_title, *keywords, *intervention_names, *other_names]
    has_excluded_keyword = any(
        keyword in text
        for text in searched_texts
        for keyword in keywords_to_remove
    )
    if not has_excluded_keyword:
        filtered_data.append(item)

print(f"Original: {len(data)}, filtered: {len(filtered_data)}")


Original: 4621, filtered: 3868


## Extract the 'arms and interventions' sections of the trials that passed the keyword filter

In [13]:
# Flatten each remaining trial into one row per (intervention, arm) pairing
all_arms_interventions = []

for entry in filtered_data:
    # Trials without an 'armsInterventionsModule' contribute no rows
    if 'protocolSection' not in entry or 'armsInterventionsModule' not in entry['protocolSection']:
        continue

    protocol = entry['protocolSection']
    module = protocol['armsInterventionsModule']
    startDate = protocol['statusModule'].get('startDateStruct', {}).get('date', 'Unknown Date')
    nct_id = protocol['identificationModule']['nctId']
    arm_groups = module.get('armGroups', [])
    interventions = module.get('interventions', [])

    # Index the arms by label so interventions can be matched to them
    arm_dict = {
        arm['label']: {
            'nctId': nct_id,
            'arm_label': arm.get('label', ''),
            'arm_type': arm.get('type', ''),
            'arm_description': arm.get('description', ''),
            'intervention_names': ', '.join(arm.get('interventionNames', [])),
        }
        for arm in arm_groups
    }

    for intervention in interventions:
        for arm_label in intervention.get('armGroupLabels', []):
            arm_info = arm_dict.get(arm_label)
            if arm_info is None:
                # The intervention names an arm that is not among this trial's arm groups
                continue
            all_arms_interventions.append({
                'nctId': nct_id,
                'startDate': startDate,
                'arm_label': arm_info['arm_label'],
                'arm_type': arm_info['arm_type'],
                'arm_description': arm_info['arm_description'],
                'intervention_names': arm_info['intervention_names'],
                'intervention_type': intervention.get('type', ''),
                'intervention_name': intervention.get('name', ''),
                'intervention_description': intervention.get('description', ''),
                'intervention_arm_labels': ', '.join(intervention.get('armGroupLabels', [])),
                'intervention_other_names': ', '.join(intervention.get('otherNames', [])),
            })

## Filter out the rows without dosage information ("mg" or "mcg") — currently disabled

In [14]:
# Keep only the rows with dosage information (disabled: the filter below is commented out)
# arms_interventions_with_dosage = [
#     entry for entry in all_arms_interventions
#     if 'mg' in entry['arm_description'] or 'mg' in entry['intervention_description'] or
#        'mcg' in entry['arm_description'] or 'mcg' in entry['intervention_description']
# ]

In [15]:
# Build a dataframe for arms and interventions (one row per dict in the list)
df = pd.DataFrame(all_arms_interventions)

# Output df as csv file
output_file_path = os.path.join(current_directory, 'output/Phase3/phase3_result.csv')
# output_file_path = os.path.join(current_directory, 'output/Phase3/phase3_result_carcinoma.csv')

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
df.to_csv(output_file_path, index=False)

print(f"Extracted data saved to {output_file_path}")

df.head()

Extracted data saved to c:\Users\shj4823\OneDrive - Takeda\Desktop\Cancer Data V4.1_without_dose_mab\output/Phase3/phase3_result.csv


Unnamed: 0,nctId,startDate,arm_label,arm_type,arm_description,intervention_names,intervention_type,intervention_name,intervention_description,intervention_arm_labels,intervention_other_names
0,NCT03295565,2017-05-07,A: Cabazitaxel,ACTIVE_COMPARATOR,"Cabazitaxel 25mg/m2 IV, once every 3 weeks",Drug: Cabazitaxel,DRUG,Cabazitaxel,"Cabazitaxel 25mg/m2 IV, once every 3 weeks",A: Cabazitaxel,No other intervention names
1,NCT03295565,2017-05-07,B: Abiraterone OR Enzalutamide,ACTIVE_COMPARATOR,At physician's discretion:\n\nAbiraterone 1000...,"Drug: Abiraterone, Drug: Enzalutamide",DRUG,Abiraterone,"Abiraterone 1000mg oral, taken daily + Prednis...",B: Abiraterone OR Enzalutamide,No other intervention names
2,NCT03295565,2017-05-07,B: Abiraterone OR Enzalutamide,ACTIVE_COMPARATOR,At physician's discretion:\n\nAbiraterone 1000...,"Drug: Abiraterone, Drug: Enzalutamide",DRUG,Enzalutamide,Enzalutamide 160mg oral taken daily,B: Abiraterone OR Enzalutamide,No other intervention names
3,NCT02628665,2015-10,24 to 48 hours group,EXPERIMENTAL,"photosensitizer(photofrin): 2mg/kg, Diomed Sur...","Drug: photosensitizer(photofrin), Device: 630 ...",DRUG,photosensitizer(photofrin),photosensitizer(photofrin): 2mg/kg,"24 to 48 hours group, 48 to 72 hours group",photofrin
4,NCT02628665,2015-10,48 to 72 hours group,ACTIVE_COMPARATOR,"photosensitizer(photofrin): 2mg/kg, Diomed Sur...","Drug: photosensitizer(photofrin), Device: 630 ...",DRUG,photosensitizer(photofrin),photosensitizer(photofrin): 2mg/kg,"24 to 48 hours group, 48 to 72 hours group",photofrin


In [16]:
# Count the distinct non-empty NCT ids among the extracted rows
unique_nct_ids = {row['nctId'] for row in all_arms_interventions if row['nctId']}

print(len(unique_nct_ids))

3861
