In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
import json
load_dotenv()
import openai
import time

Read JSON Files:

In [10]:
# Load the first ten trial records from the recruiting folder. Each iteration
# overwrites `json_data`, so after the loop it holds the last file that was read.
for file in os.listdir('../CT_Database/CT_Recruiting/')[:10]:
    # The context manager closes every handle as soon as it has been parsed;
    # `json.load(open(...))` left the files open until garbage collection.
    with open(f'../CT_Database/CT_Recruiting/{file}') as fh:
        json_data = json.load(fh)

Summarize:

In [11]:
# Inspect the trial record left in `json_data` by the loading loop (the last file read).
json_data

{'NCT Number': 'NCT05059444',
 'Study Results': {},
 'Interventions': {'ArmGroupList': {'ArmGroup': [{'ArmGroupLabel': 'Cohort 1: Muscle invasive carcinoma of the bladder, ureter, or renal pelvis (stage II-III)',
     'ArmGroupInterventionList': {'ArmGroupInterventionName': ['Diagnostic Test: Guardant Reveal']}},
    {'ArmGroupLabel': 'Cohort 2: Non-small cell lung cancer (stage II-III)',
     'ArmGroupInterventionList': {'ArmGroupInterventionName': ['Diagnostic Test: Guardant Reveal']}},
    {'ArmGroupLabel': 'Cohort 3: Invasive breast carcinoma with all of the following:',
     'ArmGroupDescription': 'Clinical stage T1-4/N0-3/M0 at presentation AND\nCompleted preoperative systemic chemotherapy-containing regimen AND\nUnderwent definitive surgical resection of the primary tumor AND\nHas pathological evidence of residual invasive carcinoma in the breast and/or axillary lymph nodes AND\nHormone receptor and HER2 status are known',
     'ArmGroupInterventionList': {'ArmGroupInterventionN

Completion Function:

In [12]:
def summarizer(system_prompt, user_prompt, model='gpt-4-0125-preview', temperature=0, verbose=False):
    """Send a system/user prompt pair to the chat model and return the reply text.

    Uses the same client interface as ``get_location`` in this notebook
    (``openai.chat.completions.create`` with attribute access on the response).
    The earlier ``openai.ChatCompletion.create`` / ``response['choices']`` form
    does not work with that client: its response objects are not subscriptable.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_prompt: Payload sent as the ``user`` message. It is converted with
            ``str()``, so a dict (e.g. a loaded trial record) may be passed.
        model: Chat model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        The ``content`` of the first choice in the response. JSON mode is
        requested, so callers pass the result to ``json.loads``.
    """
    response = openai.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": str(user_prompt)},
        ],
        # NOTE(review): a reply cut off at this limit would presumably not be
        # valid JSON for the callers' json.loads - confirm 1024 is sufficient.
        max_tokens=1024,
        response_format={"type": "json_object"},
    )
    return response.choices[0].message.content

SYSTEM TEMPLATE --> Add or remove fields as required

In [13]:
# System prompt for the summariser. The example between the BEGIN/END markers
# is valid JSON (a comma after every field) so the model is shown exactly the
# object shape it is asked to return. Add or remove fields as required.
system_template = '''Give summary of the given fields below in the specified format that is mentioned ahead of it:
---BEGIN FORMAT TEMPLATE---
{"NCT_NUMBER": "The NCT Number as it is",
"ELIGIBILITY_CRITERIA": "Summarize the eligibility criteria precisely",
"PRIMARY_OUTCOMES": "Summarize primary outcomes into a single paragraph",
"SECONDARY_OUTCOMES": "Summarize secondary outcomes into a single paragraph",
"INTERVENTIONS": "Summarize Interventions into a paragraph"}
---END FORMAT TEMPLATE---
Give the output of the format template in json format
'''
# Summarise the trial record held in `json_data` using the system template defined in this cell.
res = summarizer(system_prompt=system_template, user_prompt=json_data)

In [14]:
# Parse the reply text into a dict to check that it is valid JSON with the requested keys.
json.loads(res)

{'NCT_NUMBER': 'NCT05059444',
 'ELIGIBILITY_CRITERIA': 'Participants must be over 18, treated with curative intent, willing to undergo regular follow-up, provide consent, and blood samples for up to 5 years. They must have a histologically confirmed Index Cancer that qualifies for inclusion, such as muscle invasive carcinoma of the bladder, non-small cell lung cancer, invasive breast carcinoma, among others, and meet specific criteria for each cohort. Exclusions include history of allogeneic organ or tissue transplant, neuroendocrine histology of the Index Cancer, history of certain primary cancers, known distant metastasis at enrollment, and participation in other genomic test studies.',
 'PRIMARY_OUTCOMES': 'The primary outcome is the Distant Recurrence Free Interval (D-RFi), defined as the time from the end of primary treatment until the diagnosis of a distant recurrence of the Index Cancer. Subjects without a distant recurrence will be censored at the time of last follow-up of thei

In [15]:
# orient='index' places the summary keys on the index; .T flips them into columns, giving one row.
pd.DataFrame.from_dict(json.loads(res), orient='index').T

Unnamed: 0,NCT_NUMBER,ELIGIBILITY_CRITERIA,PRIMARY_OUTCOMES,SECONDARY_OUTCOMES,INTERVENTIONS
0,NCT05059444,"Participants must be over 18, treated with cur...",The primary outcome is the Distant Recurrence ...,"Secondary outcomes include sensitivity, positi...",The intervention across all cohorts is the Dia...


Summarised Attributes Function:

USING GPT-3.5-TURBO:

In [16]:
def generate_summarized_attrs(dir_path: str, max_files: int = 100):
    """Summarise the trial JSON files in ``dir_path`` into a DataFrame.

    Each file is loaded, sent to ``summarizer`` with the module-level
    ``system_template``, and the parsed reply becomes one row.

    Args:
        dir_path: Directory containing one JSON file per trial.
        max_files: Upper bound on how many directory entries are processed
            (the slice ``os.listdir(dir_path)[:max_files]``). Defaults to the
            previously hard-coded 100.

    Returns:
        A DataFrame with one row per processed file; an empty DataFrame when
        no file was processed.

    Raises:
        json.JSONDecodeError: If a file or a model reply is not valid JSON.
    """
    frames = []
    for file in os.listdir(dir_path)[:max_files]:
        # Close each handle right after parsing instead of leaking it.
        with open(os.path.join(dir_path, file)) as fh:
            trial = json.load(fh)
        res = summarizer(system_prompt=system_template, user_prompt=trial)
        # Only slice between the <START>/<END> markers when they are present.
        # The summariser requests a bare JSON object, and on such a reply the
        # unconditional `split('<START>')[1]` raised IndexError.
        if '<START>' in res:
            res = res.split('<START>')[1].split('<END>')[0]
        summary = json.loads(res.replace('\n', ''))
        frames.append(pd.DataFrame.from_dict(summary, orient='index').T)
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames)

USING GPT-4-TURBO:

In [17]:
def generate_summarized_attrs_json(dir_path: str, start: int = 10, stop: int = 13):
    """Summarise a window of the trial JSON files in ``dir_path``.

    Every selected file is loaded, summarised via ``summarizer`` with the
    module-level ``system_template``, and the JSON reply becomes one row.

    Args:
        dir_path: Directory containing one JSON file per trial.
        start: Index of the first directory entry to process.
        stop: Index one past the last entry to process (``None`` for "to the
            end"). The defaults reproduce the previously hard-coded
            ``[10:13]`` window.

    Returns:
        A DataFrame with one row per processed file; an empty DataFrame when
        the window selects no files.

    Raises:
        json.JSONDecodeError: If a file or a model reply is not valid JSON.
    """
    frames = []
    for file in os.listdir(dir_path)[start:stop]:
        # Close each handle right after parsing instead of leaking it.
        with open(os.path.join(dir_path, file)) as fh:
            trial = json.load(fh)
        res = summarizer(system_prompt=system_template, user_prompt=trial)
        frames.append(pd.DataFrame.from_dict(json.loads(res), orient='index').T)
    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames)

With batching:

In [18]:
def generate_summarized_attrs_json_batch(dir_path: str, batch_size: int = 100):
    """Summarise every trial JSON file in ``dir_path``, walking it in batches.

    Files are visited sequentially in ``os.listdir`` order, ``batch_size`` at
    a time. Each one is summarised via ``summarizer`` with the module-level
    ``system_template`` and contributes one row to the result.

    Args:
        dir_path: Directory containing one JSON file per trial.
        batch_size: Number of files per batch (the step of the outer loop).

    Returns:
        A DataFrame with one row per file; an empty DataFrame when the
        directory is empty.

    Raises:
        ValueError: If ``batch_size`` is 0 (raised by ``range``).
        json.JSONDecodeError: If a file or a model reply is not valid JSON.
    """
    file_list = os.listdir(dir_path)
    frames = []

    for i in range(0, len(file_list), batch_size):
        for file in file_list[i:i + batch_size]:
            print(f'Processing: {file}')
            # Close each handle right after parsing instead of leaking it.
            with open(os.path.join(dir_path, file)) as fh:
                trial = json.load(fh)
            res = summarizer(system_prompt=system_template, user_prompt=trial)
            frames.append(pd.DataFrame.from_dict(json.loads(res), orient='index').T)

    if not frames:
        return pd.DataFrame()
    # One concat at the end instead of re-copying the growing frame per file.
    return pd.concat(frames)

Concurrently:

In [19]:
from concurrent.futures import ThreadPoolExecutor
def generate_summarized_attrs_json_batch_threading(dir_path: str, batch_size: int = 100, unprocessed_ledger: list = None,
                                                   output_csv: str = '../ct_csv/SummerizedCTAttrs_P2.csv'):
    """Summarise the trial JSON files in ``dir_path`` using a thread pool.

    The directory listing is cut into batches of ``batch_size`` files and the
    batches are handed to up to 10 worker threads. Each worker summarises its
    files via ``summarizer`` (with the module-level ``system_template``) and
    appends the batch's rows to ``output_csv``.

    Args:
        dir_path: Directory containing one JSON file per trial.
        batch_size: Number of files per batch.
        unprocessed_ledger: Optional collection of file stems (the file name
            up to its first ``.``) that still need processing. Only files
            whose stem is in the ledger are summarised; the rest are reported
            as already processed. ``None`` (the default) means no ledger, so
            every file is summarised. The previous code tested the ledger's
            truthiness instead of membership, so it skipped every file with
            the default empty ledger and processed every file otherwise.
        output_csv: CSV file that each batch is appended to; ``None`` turns
            the CSV output off. Defaults to the previously hard-coded path.

    Returns:
        A DataFrame with one row per summarised file, in batch order; an
        empty DataFrame when nothing was summarised.

    Raises:
        ValueError: If ``batch_size`` is 0 (raised by ``range``).
        json.JSONDecodeError: If a file or a model reply is not valid JSON.
    """
    import threading  # function-scope import: only the CSV lock below needs it

    file_list = os.listdir(dir_path)
    # A set gives constant-time membership checks; None keeps "no filter" distinct
    # from an explicitly empty ledger ("nothing left to do").
    pending = None if unprocessed_ledger is None else set(unprocessed_ledger)
    # Serialises the appends so rows from different workers cannot interleave.
    csv_lock = threading.Lock()

    def process_batch(batch_files):
        """Summarise one batch, append it to the CSV and return its frame."""
        files_processed = 0
        frames = []
        for file in batch_files:
            file_name = file.split('.')[0].strip()
            if pending is not None and file_name not in pending:
                print(f'----File {file_name} Processed----')
                continue
            files_processed += 1
            print(f'Processing: {file} - {files_processed}')
            with open(os.path.join(dir_path, file)) as fh:
                json_data = json.load(fh)
            res = summarizer(system_prompt=system_template, user_prompt=json_data)
            json_obj = json.loads(res)
            frames.append(pd.DataFrame.from_dict(json_obj, orient='index').T)

        batch_df = pd.concat(frames) if frames else pd.DataFrame()
        if output_csv is not None and not batch_df.empty:
            with csv_lock:
                # Write the header only into a new/empty file; writing it for
                # every batch scattered header rows through the data.
                write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0
                batch_df.to_csv(output_csv, mode='a', index=False, header=write_header)
        return batch_df

    batches = [file_list[i:i + batch_size] for i in range(0, len(file_list), batch_size)]

    with ThreadPoolExecutor(max_workers=10) as executor:
        batch_dfs = list(executor.map(process_batch, batches))

    # pd.concat([]) raises ValueError, which is what an empty directory produced.
    if not batch_dfs:
        return pd.DataFrame()
    return pd.concat(batch_dfs)

Currently the entire JSON record of each study is sent to the model as context, which is why each request takes longer:


**Execution Time: 65.52171802520752**

In [None]:
# Wall-clock timing of one threaded summarisation pass over the recruiting folder.
start_time = time.time()
data = generate_summarized_attrs_json_batch_threading('../CT_Database/CT_Recruiting/')
end_time = time.time()
execution_time = end_time - start_time
print(f'Execution Time: {execution_time}')

In [39]:
import pickle
# Debug cell: print which files of one batch are still listed in the
# unprocessed ledger.
file_list = os.listdir('../CT_Database/CT_Recruiting/')
# NOTE(security): pickle.load can execute arbitrary code from the file - only
# load ledgers that were produced by this project.
# The context manager closes the ledger file, which the bare open() never did.
with open('../ct_csv/remaining_trials_nctnumbers.pkl', 'rb') as ledger_file:
    unprocessed_ledger = pickle.load(ledger_file)
batches = [file_list[i:i+100] for i in range(0, len(file_list), 100)]
files_processed = 0
temp_main = pd.DataFrame()
batch_df = pd.DataFrame()
# Batch 15 covers directory entries 1500-1599; indexing it raises IndexError
# when the folder holds 1500 files or fewer.
for file in batches[15]:
    files_processed += 1
    file_name = file.split('.')[0].strip()
    if file_name in unprocessed_ledger:
        print(file_name)
    # print(f'Processing: {file} - {files_processed}')


NCT05107674
NCT04882306
NCT04057209
NCT04339140
NCT05049746
NCT05458739
NCT04080284
NCT05585034
NCT05715255
NCT06073418
NCT05496829
NCT04595565
NCT05300269
NCT05301881
NCT04245683
NCT04967248
NCT05590559
NCT02918474
NCT04938609
NCT05406713
NCT05494697
NCT03412877
NCT05219500
NCT05181033
NCT05692024
NCT03144648
NCT04660435
NCT05055323
NCT05491616
NCT05518253
NCT04501523
NCT04921644
NCT04088708
NCT05992870
NCT02155621
NCT04761146
NCT05305092
NCT05645380
NCT06128694
NCT05780814
NCT03872661
NCT06085274
NCT01927744
NCT04669301
NCT05535192
NCT06234748
NCT01210027
NCT05848011
NCT03987555
NCT04298983
NCT05582499
NCT03759431
NCT04402606
NCT05065957
NCT05753618
NCT05713006
NCT05596435
NCT05919147
NCT05309265
NCT05215574
NCT04657068
NCT05335473
NCT05959889
NCT04982926
NCT04929223
NCT05969496
NCT05296577
NCT04276272
NCT02201992
NCT06059118
NCT05396300
NCT03955627
NCT03750539
NCT05226078
NCT05841420
NCT04144907
NCT05797168
NCT05013216
NCT05982626
NCT06051695
NCT04829643
NCT04480203
NCT05306041
NCT0

In [261]:
# Wall-clock timing of the sequential JSON-mode summariser on the CT_2 folder.
start_time = time.time()
data_2 = generate_summarized_attrs_json('../CT_Database/CT_2/')
end_time = time.time()
execution_time = end_time - start_time
print(f'Execution Time: {execution_time}')

Execution Time: 40.68126583099365


In [232]:
# Lift the column-width limit so the long summary texts are displayed in full.
# This is a global pandas option and stays in effect for later cells.
pd.set_option('display.max_colwidth', None)
data

Unnamed: 0,NCT_NUMBER,PRIMARY_OUTCOMES,SECONDARY_OUTCOMES,INTERVENTIONS,LOCATIONS,STUDY_RESULTS
0,NCT05496101,"The primary outcome of this study is to assess the margin status of wide local excision (WLE) specimens and cavity shavings, if any, using LightPath CLI + FAR imaging compared with final histopathology results. A positive margin on histopathology is defined as invasive carcinoma: positive if <1mm; negative if ≥1mm, and for Ductal carcinoma in situ (DCIS), if present: positive if <2mm; negative if ≥2mm. This assessment occurs during the complete surgical procedure.","Secondary outcomes include the agreement between the margin status of cavity shavings as determined by intraoperative CLI + FAR LightPath imaging and post-operative histopathology, and the re-operation rate within the study cohort compared to the general breast cancer population undergoing a WLE. These outcomes are evaluated during the complete surgical procedure.","The intervention in this study involves the use of the LightPath Imaging System, an in vitro diagnostic device with CE mark in Europe, for intraoperative Cerenkov luminescence imaging (CLI) plus flexible autoradiography (FAR) during breast-conserving surgery (BCS) for breast cancer. This is compared with standard-of-care histopathology. The imaging system uses 18F-FDG, a routinely used Positron Emission Tomography (PET)/Computed Tomography (CT) radiopharmaceutical.","Guy's and St Thomas NHS Trust, London, SE19RT, United Kingdom","As the study is currently recruiting and has not posted results, there is no summary of study results available at this time."
0,NCT05935384,"The primary outcome of the study is to evaluate the sensitivity of circulating tumor DNA (ctDNA) in detecting disease progression across the primary study cohorts, which include patients with breast cancer, non-small cell lung cancer (NSCLC), or colorectal cancer (CRC). This evaluation is planned to be conducted over a period of 6 years.","Secondary outcomes of the study include assessing the RECIST v1.1 response, which measures tumor response to treatment through restaging scans and correlates changes in ctDNA quantities with clinical response. Progression-Free Survival (PFS), defined by the correlation of quantitative changes in ctDNA with participants' progression-free survival on each line of standard of care therapy, and the lead time between ctDNA detection or increase and clinical detection of disease progression, are also key secondary outcomes. These will be observed over a 6-year timeframe.","The intervention involves the diagnostic test Guardant360, a next-generation sequencing-based in vitro diagnostic device. It detects single nucleotide variants, insertions and deletions, copy number amplifications, and fusions in genes frequently mutated in cancer using circulating cell-free DNA obtained from the plasma of peripheral whole blood. Participants across various cohorts, including those with unresectable stage III/IV NSCLC, stage IV colorectal cancer, and unresectable stage III/IV breast cancer (HR+ HER2-, HR- HER2+, Triple Positive, Triple Negative), will have their blood samples collected and banked for this diagnostic test.","Orchard Healthcare Research Inc., Skokie, Illinois, 60077-1384, United States","As the study is currently recruiting and has not posted results, there is no summary available for the study results at this time."
0,NCT02034981,"The primary outcome of this study is to assess the efficacy of crizotinib as a single agent in treating diverse types of tumors that have identified activating molecular alterations in crizotinib target genes. Efficacy will be measured by the objective response, defined as either a complete response (CR) or partial response (PR) according to Response Evaluation Criteria in Solid Tumors (RECIST) 1.1 criteria, after 2 cycles (8 weeks) of treatment.","Secondary outcomes include assessing the safety profile of crizotinib, disease control rate, response duration, progression-free survival, and overall survival. The safety profile will be evaluated throughout the treatment and a 2-year post-treatment follow-up period using the International Common Terminology Criteria for Adverse Events (CTCAE), version 4.0. Disease control rate, response duration, progression-free survival, and overall survival will be measured at specified intervals during and after treatment.","Participants in this study will receive oral crizotinib as monotherapy. The dosage is 250 mg twice daily for adults aged 18 years and above, 280 mg/m² twice daily for children and adolescents aged from 1 to 17 (except for ALCL patients), and 165 mg/m² twice daily for ALCL patients aged from 1 to 17. Treatment will continue daily continuously until progression or unacceptable toxicity develops.","Gustave Roussy, Villejuif, Ile De France, 94805, France","No study results are provided as the study is active, not recruiting, and has not posted any results."


In [238]:
def get_memory_usage(df: pd.DataFrame):
    """Print and return the memory footprint of ``df`` in mebibytes.

    The footprint is the sum of ``df.memory_usage(deep=True)`` divided by
    1024 * 1024 and rounded to two decimals.

    Args:
        df: Frame to measure.

    Returns:
        The rounded size in MB (the same number that is printed).
    """
    bytes_per_mb = 1024 * 1024
    total_bytes = df.memory_usage(deep=True).sum()
    memoryUsage = round(total_bytes / bytes_per_mb, 2)
    print(f'Memory usage: {memoryUsage} MB')
    return memoryUsage

In [239]:
# Report how much memory the summarised frame `data` occupies.
get_memory_usage(data)

Memory usage: 0.01 MB


0.01

Checking that the Recruiting-only rows and the new folder contain the same NCT numbers:

In [281]:
# Load the full export, keep the rows whose 'Study Status' is exactly
# 'Recruiting', and save that subset next to the source file.
temp_data = pd.read_csv('../ct_csv/CT_07_03_2024_ElgCriteriaOnly.csv')
df_recruiting = temp_data.loc[temp_data['Study Status'].eq('Recruiting')]
df_recruiting.to_csv('../ct_csv/CTRecruiting.csv')  # persist the filtered frame

In [270]:
# Number of distinct NCT numbers among the recruiting rows (missing values count as one).
unique_nct_numbers_recruiting_count = df_recruiting['NCT Number'].nunique(dropna=False)
print(f'Unique NCT number count - Recruiting: {unique_nct_numbers_recruiting_count}')

Unique NCT number count - Recruiting: 3836


In [271]:
# Number of entries in the Clinical_Trials folder, to compare with the unique NCT count.
new_folder_file_count_13_03 = len(os.listdir('../CT_Database/Clinical_Trials/'))
print(f'Only Recruiting file count: {new_folder_file_count_13_03}')

Only Recruiting file count: 3836


Get Locations:

In [34]:
def get_location(system_prompt, user_prompt, model='gpt-4-0125-preview', temperature=0, verbose=False):
    """Query the chat model in JSON mode and return the reply text.

    Args:
        system_prompt: Text sent as the ``system`` message.
        user_prompt: Payload sent as the ``user`` message after ``str()``.
        model: Chat model identifier forwarded to the API.
        temperature: Sampling temperature forwarded to the API.
        verbose: Accepted but not used by this function.

    Returns:
        The ``content`` of the first choice in the response.
    """
    conversation = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=str(user_prompt)),
    ]
    completion = openai.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=conversation,
        max_tokens=1024,
        response_format=dict(type="json_object"),
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [35]:
# System prompt for location extraction. The example between the BEGIN/END
# markers is valid JSON (a comma after every field).
# NOTE(review): this rebinds the module-level `system_template` that the
# summarisation helpers read at call time; re-run the summary-template cell
# before summarising trials again.
system_template = '''Give the names of any location such as city, state, country from the provided sentence in the specified format:
---BEGIN FORMAT TEMPLATE---
{"CITY": "city",
"STATE": "state of the city",
"COUNTRY": "the country the city and state belong to"}
---END FORMAT TEMPLATE---
Give the output of the format template in json format
'''

In [36]:
# Run the location prompt on a free-text patient query; the reply text is kept in `res_loc`.
res_loc = get_location(system_prompt=system_template, user_prompt='Generate a list of clinical trials for a patient with newly diagnosed stage IV NSCLC for first line treatment options in the washington illinois?')

{"CITY":"Washington",
"STATE":"Illinois",
"COUNTRY": "United States"}


TypeError: 'ChatCompletion' object is not subscriptable

In [53]:
# Show the reply exactly as returned: a JSON-formatted string, not yet parsed.
res_loc

'{"CITY":"Washington",\n"STATE":"Illinois",\n"COUNTRY": "United States"}'