In [1]:
import pandas as pd
import openai


In [2]:
# Load the test split and keep only the trial identifier and the raw trial text.
# The column selection is applied directly to the read_csv result, so the full
# frame is never bound to a name.
ds_annotated_full = pd.read_csv("../../data/annotated_data/data_splits/ct_neuro_test_merged_90.csv")[['nct_id','text']]
# Preview the first rows.
ds_annotated_full.head()

Unnamed: 0,nct_id,text
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo..."
1,NCT04576507,Effects of Repeated Cannabis Administration on...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv..."
4,NCT03150563,Effects of Different Intensities of Passive St...


In [3]:
def load_pass(file_path, key_to_find):
    """Look up a secret in a plain-text ``KEY=VALUE`` credentials file.

    Args:
        file_path: Path to the credentials file, one ``KEY=VALUE`` pair per line.
        key_to_find: Key whose value should be returned.

    Returns:
        The value of the first line whose key equals ``key_to_find``, or
        ``None`` when no such line exists or its value is empty.
    """
    # Start from None so a missing key reaches the "not found" branch below
    # instead of raising UnboundLocalError.
    found_password = None
    with open(file_path, 'r') as file:
        for line in file:
            # Split on the first "=" only: the secret itself may contain "=".
            key, separator, value = line.strip().partition("=")
            if separator and key == key_to_find:
                found_password = value
                break
    if found_password:
        print("Found password.")
        return found_password
    else:
        print("Password not found for key:", key_to_find)
        return None

In [4]:
import time
def query_gpt(input_raw_text, task, max_retries=5, retry_delay=2):
    """Ask gpt-3.5-turbo to extract entities from one clinical-trial text.

    Args:
        input_raw_text: Trial text that is appended to the task prompt.
        task: ``"extract_interventions"`` or ``"extract_conditions"``.
        max_retries: Maximum number of API attempts before giving up.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The message content of the first completion choice.

    Raises:
        ValueError: If ``task`` is not one of the supported tasks.
        RuntimeError: If every attempt failed with one of the handled
            OpenAI errors.
    """
    prompts = {
        "extract_interventions": "Extract the drug names from the following clinical trial and return them in a list separated with the | symbol. If none is found, return only the word none.: ",
        "extract_conditions": "Extract the investigated disease names and related symptoms from the following clinical trial. Return them in a single list separated with the | symbol. If none is found, return only the word none: ",
    }
    # Reject unknown tasks up front; otherwise `prompt` would be unbound and
    # the failure would only surface inside the API call.
    if task not in prompts:
        raise ValueError(f"Unknown task {task!r}; expected one of {sorted(prompts)}")
    prompt = prompts[task]

    # Fixed 3-second pause before every query (presumably to stay under the
    # API rate limit when called in a loop).
    time.sleep(3)

    # Errors that are logged and retried, paired with the message to print.
    handled_errors = (
        (openai.error.Timeout, "OpenAI API request timed out"),
        (openai.error.APIError, "OpenAI API returned an API Error"),
        (openai.error.APIConnectionError, "OpenAI API request failed to connect"),
        (openai.error.InvalidRequestError, "OpenAI API request was invalid"),
        (openai.error.AuthenticationError, "OpenAI API request was not authorized"),
        (openai.error.PermissionError, "OpenAI API request was not permitted"),
        (openai.error.RateLimitError, "OpenAI API request exceeded rate limit"),
    )
    error_types = tuple(error_type for error_type, _ in handled_errors)

    # Bounded loop: each failed attempt consumes one retry, so the function
    # always terminates (the attempt counter was previously never advanced).
    for attempt in range(1, max_retries + 1):
        try:
            completion = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                temperature=0.6,
                max_tokens=2000,
                messages=[
                    {"role": "system", "content": "You are an expert information extraction assistant from clinical trials."},
                    {"role": "user",
                     "content": prompt + input_raw_text}
                ]
            )
            return completion.choices[0].message.content
        except error_types as e:
            # First matching entry wins, like the order of an except chain.
            message = next(text for error_type, text in handled_errors if isinstance(e, error_type))
            print(f"{message}: {e}")
        # Back off before the next attempt, but not after the last one.
        if attempt < max_retries:
            time.sleep(retry_delay)
    raise RuntimeError("Max retries reached. Unable to complete the API call.")


In [6]:
# Read the OpenAI API key from the local KEY=VALUE credentials file instead of
# hard-coding it in the notebook.
openai.api_key = load_pass("../../credentials.txt", "OPENAI")


Found password.


In [7]:
# Inspect the full text of the trial at row label 0.
ds_annotated_full['text'][0]

'A Phase 1-3, Double-Blind, Randomized, Placebo-Controlled Study to Evaluate the Efficacy, Safety, Pharmacokinetics and Pharmacodynamics of Intrathecally Administered ION373 in Patients With Alexander Disease | The purpose of this study is to evaluate the safety and efficacy of ION373 in improving or stabilizing gross motor function across the full range of affected domains in patients with AxD.'

In [8]:
# Inspect the full text of the trial at row label 89.
ds_annotated_full['text'][89]

"Polyphenol Rich Supplementation on Markers of Recovery From Intense Resistance Exercise | Strength training is commonly used as an intervention to increase muscle mass, thus improving a person's ability to undertake activities of daily living, or enhance athletic performance. The strength training regimen itself, while ultimately having beneficial effects, causes muscle fibers to be damaged, which the body has to recover from. As the body recovers, it rebuilds the muscle tissue and after multiple consecutive bouts of strength training, the muscle eventually becomes larger and stronger. Thus, it is the recovery from strength training exercise that ultimately determines how well the body adapts. Where inadequate recovery could eventually lead to overtraining and/or injury, optimizing the recovery process from strength training could maximize strength training adaptations. This concept of optimizing recovery has led to development of many supplements, including antioxidants, which may re

In [9]:
# Spot-check the condition-extraction prompt on a single trial before the full run.
query_gpt(ds_annotated_full['text'][0], "extract_conditions")

'Alexander Disease | none'

In [11]:
# Spot-check the intervention-extraction prompt on a single trial before the full run.
query_gpt(ds_annotated_full['text'][86], "extract_interventions")

'Exelon®|rivastigmine patch'

In [10]:
#ds_annotated_full['gpt_predictions_interventions'] = ds_annotated_full['text'].apply(lambda text: query_gpt(text, "extract_interventions"))
from tqdm import tqdm  # Import tqdm for the progress bar

# Define a function to apply GPT queries with a progress bar
def apply_gpt_with_progress(data_series, task):
    """Run ``query_gpt`` on every text in ``data_series`` behind a progress bar.

    Args:
        data_series: Iterable of trial texts with a defined length.
        task: Task name forwarded unchanged to ``query_gpt``.

    Returns:
        A list with one model reply per input text, in input order.
    """
    predictions = []

    # The bar is advanced manually, one tick per finished query.
    with tqdm(total=len(data_series), desc=f"Processing {task}") as progress:
        for raw_text in data_series:
            predictions.append(query_gpt(raw_text, task))
            progress.update(1)

    return predictions

# Query the model once per trial text and store the raw replies as a new column.
ds_annotated_full['gpt_predictions_conditions'] = apply_gpt_with_progress(ds_annotated_full['text'], "extract_conditions")

Processing extract_conditions:  84%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                         | 76/90 [05:39<00:58,  4.20s/it]

OpenAI API request timed out: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600)


Processing extract_conditions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [16:42<00:00, 11.14s/it]


In [12]:
# Same pass for the intervention prompt: one query per trial text, raw replies stored as a new column.
ds_annotated_full['gpt_predictions_interventions'] = apply_gpt_with_progress(ds_annotated_full['text'], "extract_interventions")

Processing extract_interventions: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [06:09<00:00,  4.10s/it]


In [13]:
# Display the frame with both raw prediction columns.
ds_annotated_full

Unnamed: 0,nct_id,text,gpt_predictions_conditions,gpt_predictions_interventions
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease | none,ION373
1,NCT04576507,Effects of Repeated Cannabis Administration on...,investigated disease names: chronic pain\n\nre...,cannabis|opioids|cannabinoids|delta-9-tetrahyd...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder|Pervasive Developmental Diso...,risperidone|placebo
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral radiculopathy | radicular pain,Epidural Steroid Injections | Gabapentin
4,NCT03150563,Effects of Different Intensities of Passive St...,none,none
...,...,...,...,...
85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis|inflammation|tobacco smoking|dia...,Periodontitis
86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...",Alzheimer's Disease (AD) | functional and cogn...,Exelon®|rivastigmine
87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...",Investigated disease names: Vascular Disease o...,Gadobutrol
88,NCT04184206,Effects of Attention Training Interventions on...,Depression|none,none


In [14]:
# Post-process a copy so the raw model replies in ds_annotated_full stay unchanged.
ds_annotated_full_copy = ds_annotated_full.copy()

### Post-processing of the raw GPT predictions

In [15]:
# Drop parenthesised asides such as "(AD)" from the condition predictions.
# The character class keeps each match inside one innermost pair of parentheses;
# a greedy `\(.*\)` would remove everything between the first "(" and the last
# ")" on a line, deleting the entities listed between two asides.
ds_annotated_full_copy['gpt_predictions_conditions'] = ds_annotated_full_copy['gpt_predictions_conditions'].str.replace(r'\([^()]*\)', '', regex=True)
# Strip the introductory sentences the model sometimes puts before the drug list.
# regex=False makes the literal matching explicit, independent of the pandas default.
ds_annotated_full_copy['gpt_predictions_interventions'] = (
    ds_annotated_full_copy['gpt_predictions_interventions']
    .str.replace('The drug names in the given clinical trial are:', '', regex=False)
    .str.replace('The drug names found in the clinical trial are:', '', regex=False)
    .str.replace('The drug names mentioned in the clinical trial are:', '', regex=False)
)

In [16]:
# The three commented-out statements below are disabled; only the last line of
# this cell runs. It turns every comma in the condition predictions into the
# "|" separator.
#ds_annotated_full_copy.loc[ds_annotated_full_copy['gpt_predictions_interventions'].str.contains('investigated disease names:'), 'gpt_predictions_interventions'] = ds_annotated_full_copy['gpt_predictions_interventions'].str.replace(',', '|')
#ds_annotated_full_copy.loc[ds_annotated_full_copy['gpt_predictions_interventions'].str.contains('Investigated disease names:'), 'gpt_predictions_interventions'] = ds_annotated_full_copy['gpt_predictions_interventions'].str.replace(',', '|')
#ds_annotated_full_copy.loc[ds_annotated_full_copy['gpt_predictions_interventions'].str.contains('disease names:'), 'gpt_predictions_interventions'] = ds_annotated_full_copy['gpt_predictions_interventions'].str.replace(',', '|')
ds_annotated_full_copy['gpt_predictions_conditions'] = ds_annotated_full_copy['gpt_predictions_conditions'].str.replace(',','|')


In [17]:
# Remove the section labels the model sometimes puts before the disease list.
# The longer labels are removed first so that no "investigated " prefix is left
# behind by the shorter 'disease names:' pattern.
ds_annotated_full_copy['gpt_predictions_conditions'] = (
    ds_annotated_full_copy['gpt_predictions_conditions']
    .str.replace('investigated disease names:', '')
    .str.replace('Investigated disease names:', '')
    .str.replace('disease names:', '')
)

In [18]:
# Replace the "related symptoms:" label (either capitalisation) with the "|"
# separator, so diseases and symptoms end up in one delimited list.
ds_annotated_full_copy['gpt_predictions_conditions'] = (
    ds_annotated_full_copy['gpt_predictions_conditions']
    .str.replace('related symptoms:', '|')
    .str.replace('Related symptoms:', '|')
)

In [19]:
# Display the post-processed predictions.
ds_annotated_full_copy

Unnamed: 0,nct_id,text,gpt_predictions_conditions,gpt_predictions_interventions
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease | none,ION373
1,NCT04576507,Effects of Repeated Cannabis Administration on...,chronic pain\n\n| hyperalgesia,cannabis|opioids|cannabinoids|delta-9-tetrahyd...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder|Pervasive Developmental Diso...,risperidone|placebo
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral radiculopathy | radicular pain,Epidural Steroid Injections | Gabapentin
4,NCT03150563,Effects of Different Intensities of Passive St...,none,none
...,...,...,...,...
85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis|inflammation|tobacco smoking|dia...,Periodontitis
86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...",Alzheimer's Disease | functional and cognitiv...,Exelon®|rivastigmine
87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...",Vascular Disease of the Supra-aortic Vessels\...,Gadobutrol
88,NCT04184206,Effects of Attention Training Interventions on...,Depression|none,none


In [20]:
# Save the post-processed predictions. No `index` argument is passed, so the
# DataFrame index is written as an unnamed first column (the to_csv default).
ds_annotated_full_copy.to_csv("../predictions/ct_neuro_test_annotated_GPT-3-turbo_20230817_v4.csv")