In [1]:
import pandas as pd
import openai


In [2]:
ds_annotated_full = pd.read_csv("../../data/annotated_data/data_splits/ct_neuro_test_merged_90.csv")[['nct_id','text']]
ds_annotated_full.head()

Unnamed: 0,nct_id,text
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo..."
1,NCT04576507,Effects of Repeated Cannabis Administration on...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv..."
4,NCT03150563,Effects of Different Intensities of Passive St...


In [3]:
def load_pass(file_path, key_to_find):
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split("=")
            if len(parts) == 2 and parts[0] == key_to_find:
                found_password = parts[1]
                break
    if found_password:
        print("Found password.")
        return found_password
    else:
        print("Password not found for key:", key_to_find)

In [4]:
import time
def query_gpt(input_raw_text, prompt, gpt_model="gpt-3.5-turbo", max_retries=5, retry_delay=2):
    # Add a 20-second delay at the beginning of the function
    time.sleep(3)
    
    retries = 0
    while retries < max_retries:
        try:
            completion = openai.ChatCompletion.create(
                model=gpt_model,
                temperature=0.6,
                max_tokens=2000,
                messages=[
                    {"role": "system", "content": "You are an expert information extraction assistant from clinical trials."},
                    {"role": "user",
                     "content": prompt + "'''" + input_raw_text + "'''"}
                ]
            )
            return completion.choices[0].message.content
        except openai.error.Timeout as e:
          #Handle timeout error, e.g. retry or log
          print(f"OpenAI API request timed out: {e}")
          pass
        except openai.error.APIError as e:
          #Handle API error, e.g. retry or log
          print(f"OpenAI API returned an API Error: {e}")
          pass
        except openai.error.APIConnectionError as e:
          #Handle connection error, e.g. check network or log
          print(f"OpenAI API request failed to connect: {e}")
          pass
        except openai.error.InvalidRequestError as e:
          #Handle invalid request error, e.g. validate parameters or log
          print(f"OpenAI API request was invalid: {e}")
          pass
        except openai.error.AuthenticationError as e:
          #Handle authentication error, e.g. check credentials or log
          print(f"OpenAI API request was not authorized: {e}")
          pass
        except openai.error.PermissionError as e:
          #Handle permission error, e.g. check scope or log
          print(f"OpenAI API request was not permitted: {e}")
          pass
        except openai.error.RateLimitError as e:
          #Handle rate limit error, e.g. wait or log
          print(f"OpenAI API request exceeded rate limit: {e}")
          pass # Re-raise the exception for other errors
    raise RuntimeError("Max retries reached. Unable to complete the API call.")


In [5]:
openai.api_key = load_pass("../../credentials.txt", "OPENAI")


Found password.


In [6]:
interventions_prompt_v1 = "Extract the drug names from the following clinical trial and return them in a list separated with the | symbol. If none is found, return only the word none.: "
conditions_prompt_v1 = "Extract the investigated disease names and related symptoms from the following clinical trial. Return them in a single list separated with the | symbol. If none is found, return only the word none: "


In [7]:
interventions_prompt_v2 = "Review the clinical trial document enclosed within triple quotes. Extract only the names of drugs that are actively being investigated in the trial. List these names separated by the '|' symbol without any additional text or explanation. Exclude drugs merely mentioned and not under investigation. If there are no drugs actively investigated, simply respond with 'none'. Focus solely on the drug names for clarity and precision."

conditions_prompt_v2 = "Examine the clinical trial document within the triple quotes. Identify and list only the names of diseases and related symptoms under investigation. Format this list with each name or symptom separated by the '|' symbol, omitting any additional descriptions or text. Exclude diseases and symptoms that are only mentioned but not investigated. If there are no diseases or symptoms actively investigated, answer with 'none'. The response should strictly contain the list of names and symptoms."

### example

In [59]:
ds_annotated_full['text'][0]

'A Phase 1-3, Double-Blind, Randomized, Placebo-Controlled Study to Evaluate the Efficacy, Safety, Pharmacokinetics and Pharmacodynamics of Intrathecally Administered ION373 in Patients With Alexander Disease | The purpose of this study is to evaluate the safety and efficacy of ION373 in improving or stabilizing gross motor function across the full range of affected domains in patients with AxD.'

In [60]:
ds_annotated_full['text'][89]

"Polyphenol Rich Supplementation on Markers of Recovery From Intense Resistance Exercise | Strength training is commonly used as an intervention to increase muscle mass, thus improving a person's ability to undertake activities of daily living, or enhance athletic performance. The strength training regimen itself, while ultimately having beneficial effects, causes muscle fibers to be damaged, which the body has to recover from. As the body recovers, it rebuilds the muscle tissue and after multiple consecutive bouts of strength training, the muscle eventually becomes larger and stronger. Thus, it is the recovery from strength training exercise that ultimately determines how well the body adapts. Where inadequate recovery could eventually lead to overtraining and/or injury, optimizing the recovery process from strength training could maximize strength training adaptations. This concept of optimizing recovery has led to development of many supplements, including antioxidants, which may re

In [61]:
query_gpt(ds_annotated_full['text'][0], prompt=conditions_prompt_v1)

'Alexander Disease | none'

In [64]:
query_gpt(ds_annotated_full['text'][0], prompt=conditions_prompt_v2, gpt_model="gpt-3.5-turbo")

'Alexander Disease | gross motor function'

In [65]:
query_gpt(ds_annotated_full['text'][86], prompt=interventions_prompt_v2, gpt_model="gpt-3.5-turbo")

'Exelon'

## run over full test dataset

In [8]:
#ds_annotated_full['gpt_predictions_interventions'] = ds_annotated_full['text'].apply(lambda text: query_gpt(text, "extract_interventions"))
from tqdm import tqdm  # Import tqdm for the progress bar

# Define a function to apply GPT queries with a progress bar
def apply_gpt_with_progress(data_series, gpt_prompt, gpt_model="gpt-3.5-turbo"):
    results = []
    total_items = len(data_series)
    #print(f"Using {gpt_model} with prompt {gpt_prompt}.")

    # Create a tqdm progress bar
    with tqdm(total=total_items, desc=f"Processing {gpt_prompt} with {gpt_model}") as pbar:
        for text in data_series:
            result = query_gpt(text, prompt=gpt_prompt, gpt_model=gpt_model)
            results.append(result)
            pbar.update(1)  # Update the progress bar

    return results


In [9]:
def clean_data(df, conditions_col, interventions_col):
    """
    Clean and format the 'conditions' and 'interventions' columns in a dataframe.

    Args:
    df (DataFrame): The dataframe containing the data.
    conditions_col (str): The name of the column containing condition data.
    interventions_col (str): The name of the column containing intervention data.

    Returns:
    DataFrame: The dataframe with cleaned and formatted data.
    """

    # Cleaning the conditions column
    df[conditions_col] = df[conditions_col].str.replace(r'\(.*\)', '', regex=True)
    df[conditions_col] = df[conditions_col].str.replace(',','|')
    df[conditions_col] = df[conditions_col].str.replace('\"','')
    df[conditions_col] = df[conditions_col].str.replace('investigated disease names:','', case=False)
    df[conditions_col] = df[conditions_col].str.replace('disease names:','', case=False)
    df[conditions_col] = df[conditions_col].str.replace('diseases:','', case=False)
    df[conditions_col] = df[conditions_col].str.replace('related symptoms:','|', case=False)
    df[conditions_col] = df[conditions_col].str.replace('symptoms:','|', case=False)


    # Cleaning the interventions column
    intervention_phrases = [
        'The drug names in the given clinical trial are:',
        'The drug names found in the clinical trial are:',
        'The drug names mentioned in the clinical trial are:',
        'The drugs being actively investigated in this clinical trial are:',
        'The drugs being actively investigated in this clinical trial are',
        'The drug being actively investigated in this clinical trial is'

    ]
    for phrase in intervention_phrases:
        df[interventions_col] = df[interventions_col].str.replace(phrase, '')

    return df

In [10]:
def annotate_clean_save(df, gpt_model, prompt_conditions, prompt_drugs, out_file_suffix="prompt_v2"):
    conditions_col_name = f'{gpt_model}_predictions_conditions'
    interventions_col_name = f'{gpt_model}_predictions_interventions'
    
    df[f'{gpt_model}_predictions_conditions'] = apply_gpt_with_progress(df['text'], gpt_prompt=prompt_conditions, gpt_model=gpt_model)
    df[f'{gpt_model}_predictions_interventions'] = apply_gpt_with_progress(df['text'], gpt_prompt=prompt_drugs, gpt_model=gpt_model)
    
    ds_annotated_full_copy = df.copy()
    cleaned_df = clean_data(ds_annotated_full_copy, conditions_col_name, interventions_col_name)

    cleaned_df.to_csv(f"../predictions/ct_neuro_test_annotated_{gpt_model}_{out_file_suffix}_20240129.csv")
    
    return cleaned_df

In [14]:
ds_annotated_full.head()

Unnamed: 0,nct_id,text
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo..."
1,NCT04576507,Effects of Repeated Cannabis Administration on...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv..."
4,NCT03150563,Effects of Different Intensities of Passive St...


### gpt-3.5-turbo

In [37]:
conditions_prompt_v1

'Extract the investigated disease names and related symptoms from the following clinical trial. Return them in a single list separated with the | symbol. If none is found, return only the word none: '

In [38]:
interventions_prompt_v1

'Extract the drug names from the following clinical trial and return them in a list separated with the | symbol. If none is found, return only the word none.: '

In [20]:
annotations_gpt_3_5 = annotate_clean_save(ds_annotated_full.copy(), "gpt-3.5-turbo", conditions_prompt_v1, interventions_prompt_v1, out_file_suffix="prompt_v1")

Processing Extract the investigated disease names and related symptoms from the following clinical trial. Return them in a single list separated with the | symbol. If none is found, return only the word none:  with gpt-3.5-t
Processing Extract the drug names from the following clinical trial and return them in a list separated with the | symbol. If none is found, return only the word none.:  with gpt-3.5-turbo: 100%|█| 90/90 [06:36<00:00,  4.41s


In [39]:
conditions_prompt_v2

"Examine the clinical trial document within the triple quotes. Identify and list only the names of diseases and related symptoms under investigation. Format this list with each name or symptom separated by the '|' symbol, omitting any additional descriptions or text. Exclude diseases and symptoms that are only mentioned but not investigated. If there are no diseases or symptoms actively investigated, answer with 'none'. The response should strictly contain the list of names and symptoms."

In [40]:
interventions_prompt_v2

"Review the clinical trial document enclosed within triple quotes. Extract only the names of drugs that are actively being investigated in the trial. List these names separated by the '|' symbol without any additional text or explanation. Exclude drugs merely mentioned and not under investigation. If there are no drugs actively investigated, simply respond with 'none'. Focus solely on the drug names for clarity and precision."

In [23]:
annotations_gpt_3_5 = annotate_clean_save(ds_annotated_full.copy(), "gpt-3.5-turbo", conditions_prompt_v2, interventions_prompt_v2, out_file_suffix="prompt_v2")

Processing Examine the clinical trial document within the triple quotes. Identify and list only the names of diseases and related symptoms under investigation. Format this list with each name or symptom separated by the '|' 
Processing Review the clinical trial document enclosed within triple quotes. Extract only the names of drugs that are actively being investigated in the trial. List these names separated by the '|' symbol without any additio


### gpt-4

In [22]:
annotations_gpt_4 = annotate_clean_save(ds_annotated_full.copy(), "gpt-4", conditions_prompt_v1, interventions_prompt_v1, out_file_suffix="prompt_v1")

Processing Extract the investigated disease names and related symptoms from the following clinical trial. Return them in a single list separated with the | symbol. If none is found, return only the word none:  with gpt-4: 10
Processing Extract the drug names from the following clinical trial and return them in a list separated with the | symbol. If none is found, return only the word none.:  with gpt-4: 100%|█████| 90/90 [05:45<00:00,  3.84s/it]


In [24]:
annotations_gpt_4 = annotate_clean_save(ds_annotated_full.copy(), "gpt-4", conditions_prompt_v2, interventions_prompt_v2, out_file_suffix="prompt_v2")

Processing Examine the clinical trial document within the triple quotes. Identify and list only the names of diseases and related symptoms under investigation. Format this list with each name or symptom separated by the '|' 
Processing Review the clinical trial document enclosed within triple quotes. Extract only the names of drugs that are actively being investigated in the trial. List these names separated by the '|' symbol without any additio


In [31]:
gpt_model = "gpt-4"
out_file_suffix = "prompt_v2"
gpt_4 = pd.read_csv(f"../predictions/ct_neuro_test_annotated_{gpt_model}_{out_file_suffix}_20240129.csv")

In [32]:
gpt_4

Unnamed: 0.1,Unnamed: 0,nct_id,text,gpt-4_predictions_conditions,gpt-4_predictions_interventions
0,0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease | gross motor function,ION373
1,1,NCT04576507,Effects of Repeated Cannabis Administration on...,Chronic pain|opioid use disorders|hyperalgesia,Cannabis | delta-9-tetrahydrocannabinol (THC) ...
2,2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder|Pervasive Developmental Diso...,Risperidone
3,3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral Radiculopathy | Lumbosacral Radicu...,Epidural Steroid Injections|Gabapentin
4,4,NCT03150563,Effects of Different Intensities of Passive St...,none,none
...,...,...,...,...,...
85,85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis | Dementia | Asymptomatic Caroti...,none
86,86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...",Alzheimer's Disease | Functional Decline | Cog...,Exelon | rivastigmine patch
87,87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...","""Vascular Disease of the Supra-aortic Vessels""",Gadobutrol
88,88,NCT04184206,Effects of Attention Training Interventions on...,Depression | Major depressive disorder,none


In [35]:
# Removing quotation marks that surround text values
gpt_4['gpt-4_predictions_conditions'] = gpt_4['gpt-4_predictions_conditions'].str.replace('^"|"$', '', regex=True)
gpt_4['gpt-4_predictions_conditions'] = gpt_4['gpt-4_predictions_conditions'].str.replace('^["\']|["\']$', '', regex=True)

# Display the modified DataFrame
gpt_4

Unnamed: 0.1,Unnamed: 0,nct_id,text,gpt-4_predictions_conditions,gpt-4_predictions_interventions
0,0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease | gross motor function,ION373
1,1,NCT04576507,Effects of Repeated Cannabis Administration on...,Chronic pain|opioid use disorders|hyperalgesia,Cannabis | delta-9-tetrahydrocannabinol (THC) ...
2,2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder|Pervasive Developmental Diso...,Risperidone
3,3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral Radiculopathy | Lumbosacral Radicu...,Epidural Steroid Injections|Gabapentin
4,4,NCT03150563,Effects of Different Intensities of Passive St...,none,none
...,...,...,...,...,...
85,85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis | Dementia | Asymptomatic Caroti...,none
86,86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...",Alzheimer's Disease | Functional Decline | Cog...,Exelon | rivastigmine patch
87,87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...",Vascular Disease of the Supra-aortic Vessels,Gadobutrol
88,88,NCT04184206,Effects of Attention Training Interventions on...,Depression | Major depressive disorder,none


In [36]:
gpt_4.to_csv(f"../predictions/ct_neuro_test_annotated_{gpt_model}_{out_file_suffix}_20240129.csv")

## demo 

In [None]:
gpt_model="gpt-4"

In [39]:
ds_annotated_full[f'{gpt_model}_predictions_conditions'] = apply_gpt_with_progress(ds_annotated_full['text'], prompt=interventions_prompt_v1, gpt_model=gpt_model)

Using gpt-4 for extract_conditions.


Processing extract_conditions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [06:44<00:00,  4.49s/it]


In [40]:
ds_annotated_full[f'{gpt_model}_predictions_interventions'] = apply_gpt_with_progress(ds_annotated_full['text'], prompt=conditions_prompt_v1, gpt_model=gpt_model)

Using gpt-4 for extract_interventions.


Processing extract_interventions:  28%|██████████████████████████████████████████▌                                                                                                              | 25/90 [06:14<30:02, 27.73s/it]

OpenAI API request failed to connect: Error communicating with OpenAI: HTTPSConnectionPool(host='api.openai.com', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f9602ae72e0>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))


Processing extract_interventions: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 90/90 [14:11<00:00,  9.46s/it]


In [41]:
ds_annotated_full

Unnamed: 0,nct_id,text,gpt-4_predictions_conditions,gpt-4_predictions_interventions
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease,ION373
1,NCT04576507,Effects of Repeated Cannabis Administration on...,Chronic pain,Cannabis | delta-9-tetrahydrocannabinol (THC) ...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder | Pervasive Developmental Di...,Risperidone|Placebo
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral Radiculopathy | Radicular Pain,Epidural Steroid Injections|Gabapentin
4,NCT03150563,Effects of Different Intensities of Passive St...,,none
...,...,...,...,...
85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis | Dementia | Asymptomatic Caroti...,none
86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...","Alzheimer's Disease | functional decline, cogn...",Exelon® | rivastigmine patch
87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...",none,Gadobutrol
88,NCT04184206,Effects of Attention Training Interventions on...,Major Depressive Disorder | symptoms and brain...,


In [42]:
ds_annotated_full_copy = ds_annotated_full.copy()

In [66]:
conditions_col_name = f'{gpt_model}_predictions_conditions'
interventions_col_name = f'{gpt_model}_predictions_interventions'

In [67]:
def clean_data(df, conditions_col, interventions_col):
    """
    Clean and format the 'conditions' and 'interventions' columns in a dataframe.

    Args:
    df (DataFrame): The dataframe containing the data.
    conditions_col (str): The name of the column containing condition data.
    interventions_col (str): The name of the column containing intervention data.

    Returns:
    DataFrame: The dataframe with cleaned and formatted data.
    """

    # Cleaning the conditions column
    df[conditions_col] = df[conditions_col].str.replace(r'\(.*\)', '', regex=True)
    df[conditions_col] = df[conditions_col].str.replace(',','|')
    df[conditions_col] = df[conditions_col].str.replace('investigated disease names:','', case=False)
    df[conditions_col] = df[conditions_col].str.replace('disease names:','', case=False)
    df[conditions_col] = df[conditions_col].str.replace('related symptoms:','|', case=False)

    # Cleaning the interventions column
    intervention_phrases = [
        'The drug names in the given clinical trial are:',
        'The drug names found in the clinical trial are:',
        'The drug names mentioned in the clinical trial are:'
    ]
    for phrase in intervention_phrases:
        df[interventions_col] = df[interventions_col].str.replace(phrase, '')

    return df


In [69]:
cleaned_df = clean_data(ds_annotated_full_copy, conditions_col_name, interventions_col_name)


In [70]:
cleaned_df

Unnamed: 0,nct_id,text,gpt-4_predictions_conditions,gpt-4_predictions_interventions
0,NCT04849741,"A Phase 1-3, Double-Blind, Randomized, Placebo...",Alexander Disease,ION373
1,NCT04576507,Effects of Repeated Cannabis Administration on...,Chronic pain,Cannabis | delta-9-tetrahydrocannabinol (THC) ...
2,NCT00261508,Efficacy And Safety Of Risperidone In The Trea...,Autistic Disorder | Pervasive Developmental Di...,Risperidone|Placebo
3,NCT01495923,"Randomized, Double-blind, Comparative-effectiv...",Lumbosacral Radiculopathy | Radicular Pain,Epidural Steroid Injections|Gabapentin
4,NCT03150563,Effects of Different Intensities of Passive St...,,none
...,...,...,...,...
85,NCT03755362,Treatment of Periodontitis to Prevent Dementia...,Periodontitis | Dementia | Asymptomatic Caroti...,none
86,NCT00506415,"A 48-Week, Multicenter, Randomized, Double-Bli...",Alzheimer's Disease | functional decline| cogn...,Exelon® | rivastigmine patch
87,NCT01344447,"Multicenter, Open-label Study to Evaluate the ...",none,Gadobutrol
88,NCT04184206,Effects of Attention Training Interventions on...,Major Depressive Disorder | symptoms and brain...,


In [50]:
cleaned_df.to_csv(f"../predictions/ct_neuro_test_annotated_{gpt_model}_20240129.csv")