In [1]:
import pandas as pd
import openai
import json
import time

In [2]:
from openai import OpenAI

In [3]:
# Show the version of the installed openai package (recorded in the cell output)
openai.__version__

'1.1.1'

### Read in data

In [4]:
# Collect one record per reviewed annotation; the DataFrame is built once at the end
data_list = []

# Read the JSONL file line by line.
# The encoding is explicit so that decoding does not depend on the platform default.
with open('../data/prodigy/annotated_output/test_merged.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            # json.loads("") raises, so blank / trailing lines are skipped
            continue

        # Parse each line as a JSON object
        data = json.loads(line)

        # Keep only records whose _view_id is "review"
        if data.get("_view_id") != "review":
            continue

        # The text field holds journal name, title and abstract separated by "^\n".
        # Split on the first two separators only, so the abstract keeps any later ones.
        # `or ""` also covers records where "text" is present but null.
        parts = [part.strip() for part in (data.get("text") or "").split("^\n", 2)]
        # Pad with empty strings when fewer than three parts are present
        parts += [""] * (3 - len(parts))
        journal_name, title, abstract = parts

        # Append to the list together with the other required fields
        data_list.append({
            "pmid": data.get("pmid", ""),
            "journal_name": journal_name,
            "title": title,
            "abstract": abstract,
            "accepted_label": data.get("accept", [])
        })

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data_list)

In [5]:
# Display the DataFrame built from the reviewed annotations
df

Unnamed: 0,pmid,journal_name,title,abstract,accepted_label
0,37550718,Trials,Can dexamethasone improve postoperative sleep ...,Perioperative sleep disorders (PSD) are an ind...,[Animal-other]
1,2500373,Developmental medicine and child neurology,Effects of puberty on seizure frequency.,"Seizure frequency was documented before, durin...",[Remaining]
2,36189588,Journal of Alzheimer's disease : JAD,Characterization of Mild Cognitive Impairment ...,"Despite tremendous advancements in the field, ...",[Human-systematic-review]


### Load key for the OpenAI API 

In [6]:
def load_pass(file_path, key_to_find):
    """Look up a secret in a plain-text ``KEY=value`` credentials file.

    The file is scanned line by line and the value of the first line whose key
    equals ``key_to_find`` is returned. Only the first ``=`` on a line separates
    key and value, so values that themselves contain ``=`` are returned whole.
    Whitespace around the key and around the value is ignored.

    Args:
        file_path: Path of the credentials file to read.
        key_to_find: Key whose value should be returned (e.g. ``"OPENAI"``).

    Returns:
        The value as a string, or ``None`` when the key is missing or its value
        is empty. A short status message is printed in both cases.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Initialised up front: without this, a missing key raised UnboundLocalError
    # below instead of reaching the "not found" branch.
    found_password = None
    with open(file_path, 'r') as file:
        for line in file:
            key, separator, value = line.strip().partition("=")
            if separator and key.strip() == key_to_find:
                found_password = value.strip()
                break
    if found_password:
        print("Found password.")
        return found_password
    print("Password not found for key:", key_to_find)
    return None

Note: You need to create a `credentials.txt` file with the following content:  
OPENAI=sk-77QXXXXXXXXXXXXXXXXXXXXXXXXXXX  
Replace the value after the `=` sign with your API key.  
Make sure `credentials.txt` is added to `.gitignore`; you don't want to commit your API key to Git!

In [7]:
# Read the value stored under the OPENAI key in credentials.txt and set it as the module-level API key
openai.api_key = load_pass("credentials.txt", "OPENAI")

Found password.


In [8]:
# Create the client object that query_gpt uses for its API calls, passing the loaded key explicitly
client = OpenAI(api_key=openai.api_key)

### Query GPT models

To change the task the model is solving, change the prompt text and the `content` of the system-role message.  
To change the GPT model used, change the `model` argument passed to `client.chat.completions.create`.  
The function takes `input_raw_text` as input: the text on which to perform information extraction or classification.

In [9]:
def query_gpt(input_raw_text, task, max_retries=5, retry_delay=2):
    """Send ``input_raw_text`` to the chat-completions API for the given task.

    Args:
        input_raw_text: Text appended to the task prompt and sent as the user message.
        task: Name of the task to run; only ``"classify"`` is implemented.
        max_retries: Maximum number of API attempts before giving up.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The content of the first completion choice, or the string
        ``"Task not known."`` when ``task`` is not recognised.

    Raises:
        RuntimeError: If every one of the ``max_retries`` attempts failed. The
            last API exception is attached as ``__cause__``.
    """
    if task == "classify":
        prompt = "Classify this text to one of this labels Clinical-study-protocol, Human-systematic-review, Non-systematic-review, Human-RCT-non-drug-intervention, Human-RCT-drug-intervention, Human-RCT-non-intervention, Human-case-report, Human-non-RCT-non-drug-intervention, Human-non-RCT-drug-intervention, Animal-systematic-review, Animal-drug-intervention, Animal-non-drug-intervention, Animal-other, In-vitro-study, Remaining: "
    else:
        # Reject unknown tasks before pausing, so no time is spent on them
        return "Task not known."

    # Fixed 3-second pause before the request -> to not overload the API if multiple calls
    time.sleep(3)

    last_error = None
    # A bounded loop: the previous `while` never incremented its counter, so a
    # persistent API error retried forever and the RuntimeError was unreachable.
    for attempt in range(1, max_retries + 1):
        try:
            completion = client.chat.completions.create(
                model="gpt-3.5-turbo", # CHANGE this to the desired model name, see https://platform.openai.com/docs/models
                temperature=0.6,
                max_tokens=2000,
                messages=[
                    {"role": "system", "content": "You are an expert information extraction assistant from clinical trials."},
                    {"role": "user",
                     "content": prompt + input_raw_text}
                ]
            )
            return completion.choices[0].message.content

        except Exception as e:
            # Log the failure and retry; wait only if another attempt will follow
            last_error = e
            print(f"OpenAI API returned an API Error: {e}")
            if attempt < max_retries:
                time.sleep(retry_delay)

    raise RuntimeError("Max retries reached. Unable to complete the API call.") from last_error


In [18]:
# Show the abstract of the row labelled 0 with a single label-based lookup
df.loc[0, 'abstract']

"Perioperative sleep disorders (PSD) are an independent risk factor for postoperative delirium (POD), which is a common complication after surgery. Elderly patients who undergo robot-assisted radical prostatectomy (RARP) often experience perioperative sleep disorders (PSD). Dexamethasone, a medication that works by inhibiting the hypothalamic-pituitary-suprarenal cortical axis, can reduce the negative effects of surgical stress. The objective of this study was to determine whether intravenous administration of dexamethasone at the time of anesthesia induction could improve postoperative sleep quality in elderly patients, thereby indirectly reducing the risk of postoperative cognitive impairment and accelerating postoperative rehabilitation.\tThis study is a randomized, double-blind, placebo-controlled trial that was conducted at a single center. A sample size of 116 patients was determined through calculation, and these patients were randomly assigned to either the dexamethasone group 

In [11]:
# Use the abstract of the row labelled 0 as the example input
example_input = df.loc[0, 'abstract']

In [12]:
# Classify the single example abstract; the returned label is shown as the cell output
query_gpt(input_raw_text=example_input, task="classify")

'Human-RCT-drug-intervention'

### Run the GPT task over the whole dataset

In [13]:
#ds_annotated_full['gpt_predictions_interventions'] = ds_annotated_full['text'].apply(lambda text: query_gpt(text, "extract_interventions"))
from tqdm import tqdm  # Import tqdm for the progress bar

# Define a function to apply GPT queries with a progress bar
def apply_gpt_with_progress(data_series, task):
    """Run ``query_gpt`` on every item of ``data_series`` while showing a progress bar.

    Args:
        data_series: Sized iterable of input texts (e.g. a DataFrame column).
        task: Task name forwarded unchanged to ``query_gpt``.

    Returns:
        A list with one ``query_gpt`` result per input item, in input order.
    """
    # Wrapping the iterable lets tqdm advance the bar by one per processed item
    progress = tqdm(data_series, total=len(data_series), desc=f"Processing {task}")
    return [query_gpt(item, task) for item in progress]

# Classify every abstract and store the returned labels in the new gpt_predictions column.
# Note: this adds the column to df in place.
df['gpt_predictions'] = apply_gpt_with_progress(df['abstract'], "classify")

Processing classify: 100%|█████████████████| 3/3 [00:12<00:00,  4.04s/it]


In [14]:
# Display the DataFrame again to compare accepted_label with gpt_predictions
df

Unnamed: 0,pmid,journal_name,title,abstract,accepted_label,gpt_predictions
0,37550718,Trials,Can dexamethasone improve postoperative sleep ...,Perioperative sleep disorders (PSD) are an ind...,[Animal-other],Clinical-study-protocol
1,2500373,Developmental medicine and child neurology,Effects of puberty on seizure frequency.,"Seizure frequency was documented before, durin...",[Remaining],Human-case-report
2,36189588,Journal of Alzheimer's disease : JAD,Characterization of Mild Cognitive Impairment ...,"Despite tremendous advancements in the field, ...",[Human-systematic-review],Human-case-report


In [15]:
# Keep a copy of df, including the predictions column, under a separate name
ds_annotated_full_copy = df.copy()