In [1]:
import pandas as pd
import openai
import json
import time
import matplotlib.pyplot as plt
from tqdm import tqdm  # Import tqdm for the progress bar


In [2]:
#pip install openai

In [3]:
from openai import OpenAI

In [4]:
openai.__version__


'1.1.1'

### Read in data

In [5]:
# Location of the annotated abstracts: a JSONL file, i.e. one JSON record per line.
ANNOTATIONS_PATH = '../data/prodigy/annotated_output/pilot_500_pubmed_abstracts_shirin_correct_id.jsonl'

# Columns of the resulting DataFrame (also used when no record is kept, so that
# the frame still has the expected columns).
RECORD_COLUMNS = ["pmid", "journal_name", "title", "abstract", "accepted_label"]

# Collect one dict per kept record; the DataFrame is built once at the end.
data_list = []

# Read the JSONL file line by line. The encoding is pinned to UTF-8 so that
# parsing does not depend on the platform's default encoding.
with open(ANNOTATIONS_PATH, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line): json.loads("") would
        # otherwise raise a JSONDecodeError and abort the whole load.
        if not line:
            continue

        # Parse the line as a JSON object
        data = json.loads(line)

        # Keep only records whose _view_id is "choice". This should be "review"
        # if the file came from a Prodigy review session instead.
        if data.get("_view_id") != "choice":
            continue

        # The text field holds journal name, title and abstract joined by the
        # two-character separator "^\n". Split on the first two occurrences
        # only, so any later occurrence stays inside the abstract.
        parts = data.get("text", "").split("^\n", 2)
        # Pad to three parts so that missing pieces become empty strings.
        parts += [""] * (3 - len(parts))
        journal_name, title, abstract = (part.strip() for part in parts)

        data_list.append({
            "pmid": data.get("pmid", ""),
            "journal_name": journal_name,
            "title": title,
            "abstract": abstract,
            "accepted_label": data.get("accept", []),
        })

# Convert the list of dictionaries to a pandas DataFrame
df = pd.DataFrame(data_list, columns=RECORD_COLUMNS)

In [6]:
df.head(5)

Unnamed: 0,pmid,journal_name,title,abstract,accepted_label
0,37550718,Trials,Can dexamethasone improve postoperative sleep ...,Perioperative sleep disorders (PSD) are an ind...,[Human-RCT-drug-intervention]
1,2500373,Developmental medicine and child neurology,Effects of puberty on seizure frequency.,"Seizure frequency was documented before, durin...",[Remaining]
2,36189588,Journal of Alzheimer's disease : JAD,Characterization of Mild Cognitive Impairment ...,"Despite tremendous advancements in the field, ...",[Remaining]
3,36314672,Journal of vector ecology : journal of the Soc...,Effects of woody plant encroachment by eastern...,Woody plant encroachment into grasslands is oc...,[Remaining]
4,29172241,Depression and anxiety,The impact of resilience and subsequent stress...,There remains a dearth of research examining t...,[Remaining]


### Load key for the OpenAI API 

In [7]:
def load_pass(file_path, key_to_find):
    """Look up a secret stored as a ``KEY=value`` line in a plain-text file.

    Parameters
    ----------
    file_path : str or path-like
        Text file containing one ``KEY=value`` entry per line.
    key_to_find : str
        Key whose value should be returned.

    Returns
    -------
    str or None
        The value of the first line whose key equals ``key_to_find``, or
        ``None`` when there is no such line or its value is empty.
    """
    # Initialise up front: previously a missing key left this name unbound, so
    # the check below raised UnboundLocalError instead of reporting the problem.
    found_password = None
    with open(file_path, 'r') as file:
        for line in file:
            # Split on the first "=" only, so that values which themselves
            # contain "=" are returned intact.
            key, separator, value = line.strip().partition("=")
            if separator and key.strip() == key_to_find:
                found_password = value.strip()
                break
    if found_password:
        print("Found password.")
        return found_password
    print("Password not found for key:", key_to_find)
    return None

Note: You need to create a credentials.txt file with the following content:  
OPENAI=sk-77QXXXXXXXXXXXXXXXXXXXXXXXXXXX  
replace the value after the = sign with your API key.  
Make sure the credentials.txt is added to .gitignore, you don't want to put your password on Git!

In [9]:
# Read the value stored under the "OPENAI" key of the local credentials file
openai.api_key = load_pass("./credentials.txt", "OPENAI")


Found password.


In [10]:
# Client object through which query_gpt sends its chat-completion requests
client = OpenAI(api_key=openai.api_key)

### Query GPT models

To change the task the model is solving, change the prompt text and the content of the system message.  
To change the GPT model, pass a different model name through the `gpt_model` argument of `query_gpt`.  
The function takes `input_raw_text`, the text to classify (or to extract information from), and appends it to the prompt.

In [11]:
import time

# Defaults for the chat-completion requests issued by query_gpt.
DEFAULT_TEMPERATURE = 0
DEFAULT_MAX_TOKENS = 500  # NOTE(review): defined but not sent; the request passes no max_tokens
DEFAULT_MODEL = "gpt-3.5-turbo"

def query_gpt(input_raw_text, prompt_text, gpt_model=DEFAULT_MODEL, temperature=DEFAULT_TEMPERATURE, max_retries=5, retry_delay=3):
    """Send one prompt plus input text to the chat-completions API and return the reply.

    The user message is ``prompt_text + input_raw_text`` (plain concatenation,
    no separator is inserted), preceded by a fixed system message. The request
    asks for a JSON-object response.

    Parameters
    ----------
    input_raw_text : str
        Text to classify; appended directly after ``prompt_text``.
    prompt_text : str
        Instruction text placed before the input text.
    gpt_model : str
        Model name, see https://platform.openai.com/docs/models
        (e.g. gpt-3.5-turbo or gpt-4-turbo-preview).
    temperature : float
        Sampling temperature passed to the API.
    max_retries : int
        Maximum number of attempts before giving up.
    retry_delay : float
        Seconds to wait between two attempts.

    Returns
    -------
    str
        Content of the first choice's message, as returned by the API (not parsed).

    Raises
    ------
    RuntimeError
        If every attempt failed; the last underlying error is attached as the cause.
    """
    # Same text as before, written as a plain literal (the f-string had no placeholders).
    system_msg = "\n    You are an assistant that classifies PubMed abstracts. \n    "

    last_error = None
    for attempt in range(1, max_retries + 1):
        print("Trying to call OpenAI API...")
        try:
            completion = client.chat.completions.create(
                model=gpt_model,
                # JSON mode: per the API reference the messages must themselves
                # ask for JSON output, which the caller's prompt has to do.
                response_format={"type": "json_object"},
                temperature=temperature,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt_text + input_raw_text}
                ]
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Any failure is logged and retried (deliberately broad, as before).
            print(f"OpenAI API returned an error: {e}")
            last_error = e
            # Only wait when another attempt follows; sleeping after the final
            # failure just delayed the error.
            if attempt < max_retries:
                time.sleep(retry_delay)

    # Chain the last error so the traceback shows why the calls failed.
    raise RuntimeError("Max retries reached. Unable to complete the API call.") from last_error


In [12]:
df['abstract'][0]

"Perioperative sleep disorders (PSD) are an independent risk factor for postoperative delirium (POD), which is a common complication after surgery. Elderly patients who undergo robot-assisted radical prostatectomy (RARP) often experience perioperative sleep disorders (PSD). Dexamethasone, a medication that works by inhibiting the hypothalamic-pituitary-suprarenal cortical axis, can reduce the negative effects of surgical stress. The objective of this study was to determine whether intravenous administration of dexamethasone at the time of anesthesia induction could improve postoperative sleep quality in elderly patients, thereby indirectly reducing the risk of postoperative cognitive impairment and accelerating postoperative rehabilitation.\tThis study is a randomized, double-blind, placebo-controlled trial that was conducted at a single center. A sample size of 116 patients was determined through calculation, and these patients were randomly assigned to either the dexamethasone group 

## Single prompt example

In [13]:
# Instruction placed in front of each abstract. It is built from adjacent string
# literals only: a backslash continuation inside a literal pulls the next
# line's source indentation into the prompt text.
prompt = (
    "Classify this text, choosing one of these labels: "
    "Clinical-study-protocol, Human-systematic-review, Non-systematic-review, "
    "Human-RCT-non-drug-intervention, Human-RCT-drug-intervention, Human-RCT-non-intervention, "
    "Human-case-report, Human-non-RCT-non-drug-intervention, Human-non-RCT-drug-intervention, "
    "Animal-systematic-review, Animal-drug-intervention, Animal-non-drug-intervention, "
    "Animal-other, In-vitro-study, Remaining. "
    "Respond in json format with the keys: gpt_label and gpt_explanation. "
    "The value for gpt_explanation should be a very short explanation for why gpt chose the label."
    # query_gpt concatenates the input text directly after the prompt, so end
    # with an explicit delimiter instead of letting the two run together.
    "\n\nText: "
)

In [14]:
example_input = df['abstract'][0]
query_gpt(example_input, prompt)

Trying to call OpenAI API...


'{\n    "gpt_label": "Human-RCT-drug-intervention",\n    "gpt_explanation": "The text describes a randomized, double-blind, placebo-controlled trial investigating the impact of intravenous dexamethasone on sleep quality in elderly patients undergoing robot-assisted radical prostatectomy."\n}'

### Random data sample

In [15]:
# Sample 10 elements randomly in a reproducible way
sampled_df = df.sample(n=10, random_state=1)

In [16]:
#shows first few lines of sampled_df
sampled_df.head()

Unnamed: 0,pmid,journal_name,title,abstract,accepted_label
304,12905582,Fa yi xue za zhi,[The research of the heroin and its metabolite...,Heroin can be metabolized easily in body and t...,[Remaining]
340,28645717,Vaccine,Surveillance of pneumococcal colonization and ...,Following the introduction of pneumococcal con...,[Human-non-RCT-drug-intervention]
47,11482695,Acta neurochirurgica,Carotid endarterectomy: a new technique replac...,Carotid endarterectomy has been reported to in...,[Human-non-RCT-non-drug-intervention]
67,15065953,Journal of consulting and clinical psychology,Traditional versus integrative behavioral coup...,A randomized clinical trial compared the effec...,[Human-RCT-non-drug-intervention]
479,9578881,The Journal of laryngology and otology,Vocal fold abductor paralysis as a solitary an...,A patient is presented who had bilateral abduc...,[Remaining]


In [17]:
# Show the entire sampled_df as plain text
print(sampled_df)

         pmid                                       journal_name  \
304  12905582                                   Fa yi xue za zhi   
340  28645717                                            Vaccine   
47   11482695                               Acta neurochirurgica   
67   15065953      Journal of consulting and clinical psychology   
479   9578881             The Journal of laryngology and otology   
485   2115207  Scandinavian journal of infectious diseases. S...   
310   2946351  The British journal of psychiatry : the journa...   
31   12045804                           Revista de saude publica   
249  18084303              Nature structural & molecular biology   
90   16332401                              Neurobiology of aging   

                                                 title  \
304  [The research of the heroin and its metabolite...   
340  Surveillance of pneumococcal colonization and ...   
47   Carotid endarterectomy: a new technique replac...   
67   Traditional ve

In [None]:
# Explode the 'accepted_label' column so that each label gets its own row
df_exploded = sampled_df.explode('accepted_label')

# Count the labels on the exploded frame. The un-exploded column holds lists,
# which are unhashable and therefore cannot be counted by value_counts.
label_counts = df_exploded['accepted_label'].value_counts()

# Plot the distribution as horizontal bars: counts run along the x axis and the
# label names along the y axis, so the axis titles are assigned accordingly.
fig, ax = plt.subplots()
label_counts.plot(kind='barh', ax=ax)
ax.set_title('Distribution of Accepted Labels')
ax.set_xlabel('Frequency')
ax.set_ylabel('Accepted Label')
plt.show()

In [None]:
# Run query_gpt over a collection of texts while showing a progress bar
def apply_gpt_with_progress(data_series, prompt_text, model="gpt-3.5-turbo"):
    """Query the model once per item of ``data_series`` and collect the replies.

    Parameters
    ----------
    data_series : sized iterable of str
        Texts to send, one request per item.
    prompt_text : str
        Prompt forwarded unchanged to ``query_gpt`` for every item.
    model : str
        Model name forwarded to ``query_gpt``.

    Returns
    -------
    list
        One ``query_gpt`` result per input item, in iteration order.
    """
    predictions = []
    progress = tqdm(total=len(data_series), desc="Processing dataset")
    with progress:
        for abstract_text in data_series:
            predictions.append(query_gpt(abstract_text, prompt_text, model))
            # Advance the bar by one finished item
            progress.update(1)

    return predictions

### Run GPT

In [None]:
sampled_df['gpt_predictions'] = apply_gpt_with_progress(sampled_df['abstract'], prompt)

In [None]:
# Label vocabulary; a label's position in this list is its integer code.
labels = [
    "Human-systematic-review",
    "Human-RCT-drug-intervention",
    "Human-RCT-non-drug-intervention",
    "Human-RCT-non-intervention",
    "Human-case-report",
    "Human-non-RCT-drug-intervention",
    "Human-non-RCT-non-drug-intervention",
    "Animal-systematic-review",
    "Animal-drug-intervention",
    "Animal-non-drug-intervention",
    "Animal-other",
    "Non-systematic-review",
    "In-vitro-study",
    "Clinical-study-protocol",
    "Remaining",
]

# Look-up from label name to integer code; "label missing" is coded as -1.
label_to_numerical = dict(zip(labels, range(len(labels))))
label_to_numerical["label missing"] = -1

In [None]:
label_to_numerical

In [None]:
def _extract_gpt_label(raw_prediction):
    """Return the "gpt_label" value of a raw JSON reply, or "label missing".

    "label missing" is the placeholder that label_to_numerical maps to -1, so a
    reply that is not valid JSON, is not a JSON object, or lacks the key is
    scored as a miss instead of aborting the whole evaluation.
    """
    try:
        parsed = json.loads(raw_prediction)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError
        return "label missing"
    if not isinstance(parsed, dict):
        return "label missing"
    return parsed.get("gpt_label", "label missing")

# One row per accepted label, with the predicted label parsed from the raw reply
df_exploded = sampled_df.explode('accepted_label')
df_exploded['gpt_label'] = df_exploded['gpt_predictions'].apply(_extract_gpt_label)

df_exploded.head(3)

### Evaluate performance

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import balanced_accuracy_score


In [None]:
# Encode both label columns as integers; any label absent from the mapping becomes -1
def _to_numerical(label):
    """Integer code of ``label`` according to label_to_numerical, -1 if unknown."""
    return label_to_numerical.get(label, -1)

for source_col, target_col in (
    ('accepted_label', 'accepted_label_numerical'),
    ('gpt_label', 'gpt_predictions_numerical'),
):
    df_exploded[target_col] = df_exploded[source_col].apply(_to_numerical)

In [None]:
df_exploded.head()

In [None]:
len(labels)

In [None]:
# Extract arrays for evaluation
y_true = df_exploded['accepted_label_numerical'].values
y_pred = df_exploded['gpt_predictions_numerical'].values

# Evaluation
accuracy = accuracy_score(y_true, y_pred)
accuracy_balanced = balanced_accuracy_score(y_true, y_pred)

# Restrict the classification report to the codes that actually occur
unique_labels = sorted(set(y_true) | set(y_pred))
# Code -1 (missing / unrecognised label) has no entry in `labels`: indexing
# labels[-1] would silently name it after the last label ("Remaining") and
# duplicate that name when the last label occurs as well.
target_names_adjusted = [labels[i] if i >= 0 else "label missing" for i in unique_labels]

# Recalculate classification report with adjusted target names
report_adjusted = classification_report(y_true, y_pred, output_dict=True, labels=unique_labels, target_names=target_names_adjusted, zero_division=0)

# Note: accuracy is the proportion of correctly predicted observations out of
# all observations. It is most useful when the classes are balanced and can be
# misleading with imbalanced classes, hence the balanced accuracy next to it.
accuracy, accuracy_balanced

In [None]:
# Tabulate the report for display: one row per report entry, one column per metric
report_df = pd.DataFrame(report_adjusted).T

report_df

In [None]:
# note: The support is the number of occurrences of each class in y_true.
report_adjusted