In [1]:
import json
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
import torch
from openai import OpenAI
from sklearn.metrics import pairwise
from sklearn.metrics import accuracy_score, classification_report, balanced_accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Folder holding the precomputed .npy embedding matrices (relative to the notebook).
embeddings_data_path = Path("./data/embeddings/")
# Folder holding the train/test CSV files of the data split used in this notebook.
input_data_path = Path("./data/data_splits_stratified/6-2-2_all_classes_enriched_with_kw")

## Load Embeddings and Calculate Similarities

### Load 
-> Each row represents the text of one sample, embedded into a 768-dimensional vector.

In [3]:
# Resolve both embedding files first, then load them; one matrix per data split.
_train_embeddings_file = (
    embeddings_data_path / "embeddings_microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_train_ds.npy"
)
_test_embeddings_file = (
    embeddings_data_path / "embeddings_microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_test_ds.npy"
)
embeddings_train = np.load(_train_embeddings_file)
embeddings_test = np.load(_test_embeddings_file)

FileNotFoundError: [Errno 2] No such file or directory: 'data/embeddings/embeddings_microsoft_BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext_train_ds.npy'

In [None]:
# Sanity check: dimensions of the loaded train embedding matrix.
embeddings_train.shape

In [None]:
# Sanity check: dimensions of the loaded test embedding matrix.
embeddings_test.shape

### Compute similarity between test dataset elements to all train elements


In [None]:
# Squared Euclidean distance between every test embedding (rows) and every
# train embedding (columns). The neighbour lookup further down uses the cosine
# matrix instead, so this one is only inspected here.
dist_matrix = pairwise_distances(embeddings_test, embeddings_train, metric='sqeuclidean')
dist_matrix.shape

In [None]:
# Cosine similarity between every test embedding (rows) and every train
# embedding (columns). Despite the variable name this is a SIMILARITY matrix:
# larger values mean the two texts are closer.
dist_matrix_cosine = pairwise.cosine_similarity(X=embeddings_test, Y=embeddings_train)
# Show the shape of the matrix computed in this cell (previously the cell
# displayed the shape of the unrelated `dist_matrix`).
dist_matrix_cosine.shape

The matrix has the dimensions 404 (test data elements) x 1191 (train data elements). Each of the 1191 values in a test row is the similarity score between that test element and one of the train elements.

In [None]:
# Display the cosine-similarity matrix (rows: test samples, columns: train samples).
dist_matrix_cosine

### Find closest neighbours from the train dataset to each test example

In [None]:
# `dist_matrix_cosine` holds cosine SIMILARITIES (higher = closer), so the
# nearest train neighbours are the k LARGEST entries of each row. The matrix
# must not be negated here: negating it selects the k least similar samples.
values, indices = torch.topk(torch.from_numpy(dist_matrix_cosine), k=3, dim=-1)

In [None]:
# Sanity check: one row per test sample, one column per retrieved neighbour.
indices.shape

The indices have the dimension 404 (test data elements) x 3 (top k=3 closest train data elements).

In [None]:
# Row positions (into the train embedding matrix) of the retrieved neighbours.
indices

In [None]:
# Load both splits in one pass; the tuple order fixes which frame is which.
df_train, df_test = (
    pd.read_csv(input_data_path / csv_name)
    for csv_name in ('train.csv', 'test.csv')
)


In [None]:
# Inspect the test rows whose gold label is 'In-vitro-study'.
df_test[df_test['accepted_label'] == 'In-vitro-study']

In [None]:
# Spot check of one test sample; 246 is a hard-coded row position.
df_test.iloc[246]

In [None]:
# Train row positions retrieved as neighbours for test row 246.
indices[246]

In [None]:
# NOTE(review): 1032 is hard-coded; presumably it was one of the neighbour
# positions shown for test row 246 in an earlier run. Confirm this, since it
# goes stale whenever the retrieval result changes.
df_train.iloc[1032]

In [None]:
# Number of test samples per gold label.
df_test.groupby('accepted_label').size()

## Init OpenAI API

In [None]:
def load_pass(file_path, key_to_find):
    """Look up a secret stored as a ``KEY=VALUE`` line in a plain-text file.

    Args:
        file_path: Path of the credentials file, one ``KEY=VALUE`` entry per line.
        key_to_find: Key whose value should be returned (exact match).

    Returns:
        The value of the first line whose key matches, or ``None`` when the key
        is absent or its value is empty. A status message is printed either way.
    """
    # Initialised up front so a missing key falls through to the "not found"
    # branch instead of raising UnboundLocalError.
    found_password = None
    with open(file_path, 'r') as file:
        for line in file:
            # Split on the first "=" only, so values that themselves contain
            # "=" (e.g. base64 padding) are returned intact.
            key, separator, value = line.strip().partition("=")
            if separator and key == key_to_find:
                found_password = value
                break
    if found_password:
        print("Found password.")
        return found_password
    print("Password not found for key:", key_to_find)
    return None

In [None]:
# Read the key once from the local credentials file, then hand the same value
# to both the module-level setting and the explicit client instance.
_openai_api_key = load_pass("./credentials.txt", "OPENAI")
openai.api_key = _openai_api_key
client = OpenAI(api_key=_openai_api_key)

## Create Prompts

In [None]:
def _build_model_input(df):
    """Return one tagged ``<journal>…<title>…<abstract>…`` string per row of ``df``.

    Missing fields are replaced by an empty string; otherwise a single NaN in
    any of the three columns would turn the whole concatenated input into NaN.
    """
    journal, title, abstract = (
        df[column].fillna('') for column in ('journal_name', 'title', 'abstract')
    )
    return (
        '<journal>' + journal + '</journal>'
        + '<title>' + title + '</title>'
        + '<abstract>' + abstract + '</abstract>'
    )


# Same construction for both splits, so the prompt examples (train) and the
# texts to classify (test) share one format.
df_train['input_journal_title_abstract'] = _build_model_input(df_train)
df_test['input_journal_title_abstract'] = _build_model_input(df_test)

In [None]:
# Preview only the first rows instead of rendering the whole test frame.
df_test.head()

In [None]:
def create_prompt(df_train, df_test, test_index, example_indices):
    """Build a few-shot classification prompt for one test sample.

    NOTE: a later cell of this notebook redefines ``create_prompt`` with a
    different signature; this version is only valid until that cell has run.

    Args:
        df_train: Frame with the columns 'input_journal_title_abstract' and
            'accepted_label', used as the pool of labelled examples.
        df_test: Frame with the column 'input_journal_title_abstract'.
        test_index: Row POSITION (0-based) of the test sample to classify.
        example_indices: Iterable of row POSITIONS into ``df_train`` to use as
            in-context examples, in the order they should appear.

    Returns:
        The prompt string: task description, the labelled examples, then the
        test text followed by an open "Category: " slot.
    """
    # Start the prompt with the task description
    prompt = "Classify this text, choosing one of these labels: Clinical-study-protocol, Human-systematic-review, Non-systematic-review, Human-RCT-non-drug-intervention, Human-RCT-drug-intervention, Human-RCT-non-intervention, Human-case-report, Human-non-RCT-non-drug-intervention, Human-non-RCT-drug-intervention, Animal-systematic-review, Animal-drug-intervention, Animal-non-drug-intervention, Animal-other, In-vitro-study, Remaining. Respond in json format with the key: gpt_label.\n\n"

    # The indices come from row positions of the embedding matrices, so they
    # are resolved positionally (.iloc) rather than by index label (.loc);
    # the two only coincide for a default RangeIndex.
    train_texts = df_train['input_journal_title_abstract']
    train_labels = df_train['accepted_label']
    for idx in example_indices:
        example_text = train_texts.iloc[idx]
        example_label = train_labels.iloc[idx]
        prompt += f"Text: \"{example_text}\"\nCategory: {example_label}\n\n"

    # Add the test text needing classification
    test_text = df_test['input_journal_title_abstract'].iloc[test_index]
    prompt += f"Text: \"{test_text}\"\nCategory: "

    return prompt

In [None]:
# Build the prompt for the first test sample, using its retrieved neighbours
# as in-context examples.
example_i = 0
example_prompt = create_prompt(df_train, df_test, example_i, indices[example_i].tolist())
# (the prompt itself is not displayed here; the next cell only reports its word count)

In [None]:
# Whitespace-separated word count of the example prompt (a rough size check, not a token count).
len(example_prompt.split())

In [None]:
import time
from tqdm.auto import tqdm

# Default value passed as `temperature` to every chat-completion request made by query_gpt.
DEFAULT_TEMPERATURE = 0

def create_prompt(df_train, example_indices, input_raw_text):
    """Build a few-shot classification prompt for one raw input text.

    NOTE: this definition replaces the earlier four-argument ``create_prompt``
    defined in a previous cell of this notebook.

    Args:
        df_train: Frame with the columns 'input_journal_title_abstract' and
            'accepted_label', used as the pool of labelled examples.
        example_indices: Iterable of row POSITIONS into ``df_train`` to use as
            in-context examples, in the order they should appear.
        input_raw_text: The text to classify.

    Returns:
        The prompt string: task description, the labelled examples, then the
        input text followed by an open "Category: " slot.
    """
    prompt = "Classify this text, choosing one of these labels: Clinical-study-protocol, Human-systematic-review, Non-systematic-review, Human-RCT-non-drug-intervention, Human-RCT-drug-intervention, Human-RCT-non-intervention, Human-case-report, Human-non-RCT-non-drug-intervention, Human-non-RCT-drug-intervention, Animal-systematic-review, Animal-drug-intervention, Animal-non-drug-intervention, Animal-other, In-vitro-study, Remaining. Respond in json format with the key: gpt_label.\n\n"
    # Example indices are row positions (from top-k retrieval or random
    # sampling over range(len(df_train))), so resolve them with .iloc; .loc
    # would only agree for a default RangeIndex.
    train_texts = df_train['input_journal_title_abstract']
    train_labels = df_train['accepted_label']
    for idx in example_indices:
        prompt += f"Text: \"{train_texts.iloc[idx]}\"\nCategory: {train_labels.iloc[idx]}\n\n"
    prompt += f"Text: \"{input_raw_text}\"\nCategory: "
    return prompt

def query_gpt(df_train, input_raw_text, example_indices, gpt_model="gpt-3.5-turbo", temperature=DEFAULT_TEMPERATURE, max_retries=5, retry_delay=3):
    """Classify one text with a chat model, using few-shot examples from ``df_train``.

    Args:
        df_train: Frame the in-context examples are taken from (see ``create_prompt``).
        input_raw_text: The text to classify.
        example_indices: Row positions in ``df_train`` of the in-context examples.
        gpt_model: Name of the chat model to query.
        temperature: Sampling temperature forwarded to the API.
        max_retries: Maximum number of API attempts before giving up.
        retry_delay: Seconds to wait between two attempts.

    Returns:
        The raw message content of the first completion choice (requested as
        a JSON object via ``response_format``).

    Raises:
        RuntimeError: If every attempt failed; the last API error is attached
            as ``__cause__``.
    """
    prompt_text = create_prompt(df_train, example_indices, input_raw_text)
    system_msg = "You are an expert assistant specialized in text classification of PubMed abstracts."

    last_error = None
    for attempt in range(1, max_retries + 1):
        print("Trying to call OpenAI API...")
        try:
            completion = client.chat.completions.create(
                model=gpt_model,
                response_format={"type": "json_object"},
                temperature=temperature,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": prompt_text}
                ]
            )
            return completion.choices[0].message.content
        except Exception as e:
            # Every failure is retried, as before; the error is remembered so
            # the final RuntimeError can report what actually went wrong.
            last_error = e
            print(f"OpenAI API returned an error: {e}")
            # No point in waiting once there is no attempt left.
            if attempt < max_retries:
                time.sleep(retry_delay)

    raise RuntimeError("Max retries reached. Unable to complete the API call.") from last_error

def apply_gpt_with_progress(df_train, test_data_series, example_indices_tensor=None, num_samples=3, use_random=False, model="gpt-3.5-turbo"):
    """Run ``query_gpt`` over every text of a series while showing a progress bar.

    Args:
        df_train: Frame the in-context examples are taken from.
        test_data_series: Sized iterable of texts to classify.
        example_indices_tensor: Per-text example positions; row ``i`` holds the
            ``df_train`` row positions used for text ``i``. Required unless
            ``use_random`` is True. All entries of the row are used.
        num_samples: Number of examples drawn per text when ``use_random`` is
            True (not applied to ``example_indices_tensor`` rows).
        use_random: Draw the examples uniformly at random from ``df_train``
            instead of using ``example_indices_tensor``.
        model: Name of the chat model forwarded to ``query_gpt``.

    Returns:
        List with one raw model response per input text, in input order.

    Raises:
        ValueError: If ``use_random`` is False and no ``example_indices_tensor``
            was given.
    """
    # Fail early with a clear message instead of a TypeError on `None[i]`
    # after the first API round-trip has already been set up.
    if not use_random and example_indices_tensor is None:
        raise ValueError("example_indices_tensor is required when use_random is False.")

    results = []
    progress = tqdm(test_data_series, total=len(test_data_series), desc="Processing dataset")
    for i, text in enumerate(progress):
        if use_random:
            example_indices = random.sample(range(len(df_train)), num_samples)
        else:
            example_indices = example_indices_tensor[i].tolist()
        print("Retrieved in-context learning examples with idx: ", example_indices)
        results.append(query_gpt(df_train, text, example_indices, gpt_model=model))
    return results


In [None]:
# Smoke-test one API call on the example test sample. query_gpt builds the
# prompt itself, so it needs the train frame, the raw text and the example
# positions; passing a ready-made prompt does not match its signature.
query_gpt(
    df_train,
    df_test['input_journal_title_abstract'].iloc[example_i],
    indices[example_i].tolist(),
)

In [None]:
# Retrieval-based run: each test text gets the neighbours stored in `indices`
# as its in-context examples.
df_test['gpt_predictions_in_context'] = apply_gpt_with_progress(
    df_train,
    df_test['input_journal_title_abstract'],
    indices,
)

In [None]:
# Baseline run: in-context examples are drawn at random from the train set.
# Seed the sampler so the same examples are drawn on every re-run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
df_test['gpt_predictions_in_context_random'] = apply_gpt_with_progress(
    df_train,
    df_test['input_journal_title_abstract'],
    use_random=True,
)

## Evaluate

In [None]:
# Evaluate on a copy so the columns added during evaluation do not alter df_test.
df_test_to_eval = df_test.copy()
df_test_to_eval.head()

In [None]:
# Suffixes of the 'gpt_predictions_<id>' columns that the evaluation loop iterates over.
prompt_ids_to_test = ["in_context", "in_context_random"]

In [None]:
# Class names in the order that defines their numerical ids (position == id).
labels = [
    "Human-systematic-review",
    "Human-RCT-drug-intervention",
    "Human-RCT-non-drug-intervention",
    "Human-RCT-non-intervention",
    "Human-case-report",
    "Human-non-RCT-drug-intervention",
    "Human-non-RCT-non-drug-intervention",
    "Animal-systematic-review",
    "Animal-drug-intervention",
    "Animal-non-drug-intervention",
    "Animal-other",
    "Non-systematic-review",
    "In-vitro-study",
    "Clinical-study-protocol",
    "Remaining",
]

# Name -> id lookup; -1 is reserved for a missing label.
label_to_numerical = dict(zip(labels, range(len(labels))))
label_to_numerical["label missing"] = -1

In [None]:
def map_label_to_numerical(label):
    """Translate a predicted label into its numerical id.

    Args:
        label: Either a label string, or a dict mapping label names to
            scores, in which case the highest-scoring name is used.

    Returns:
        The id from ``label_to_numerical``, or -1 for anything that is not a
        known label (unknown string, empty dict, unhashable value, None, ...).
    """
    if isinstance(label, dict):
        # An empty dict carries no prediction; max() would raise on it.
        if not label:
            return -1
        # Keep only the label with the highest score/probability
        label = max(label, key=label.get)
    try:
        return label_to_numerical.get(label, -1)
    except TypeError:
        # Unhashable predictions (e.g. a list returned by the model) cannot
        # be looked up and therefore cannot be a known label.
        return -1
        
# Convert accepted labels to numerical
df_test_to_eval['accepted_label_numerical'] = df_test_to_eval['accepted_label'].apply(lambda x: label_to_numerical.get(x, -1))

# Tags used in the plot title and the output file name. Both names were
# referenced but never defined in the notebook.
model = "gpt-3.5-turbo"  # default model of apply_gpt_with_progress, used for both runs above
eval_type = "all_classes"  # NOTE(review): placeholder tag for the file name; confirm the intended value

# Ids shown on the confusion-matrix axes: every known class plus -1
# ("label missing"), so the axes agree with the legend drawn next to the plot.
cm_label_ids = sorted(set(label_to_numerical.values()))

# Make sure the output folder for the PDF files exists.
Path('plots').mkdir(parents=True, exist_ok=True)


def _extract_gpt_label(raw_response):
    """Return the 'gpt_label' value of a JSON response, or the input unchanged.

    Non-string values, invalid JSON and JSON without a 'gpt_label' key are
    passed through as-is; they are mapped to -1 by map_label_to_numerical.
    """
    if not isinstance(raw_response, str):
        return raw_response
    try:
        parsed = json.loads(raw_response)
    except json.JSONDecodeError:
        return raw_response
    if isinstance(parsed, dict) and 'gpt_label' in parsed:
        return parsed['gpt_label']
    return raw_response


# Initialize a list to hold DataFrame for each report and summary statistics
report_dfs = []
summary_stats = []

# Iterate over each GPT prediction column
for prompt_id in prompt_ids_to_test:
    print("Evaluating ", prompt_id)
    prediction_col = f'gpt_predictions_{prompt_id}_clean'

    # Parse each raw response once and keep only the predicted label
    df_test_to_eval[prediction_col] = df_test_to_eval[f'gpt_predictions_{prompt_id}'].apply(_extract_gpt_label)

    # Map GPT predictions to numerical values
    df_test_to_eval[f'{prediction_col}_numerical'] = df_test_to_eval[prediction_col].apply(map_label_to_numerical)

    # Extract arrays for evaluation
    y_true = df_test_to_eval['accepted_label_numerical'].values
    y_pred = df_test_to_eval[f'{prediction_col}_numerical'].values

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    accuracy_balanced = balanced_accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0, labels=list(range(len(labels))), target_names=labels)
    cm = confusion_matrix(y_true, y_pred, labels=cm_label_ids)

    # Create DataFrame from report
    report_df = pd.DataFrame(report).transpose()
    report_df['Prompt ID'] = prompt_id  # Add column to indicate the prompt ID
    report_dfs.append(report_df)

    # Extract summary statistics (average precision, recall, F1) and add the
    # two accuracy figures, which were computed but never reported before
    summary = report_df.loc['weighted avg', ['precision', 'recall', 'f1-score']].to_dict()
    summary['accuracy'] = accuracy
    summary['balanced_accuracy'] = accuracy_balanced
    summary['Prompt ID'] = prompt_id
    summary_stats.append(summary)

    # Plotting confusion matrix
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=cm_label_ids, yticklabels=cm_label_ids, ax=ax)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=13)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0, fontsize=13)
    ax.set_title(f'Confusion Matrix for Model {model} and ICL-Prompt {prompt_id}', fontsize=14)
    ax.set_xlabel('Predicted Labels', fontsize=13)
    ax.set_ylabel('True Labels', fontsize=13)

    # Add an inset with label mapping
    textstr = '\n'.join([f'{v}: {k}' for k, v in label_to_numerical.items()])
    props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    ax.text(1.16, 1.0, textstr, transform=ax.transAxes, fontsize=10, verticalalignment='top', bbox=props)
    fig.tight_layout()
    # bbox_inches='tight' keeps the legend box, which sits outside the axes, inside the saved page
    fig.savefig(f'plots/confusion_matrix_{model}_enriched_kw_test_{prompt_id}_{eval_type}_ICL.pdf', bbox_inches='tight')  # Save to PDF
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

# Combine all report DataFrames (once, after every prompt id has been evaluated)
all_reports_df = pd.concat(report_dfs)

# Create a summary table for average precision, recall, and F1-score
summary_df = pd.DataFrame(summary_stats)

In [None]:
# Per-class classification reports of all evaluated prompt ids, stacked into one frame.
all_reports_df

In [None]:
# One row of weighted-average metrics per prompt id.
summary_df

In [None]:
# Plain-text rendering of the same summary table (e.g. for copying into a report).
print(summary_df)