# NER using GPT-3.5

### Project name: Honos
Date: 24th May 2024

Author: Milindi Kodikara | Supervisor: Professor Karin Verspoor


Before running this notebook:
1. [Install Jupyter notebook](https://jupyter.org/install) 


2. [Setting up Azure OpenAI model](https://learn.microsoft.com/en-us/azure/ai-services/openai/how-to/working-with-models?tabs=powershell#model-updates)


3. [Setting up connection to GPT-3.5 using Azure OpenAI service](https://learn.microsoft.com/en-us/azure/ai-services/openai/quickstart?tabs=command-line%2Cpython-new&pivots=programming-language-python)
        - In the Environment variables section, instead of following the steps outlined in the link, add `API-KEY`, `API-VERSION`, `ENDPOINT` and `DEPLOYMENT-NAME` to a `.env` file in the root folder.
        
4. Add the correct file path for `data` in Step 1 and the file path of the gold annotated data passed to the `bratify()` function in Step 4. 


In [None]:
import pandas as pd
import re

import os
from openai import AzureOpenAI

from dotenv import load_dotenv
load_dotenv() 


### Step 1: Load and pre-process data and prompt library 


#### Step 1.1: Load datasets

In [None]:
# Input file: train_text.tsv
# Tab-separated columns: pmid\tfilename\ttext (the first line is the header row)

# TODO: Replace filepath for related data file
data = pd.read_csv("./genovardis_train_dev/train_text.tsv", sep='\t', header=0)

# Preview the first five rows.
data.head(5)

In [None]:
len(data)

In [None]:
# TODO: remove this after testing
# data = data.head(2)

# data

In [None]:
# clean up text by removing the appended pmid and title abstract tags at the start of each section

# Matches "<digits>|t|<title>\n<digits>|a|<abstract>" and captures the title and the
# abstract. Written as a raw string so the regex escapes reach `re` untouched.
pattern = r'(?:[\d]{1,10}\|t\|)(?P<title>[\w\W]+)(?:\n[\d]{1,20}\|a\|)(?P<abstract>[\w\W]+)'


def clean_text(text):
    """Strip the `<pmid>|t|` and `<pmid>|a|` tags from a title/abstract text.

    Args:
        text: Raw text in the form "<pmid>|t|<title>\\n<pmid>|a|<abstract>".

    Returns:
        "<title>\\n<abstract>" when the tags are found; otherwise `text` unchanged,
        so text without tags (or text that was already cleaned, e.g. when this
        cell is re-run) does not raise.
    """
    matches = re.search(pattern, text)
    if matches is None:
        return text
    return f'{matches.group("title")}\n{matches.group("abstract")}'

# Replace every entry of the 'text' column with its cleaned version.
data['text'] = [clean_text(text) for text in data['text']]


data

In [None]:
len(data)


#### Step 1.2: Load prompt library

Prompt id structure:
`p_<index>_<task>_<language>_<output>`

TODO: Figure out `<guideline>_<paradigm>`

In [None]:
prompt_library = pd.read_json('prompts.json')

prompt_library


#### Step 1.3: Create data+prompt dataset

In [None]:
# TODO: Buff up the prompts with guidelines and examples (shots)
# pmid prompt_id embedded_prompt
def embed_data_in_prompts(row_data):
    """Build one prompt per prompt-library entry for a single data row.

    Args:
        row_data: A row of `data`; its 'pmid' and 'text' fields are read.

    Returns:
        A dict {'pmid': ..., 'prompts': [...]} where each element of 'prompts' is
        {'prompt_id': ..., 'prompt': ...}. The prompt is the library entry's
        instruction, a newline, and the entry's text template (formatted with the
        row's text) wrapped in double quotes.
    """
    pmid = row_data['pmid']
    document_text = row_data['text']

    def build_prompt(library_row):
        # Fill the row's text into the template, then prepend the instruction.
        instruction = library_row['instruction']
        filled_template = library_row['text'].format(document_text)
        # TODO: Figure out the new line characters
        return {
            'prompt_id': library_row['prompt_id'],
            'prompt': '{}\n"{}"'.format(instruction, filled_template),
        }

    return {
        'pmid': pmid,
        'prompts': [build_prompt(library_row) for _, library_row in prompt_library.iterrows()],
    }


In [None]:

embedded_prompt_data_list = [embed_data_in_prompts(row_data) for index, row_data in data.iterrows()]

In [None]:
embedded_prompt_data_list[0]


### Step 2: Setting up GPT-3.5

In [None]:

# Build the Azure OpenAI client from environment variables (load_dotenv() is
# called at the top of the notebook). A missing variable raises KeyError.
client = AzureOpenAI(
    api_key=os.environ["API-KEY"],  
    api_version=os.environ["API-VERSION"],
    azure_endpoint=os.environ["ENDPOINT"]
    )
    
# Passed as `model` in the chat completion calls below.
deployment_name=os.environ["DEPLOYMENT-NAME"]


In [None]:
# Testing the connection: send a single user message and print the reply text
test_response = client.chat.completions.create(model=deployment_name, messages=[{"role": "user", "content": "Hello, World!"}])
print(test_response.choices[0].message.content)

In [None]:
# TODO: Ask Karin whether we should run again and again to see what gpt generates - yes! later!
results_list = []
def generate_results(prompt_items):
    """Send every prompt of one document to the model and record the replies.

    Args:
        prompt_items: A dict with 'pmid' and 'prompts', where 'prompts' is a list
            of {'prompt_id': ..., 'prompt': ...} dicts.

    Returns:
        The module-level `results_list`, extended with one
        {'pmid', 'prompt_id', 'result'} dict per prompt.
    """
    pmid = prompt_items['pmid']

    for item in prompt_items['prompts']:
        prompt_id, prompt = item['prompt_id'], item['prompt']

        # TODO: Look into hyper params like temp
        completion = client.chat.completions.create(
            model=deployment_name,
            messages=[{"role": "user", "content": prompt}],
        )
        response_result = completion.choices[0].message.content

        results_list.append({'pmid': pmid, 'prompt_id': prompt_id, 'result': response_result})

        # Echo each exchange so progress is visible while the cell runs.
        print(f'Prompt:\n{prompt}\n\nResponse:\n{response_result} \n----------\n')

    return results_list
    

In [None]:
# Query the model for every document; the replies accumulate in results_list.
for embedded_prompt_data in embedded_prompt_data_list:
    generate_results(embedded_prompt_data)

In [None]:
results_list

In [None]:
len(results_list)

### Step 3: Post-processing

In [None]:
# create an empty df that will hold one row per extracted entity
# columns = pmid, prompt_id, filename, label, offset_checked, offset1, offset2, span
extracted_entity_results = pd.DataFrame(columns=['pmid','prompt_id','filename','label', 'offset_checked', 'offset1','offset2','span'])

In [None]:
len(extracted_entity_results)

In [None]:
# One entity per line: "<label> <span>", where the label is "gene" or "disease".
# Written as a raw string so `\s` and `\w` are not invalid string escapes.
label_entity_pattern = r'^(?P<label>gene|disease)\s+(?P<span>[\w\W]+)$'


def extract_tuple(tuple_string):
    """Parse one line of model output into its label and span.

    Args:
        tuple_string: A single line; surrounding whitespace is ignored.

    Returns:
        {'label': ..., 'span': ...} when the line has the form "<label> <span>",
        otherwise None.
    """
    matches = re.search(label_entity_pattern, tuple_string.strip())
    if matches is None:
        return None

    return {
        'label': matches.group("label").strip(),
        'span': matches.group("span").strip(),
    }

In [None]:
# extract each entity from the combined result string from gpt-3.5
# add each extracted tuple as a new row in extracted_entity_results df
def extract_ner_results(pmid, prompt_id, result_string):
    """Append one row to `extracted_entity_results` per entity found in a model reply.

    Each line of `result_string` is parsed with `extract_tuple`; lines that do not
    parse are skipped. Offsets are left empty ('') and `offset_checked` is False
    on the rows added here.

    Args:
        pmid: Document id; used to look up the document's filename in `data`.
        prompt_id: Id of the prompt that produced the reply.
        result_string: The model's reply. May be None or empty, in which case
            nothing is added (the message content of a chat completion is optional).
    """
    if not result_string:
        return

    parsed_lines = (extract_tuple(line) for line in result_string.splitlines())
    extracted_tuple_list = [parsed for parsed in parsed_lines if parsed]
    if not extracted_tuple_list:
        return

    # The filename depends only on pmid, so look it up once per reply.
    filename = data.loc[data['pmid'] == pmid, 'filename'].iloc[0]

    for extracted_tuple in extracted_tuple_list:
        row = {
            "pmid": pmid,
            "prompt_id": prompt_id,
            "filename": filename,
            "label": extracted_tuple['label'],
            "offset_checked": False,
            "offset1": '',
            "offset2": '',
            "span": extracted_tuple['span'],
        }

        # Append by assigning to the next integer label.
        extracted_entity_results.loc[len(extracted_entity_results)] = row
    

In [None]:
# split each combined result string into one row per extracted tuple
for result_dict in results_list:
    extract_ner_results(result_dict['pmid'], result_dict['prompt_id'], result_dict['result'])


In [None]:
extracted_entity_results

In [None]:
len(extracted_entity_results)

In [None]:
# Find offsets

# Loop over the df; for each span, find the start index of every literal occurrence
# in the document text and hand the occurrences out, in order, to the rows holding
# that span.
for _, row in extracted_entity_results.iterrows():
    pmid = row['pmid']
    prompt_id = row['prompt_id']
    text = data.loc[data['pmid'] == pmid, 'text'].iloc[0]

    if not row['offset_checked'] and row['offset1'] == '':
        span = row['span']
        span_length = len(span)

        # The span is model output, not a regex: escape it so characters such as
        # '(', '+', '*' or '.' are matched literally instead of raising re.error
        # or matching the wrong text.
        span_start_indexes = [m.start() for m in re.finditer(re.escape(span), text)]
        span_count = 0

        # Rows for the same document, prompt and span that have no offsets yet.
        matching_spans = extracted_entity_results[
            (extracted_entity_results['pmid'] == pmid)
            & (extracted_entity_results['prompt_id'] == prompt_id)
            & (extracted_entity_results['span'] == span)
            & (extracted_entity_results['offset1'] == '')
            & (extracted_entity_results['offset_checked'] == False)
        ]

        for index in matching_spans.index:
            if span_count < len(span_start_indexes):
                start = span_start_indexes[span_count]
                extracted_entity_results.loc[index, 'offset1'] = str(start)
                # offset2 is the index of the span's last character (inclusive).
                extracted_entity_results.loc[index, 'offset2'] = str(start + (span_length - 1))

                span_count = span_count + 1
            else:
                # More rows than occurrences in the text: mark the extras with -1.
                extracted_entity_results.loc[index, 'offset1'] = '-1'
                extracted_entity_results.loc[index, 'offset2'] = '-1'

            extracted_entity_results.loc[index, 'offset_checked'] = True
            
        # testing code
        # test_matching_spans = extracted_entity_results[(extracted_entity_results['pmid']==pmid) & (extracted_entity_results['prompt_id']==prompt_id) & (extracted_entity_results['span']==span)]
        # 
        # print(test_matching_spans)

In [None]:
extracted_entity_results

In [None]:
len(extracted_entity_results)


### Step 4: Evaluation

*Skip this part for evaluation dataset as there is no gold standard data to compare against.*

In [None]:
# train_annotations.tsv
# pmid\tfilename\tmark\tlabel\toffset1\toffset2\tspan

# TODO: Keep track of the variations between the runs eg: hyperparams (fixed), prompt that worked best etc. to add the metrics for result 
# Read and find what other people have done 

# brat format for NER
# <unique_id>   <label>  <offset1> <offset2>   <span> 
def bratify(eval_filepath=None, results=None):
    """Placeholder for converting annotations to the brat format (not implemented yet).

    Args:
        eval_filepath: Optional path to a tab-separated gold-standard annotation
            file with a header row. When given, the file is read and five random
            rows are printed.
        results: Optional model results. Currently unused.

    Returns:
        None.
    """
    if eval_filepath is not None:
        gold_df = pd.read_csv(eval_filepath, sep='\t', header=0)
        # TODO: Get gold standard data in brat formation for evaluation
        print(gold_df.sample(5))
        # TODO: Save file in desired output file

    if results is not None:
        # TODO: Get results in the brat format for evaluation
        # TODO: Remove extra whitespaces and new lines from the response for JSON format
        # TODO: Extract each new line as a row in the results
        # formatted_response = re.sub('[^\S\t]', '', response.choices[0].message.content)
        # TODO: Save file in desired output format
        pass
        # TODO: Save file in desired output format
    

In [None]:
# TODO: Replace with the filepath of the gold annotation file to convert to brat format
bratify("./genovardis_train_dev/train_annotation.tsv")

In [None]:
# https://github.com/READ-BioMed/brateval


### Step 5: Saving output

`.tsv` file containing the annotations in the following format: 

`pmid   filename   label   offset1   offset2   span`.



In [None]:
# Extracting results of a specific prompt
def save_output(prompt_id):
    """Save the extracted entities of one prompt to `genovardis_<prompt_id>.tsv`.

    The file is tab-separated with a header row and the columns
    `pmid   filename   label   offset1   offset2   span`.

    Args:
        prompt_id: Id of the prompt whose rows of `extracted_entity_results`
            are written.
    """
    is_prompt_row = extracted_entity_results['prompt_id'] == prompt_id
    extracted_entity_results_subset = extracted_entity_results[is_prompt_row].drop(
        columns=['prompt_id', 'offset_checked']
    )
    print(f'Original len: {len(extracted_entity_results)}, subset len: {len(extracted_entity_results_subset)}\n\n')

    # DataFrame.sample raises ValueError when asked for more rows than exist, so
    # cap the preview; otherwise a prompt with fewer than 5 entities is never saved.
    preview_size = min(5, len(extracted_entity_results_subset))
    print('Sample:\n', extracted_entity_results_subset.sample(preview_size))

    # get results for tsv in the format
    # `pmid   filename   label   offset1   offset2   span`.
    filename = f'genovardis_{prompt_id}.tsv'
    extracted_entity_results_subset.to_csv(filename, sep='\t', index=False, header=True)

    print(f'\nSaved to {filename}\n------------\n')
    

In [None]:
# Write one TSV file per prompt in the prompt library.
for _, prompt in prompt_library.iterrows():
   save_output(prompt['prompt_id']) 