# Processing of PHEE adverse-event data to .CoNLL

Related thoughts for use cases: 
- https://docs.google.com/document/d/1HDpOcQQOmT5A398j_qr-Hm3my4fjuZd8To1YTXhlXNg/edit?tab=t.0#heading=h.kt9y0vodzhfp
- https://docs.google.com/document/d/1KGyr_zPBAdTophA-XGzkW0ObhVNMoKBhioJdpVpdnA0/edit?tab=t.0#heading=h.kfugo3aqnalh
    - This second example shows the value of having models that can capture genevar, drug targets, and adverse events.

In [None]:
from datasets import load_dataset

# Build the PHEE corpus through the local "phee_builder.py" loading script.
phee_dataset = load_dataset("phee_builder.py", "json", trust_remote_code=True)

# The result is a DatasetDict keyed by split name: 'train', 'test' and 'dev'.
print("Full dataset:")
print(phee_dataset)

# Keep a handle on each split; the 'dev' split is bound to eval_split.
train_split, test_split, eval_split = (
    phee_dataset[split_name] for split_name in ("train", "test", "dev")
)

Full dataset:
DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'is_mult_event', 'annotations'],
        num_rows: 2898
    })
    test: Dataset({
        features: ['id', 'context', 'is_mult_event', 'annotations'],
        num_rows: 968
    })
    dev: Dataset({
        features: ['id', 'context', 'is_mult_event', 'annotations'],
        num_rows: 961
    })
})
Train set:
Dataset({
    features: ['id', 'context', 'is_mult_event', 'annotations'],
    num_rows: 2898
})


In [None]:
from Data_mining.adverse_event.phee_builder import PHEE
# Call the builder's example generator directly on the loaded dataset. This only
# creates a generator object (see the cell output); nothing is iterated here.
# NOTE(review): _generate_examples is invoked on the class itself with
# phee_dataset in the first positional slot — confirm this matches the
# method's signature before consuming the generator.
out = PHEE._generate_examples(phee_dataset, ".")

<generator object PHEE._generate_examples at 0x1500d07b0>

In [None]:
import ast
from pprint import pprint
from typing import Dict

# Inspect the first training record: its annotations and the raw sentence.
first_record = train_split[0]
annot = first_record["annotations"]
context = first_record["context"]

# Each annotation carries a list of events whose "event_data" is stored as
# text; join it and parse it back into a Python object for inspection.
for annotation in annot:
    for event in annotation["events"]:
        event_data = ast.literal_eval("".join(event["event_data"]))
        pprint(event_data)

# Character-offset lookup for the sentence,
# e.g. {0: "T", 1: "h", 2: "i", 3: "s" ... and so on}
int_map = dict(enumerate(context))

def capture_ae(event_data: Dict, int_map: Dict, context: str) -> str:
    """
    Mark the adverse-event "Effect" span(s) of one event inside its context.

    Args:
        event_data: Parsed event annotation. Reads "event_type" and, for
            adverse events, "Effect" with the parallel nested lists "start"
            (character offsets into ``context``) and "text" (span strings).
        int_map: Index-to-character map of ``context``. Not used by this
            function; kept so existing callers keep working.
        context: The sentence that the offsets in ``event_data`` refer to.

    Returns:
        ``context`` with every Effect span surrounded by the space-padded
        markers ``__START__`` and ``__END__``. ``context`` is returned
        unchanged when the event is not an adverse event or when it carries
        no usable Effect span.
    """
    if event_data.get("event_type") != "Adverse_event":
        # This seems to often be "potential therapeutic event"
        # No adverse event recorded, so treat as 0 annotations of AdEv
        return context

    ae_desc = event_data.get("Effect")
    if not ae_desc:
        # Something odd here, see example context:
        # "Pulmonary edema during acute infusion of epoprostenol in a patient with pulmonary hypertension and limited scleroderma."
        # Tagged as an AdEv, but no Effect described in data, returning with no annotations for now but TODO - Review
        return context

    # "start" and "text" are parallel lists of lists; flatten them into
    # (offset, span) pairs and order them by position in the sentence.
    spans = sorted(
        (start, span_text)
        for starts, texts in zip(ae_desc.get("start", []), ae_desc.get("text", []))
        for start, span_text in zip(starts, texts)
    )
    if not spans:
        return context

    pieces = []
    cursor = 0
    for start, span_text in spans:
        if start < cursor:
            # Overlaps a span that is already marked; nested markers would
            # corrupt the downstream tagging, so skip it.
            continue
        pieces.append(context[cursor:start])
        pieces.append(f" __START__ {span_text} __END__ ")
        cursor = start + len(span_text)
    pieces.append(context[cursor:])

    return "".join(pieces)

# Demo on the first training record: event_data holds whichever event the
# loop above parsed last, so only that single event is annotated here.
out = capture_ae(event_data=event_data, int_map=int_map, context=context)
print(out)


{'Effect': {'entity_id': ['T5'], 'start': [[170]], 'text': [['lipoatrophies']]},
 'Treatment': {'Disorder': {'entity_id': ['T7'],
                            'start': [[189]],
                            'text': [['diabetic']]},
               'Drug': {'entity_id': ['T6'],
                        'start': [[154]],
                        'text': [['insulin']]},
               'entity_id': ['T3'],
               'start': [[154]],
               'text': [['insulin']]},
 'Trigger': {'entity_id': ['T4'], 'start': [[162]], 'text': [['induced']]},
 'event_id': 'E1',
 'event_type': 'Adverse_event'}
OBJECTIVE: To test the hypothesis that tumor necrosis factor (TNF)-alpha may mediate the loss and the dedifferentiation of subcutaneous fat tissue in the insulin-induced  __START__ lipoatrophies __END__  of a diabetic patient who presented extensive lesions.


In [None]:
import re

def format_to_conll(text):
    """
    Format __START__/__END__ annotated sentences as CoNLL token/tag lines.

    Args:
        text: An iterable of annotated sentences, or a single annotated
            sentence given as one string. The markers must be separate
            whitespace-delimited tokens.

    Returns:
        One "token<TAB>tag" line per token, with an empty line closing each
        sentence. Tokens between the markers are tagged "B-Adev" (first
        token of a span) or "I-Adev" (the rest); all others are tagged "O".
        One trailing '.', ',', ';' or ':' is stripped from every token, and
        tokens that are empty after stripping are dropped.
    """
    # A bare string is one sentence; iterating it would yield characters.
    sentences = [text] if isinstance(text, str) else text
    trailing_punct = re.compile(r"[.,;:]$")

    conll_lines = []
    for sentence in sentences:
        in_entity = False
        # Tracked explicitly rather than inferred from the previous tag, so
        # that a span starting right after another one still opens with B-.
        at_entity_start = False

        for token in sentence.strip().split():
            if token == "__START__":
                in_entity = True
                at_entity_start = True
                continue
            if token == "__END__":
                in_entity = False
                continue

            # Remove punctuation from tokens for cleaner tagging
            clean_token = trailing_punct.sub("", token)
            if not clean_token:
                # Punctuation-only token: nothing left to tag.
                continue

            if not in_entity:
                tag = "O"
            elif at_entity_start:
                tag = "B-Adev"
                at_entity_start = False
            else:
                tag = "I-Adev"
            conll_lines.append(f"{clean_token}\t{tag}")

        # Add a blank line to separate sentences
        conll_lines.append("")

    return "\n".join(conll_lines)

In [184]:
import json

# Splits to export, keyed by the name used in the output file.
full_data = {"train": train_split,
             "test": test_split,
             "eval": eval_split}

for name, split in full_data.items():
    res_list = []
    for row_idx, record in enumerate(split):
        # Only rows 0..1000 of each split are exported.
        # NOTE(review): this keeps 1001 rows — confirm that is the intended cap.
        if row_idx > 1000:
            break
        context = record["context"]
        # Built once per sentence; every event of the record shares it.
        int_map = dict(enumerate(context))
        for annotation in record["annotations"]:
            for event in annotation["events"]:
                event_data = json.loads(event["event_data"])
                out = capture_ae(event_data=event_data, int_map=int_map, context=context)
                res_list.append(out)

    # One annotated sentence per event, written as a single CoNLL document.
    conll_output = format_to_conll(res_list)
    with open(f"PHEE_{name}_v1.conll", "w", encoding="utf-8") as f:
        f.write(conll_output)