## Load libraries

In [1]:

import os
from langchain.chat_models import ChatOpenAI, AzureChatOpenAI, ChatAnthropic
from langchain.llms import AzureOpenAI, LlamaCpp
from langchain.embeddings import OpenAIEmbeddings
from langchain import PromptTemplate, LLMChain

from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

## Instantiate OpenAI models

In [2]:
llm_gpt_35_turbo = AzureChatOpenAI(

        deployment_name="gpt-35-turbo",

        model_name="gpt-35-turbo",

        temperature= 0.7,

        openai_api_version="2023-05-15",

        openai_api_key= "cd826423871544a486d616f14805725a",

        openai_api_base="https://testavinx.openai.azure.com/",

        openai_api_type="azure",

    )

In [3]:
embeddings = OpenAIEmbeddings(deployment="embedding1",

        model="text-embedding-ada-002",

        openai_api_version="2023-05-15",

        openai_api_key= "cd826423871544a486d616f14805725a",

        openai_api_base="https://testavinx.openai.azure.com/",

        openai_api_type="azure",

        chunk_size = 1)

## Extract entites using GPT

In [4]:
template = """Defn: An entity is an age (age), gender of the person (sex), history (history), symptomns of the disease (symptoms), the duration of the 
                        symptoms (duration_symptom), disease that has been diagnosed (diagnosis), treament of the disease (treatment)
                        "Abstract concepts, descriptions, processes and adjectives are not entities"
            Example 1: CASE: A 28-year-old previously healthy man presented with a 6-week history of palpitations.
                        The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea.
                        Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory
                        accentuation), physical examination yielded unremarkable findings. An electrocardiogram (ECG) revealed normal sinus rhythm
                        and a Wolff– Parkinson– White pre-excitation pattern, produced by a right-sided accessory pathway.
                        Transthoracic echocardiography demonstrated the presence of Ebstein's anomaly of the tricuspid valve, with apical displacement 
                        of the valve and formation of an “atrialized” right ventricle (a functional unit between the right atrium and the inlet [inflow] portion of the right ventricle) (Fig.2).
                        The anterior tricuspid valve leaflet was elongated (Fig.2C, arrow), whereas the septal leaflet was rudimentary (Fig.2C, arrowhead).
                        Contrast echocardiography using saline revealed a patent foramen ovale with right-to-left shunting and bubbles in the left atrium (Fig.2D).
                        The patient underwent an electrophysiologic study with mapping of the accessory pathway, followed by radiofrequency ablation 
                        (interruption of the pathway using the heat generated by electromagnetic waves at the tip of an ablation catheter).
                        His post-ablation ECG showed a prolonged PR interval and an odd “second” QRS complex in leads III, aVF and V2–V4 (Fig.1Bottom), 
                        a consequence of abnormal impulse conduction in the “atrialized” right ventricle.
                        The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.

            Answer:
                        - 28-year-old | True | as it represents the age of the patient (age)
                        - previously healthy | True | as it represents the history of the patient (history)
                        - 6-week | True | as it represents the duration of the symptom palpitations (duration_symptom)
                        - palpitations | True | as it one of the symptoms (symptoms)
                        - dyspnea | True | as it is one of the symptoms (symptoms)
                        - electrocardiogram (ECG) | False | as it is not an entity of interest
                        - Ebstein's anomaly | True | as it is the disease being diagnosed (diagnosis) 
                

            Text: {text}
            Answer: 

             """

In [5]:
prompt = ChatPromptTemplate.from_template(template)

model = llm_gpt_35_turbo

In [6]:
chain = (
    {"text": RunnablePassthrough()} 
    | prompt 
    | model 
    | StrOutputParser()
)

## Test

#### Example 1

In [7]:
text = """The patient was a 34-yr-old man who presented with complaints of fever and a chronic cough.
He was a smoker and had a history of pulmonary tuberculosis that had been treated and cured.
A computed tomographic (CT) scan revealed multiple tiny nodules in both lungs.
A thoracoscopic lung biopsy was taken from the right upper lobe.
The microscopic examination revealed a typical LCH.
The tumor cells had vesicular and grooved nuclei, and they formed small aggregations around the bronchioles (Fig.1).
The tumor cells were strongly positive for S-100 protein, vimentin, CD68 and CD1a.
There were infiltrations of lymphocytes and eosinophils around the tumor cells.
With performing additional radiologic examinations, no other organs were thought to be involved.
He quit smoking, but he received no other specific treatment.
He was well for the following one year.
After this, a follow-up CT scan was performed and it showed a 4 cm-sized mass in the left lower lobe, in addition to the multiple tiny nodules in both lungs (Fig.2).
A needle biopsy specimen revealed the possibility of a sarcoma; therefore, a lobectomy was performed.
Grossly, a 4 cm-sized poorly-circumscribed lobulated gray-white mass was found (Fig.3), and there were a few small satellite nodules around the main mass.
Microscopically, the tumor cells were aggregated in large sheets and they showed an infiltrative growth.
The cytologic features of some of the tumor cells were similar to those seen in a typical LCH.
However, many tumor cells showed overtly malignant cytologic features such as pleomorphic/hyperchromatic nuclei and prominent nucleoli (Fig.4), and multinucleated tumor giant cells were also found.
There were numerous mitotic figures ranging from 30 to 60 per 10 high power fields, and some of them were abnormal.
A few foci of typical LCH remained around the main tumor mass.
Immunohistochemically, the tumor cells were strongly positive for S-100 protein (Fig.5) and vimentin; they were also positive for CD68 (Dako N1577, Clone KPI), and focally positive for CD1a (Fig.6), and they were negative for cytokeratin, epithelial membrane antigen, CD3, CD20 and HMB45.
The ultrastructural analysis failed to demonstrate any Birbeck granules in the cytoplasm of the tumor cells.
Now, at five months after lobectomy, the patient is doing well with no significant change in the radiologic findings.
"""

In [8]:
entities = chain.invoke(text)
print(entities)

- 34-yr-old | True | as it represents the age of the patient (age)
                        - smoker | True | as it represents the history of the patient (history)
                        - fever | True | as it is one of the symptoms (symptoms)
                        - chronic cough | True | as it is one of the symptoms (symptoms)
                        - pulmonary tuberculosis | True | as it is the disease being cured (diagnosis)
                        - computed tomographic (CT) scan | False | as it is not an entity of interest
                        - LCH (Langerhans cell histiocytosis) | True | as it is the disease being diagnosed (diagnosis)
                        - S-100 protein, vimentin, CD68 and CD1a | True | as they represent the treatment (treatment)
                        - sarcoma | True | as it is the possibility of another disease being diagnosed (diagnosis) 
                        - lobectomy | True | as it represents the treatment (treatment)


### Clean Output to (word-entity) format

In [9]:
import re

lines = entities.split('\n')

for line in lines:
    if "| True |" in line:
        parts = re.split(r'\| True \|', line)
        words_before = parts[0].strip()
        words_in_brackets = re.search(r'\((.*?)\)', parts[1]).group(1)
        
        if ',' in words_before:
            words_list = [word.strip() for word in words_before.split(',')]
            for word in words_list:
                if word != words_list[0]:
                    print(f"- {word} ({words_in_brackets})")
                else:
                    print(f"{word} ({words_in_brackets})") 
        else:
            print(f"{words_before} ({words_in_brackets})")


- 34-yr-old (age)
- smoker (history)
- fever (symptoms)
- chronic cough (symptoms)
- pulmonary tuberculosis (diagnosis)
- LCH (Langerhans cell histiocytosis) (diagnosis)
- S-100 protein (treatment)
- vimentin (treatment)
- CD68 and CD1a (treatment)
- sarcoma (diagnosis)
- lobectomy (treatment)


#### Example2

In [10]:
text2 = """A 23-year-old white male with a 4 year history of Crohn's disease presented with an acute two day history of malaise, fever, abdominal pain, vomiting and stomal diarrhoea.
He complained of joint pains affecting the shoulders, elbows, wrists, metacarpophalangeals, knees and ankles.
There was also a rash on the elbows, ankles and feet, which began as erythematous macules and evolved to vesico-pustular lesions followed by crusting.
There was no history of sexual exposure or any intercurrent infection.
He was on no regular medication, but had discontinued Pentasa 4 months earlier.
One month earlier he had undergone a laparotomy to excise a complex ileo-cutaneous fistula with blind tracts, and two weeks earlier a defunctioning ileostomy had been created in view of persistent abdominal pain.
The cutaneous fistula had been present for a year, but was associated with a terminal ileal stricture and ileo-rectal fistula of at least 2 years duration.
His bowel disease had been resistant to immunosuppressive drugs including azathioprine, corticosteroiods and three infusions of Infliximab a year earlier.
There had been no extra-intestinal manifestations.
On admission to the hospital, he was thin, afebrile with a resting tachycardia of 125/minute.
The rest of the cardio-respiratory examination was normal.
The abdomen was minimally tender around the ileostomy without guarding or rebound tenderness.
Examination of the skin revealed some pustules and crusts around the elbows, ankles and feet (Fig 1 and  2).
The buttocks were spared.
There were clinical signs of synovitis of the wrists, proximal interphalangeal and metacarpophalangeal joints, and also both ankles.
Results of the laboratory tests showed a haemoglobin of 13.1 gm/dl, white blood count 15.8 × 109/L, platelets 585 × 109/L, C- reactive protein 37.7 mg/L and erythrocyte sedimentation rate 69 mm/hr.
Urea and electrolytes, complement, urine analysis and microscopy were normal.
Rheumatoid factor, antinuclear and antineutrophil cytoplasmic antibodies, cryoglobulins and Hepatitis B and C serology were negative.
Transthoracic echocardiogram showed no signs of endocarditis, and multiple blood cultures were sterile.
A skin biopsy from the ankle revealed a perivascular lymphohistiocytic infiltrate with prominent neutrophils and associated fibrinoid necrosis of vessels consistent with 'leukocytoclastic' vasculitis.
He was commenced on 60 mg prednisolone per day leading to prompt and complete resolution of all features.
The dose of prednisolone was rapidly tapered over the course of one month by the patient, faster than advised but without any recurrence over the following 3 years.
"""

In [29]:
gpt_entities = chain.invoke(text2)
print(gpt_entities)

- 23-year-old | True | as it represents the age of the patient (age)
              - Crohn's disease | True | as it represents the disease that the patient has a history of (history)
              - 4 year | True | as it represents the duration of the patient's Crohn's disease (duration_symptom)
              - malaise | True | as it is one of the symptoms (symptoms)
              - fever | True | as it is one of the symptoms (symptoms)
              - abdominal pain | True | as it is one of the symptoms (symptoms)
              - vomiting | True | as it is one of the symptoms (symptoms)
              - stomal diarrhea | True | as it is one of the symptoms (symptoms)
              - joint pains | True | as it is one of the symptoms (symptoms)
              - rash | True | as it is one of the symptoms (symptoms)
              - Pentasa | False | as it is not an entity of interest
              - laparotomy | False | as it is not an entity of interest
              - ileo-cutaneous fistu

In [27]:
def clean_entities(gpt_entities):
    import re

    lines = entities.split('\n')

    cleaned_entities = ''
    for line in lines:
        if "| True |" in line:
            parts = re.split(r'\| True \|', line)
            words_before = parts[0].strip()
            words_in_brackets = re.search(r'\((.*?)\)', parts[1]).group(1)
            
            if ',' in words_before:
                words_list = [word.strip() for word in words_before.split(',')]
                for word in words_list:
                    if word != words_list[0]:
                        # print(f"- {word} ({words_in_brackets})")
                        cleaned_entities+= f"- {word} ({words_in_brackets}) \n"

                    else:
                        # print(f"{word} ({words_in_brackets})") 
                        cleaned_entities+= f"{word} ({words_in_brackets})\n"
            else:
                # print(f"{words_before} ({words_in_brackets})")
                cleaned_entities+= f"{words_before} ({words_in_brackets})\n"
    
    return cleaned_entities


#### IOB Format

In [31]:
cleaned_gpt_entities = clean_entities(gpt_entities)

In [14]:
def iob_format(entities):
    entities_dict = {}
    entities_list = entities.split('\n')

    for entity in entities_list:
        words_list = entity.split()
        for i in range(1,len(words_list)-1):
            if i==1:
                entities_dict[words_list[i]] = f"B-{words_list[-1].upper().replace('(', '').replace(')', '')}"
                # print(f"{words_list[i]} B-{words_list[-1].upper().replace('(', '').replace(')', '')}")
            else:
                entities_dict[words_list[i]] = f"I-{words_list[-1].upper().replace('(', '').replace(')', '')}"
                # print(f"{words_list[i]} I-{words_list[-1].upper().replace('(', '').replace(')', '')}")

    return entities_dict


In [15]:
entities_dict = iob_format(cleaned_gpt_entities)

In [16]:
entities_dict

{'23-year-old': 'B-AGE',
 "Crohn's": 'B-HISTORY',
 'disease': 'I-HISTORY',
 '4-year': 'B-DURATION_SYMPTOM',
 'malaise': 'B-SYMPTOMS',
 'fever': 'B-SYMPTOMS',
 'abdominal': 'B-SYMPTOMS',
 'pain': 'I-SYMPTOMS',
 'vomiting': 'B-SYMPTOMS',
 'stomal': 'B-SYMPTOMS',
 'diarrhoea': 'I-SYMPTOMS',
 'joint': 'B-SYMPTOMS',
 'pains': 'I-SYMPTOMS',
 'rash': 'B-SYMPTOMS',
 'ileo-cutaneous': 'B-DIAGNOSIS',
 'fistula': 'I-DIAGNOSIS',
 'synovitis': 'B-SYMPTOMS',
 'resting': 'B-SYMPTOMS',
 'tachycardia': 'I-SYMPTOMS',
 'haemoglobin': 'B-DIAGNOSIS',
 'white': 'B-DIAGNOSIS',
 'blood': 'I-DIAGNOSIS',
 'count': 'I-DIAGNOSIS',
 'platelets': 'B-DIAGNOSIS',
 'C-reactive': 'B-DIAGNOSIS',
 'protein': 'I-DIAGNOSIS',
 'erythrocyte': 'B-DIAGNOSIS',
 'sedimentation': 'I-DIAGNOSIS',
 'rate': 'I-DIAGNOSIS',
 'skin': 'B-DIAGNOSIS',
 'biopsy': 'I-DIAGNOSIS',
 'prednisolone': 'B-TREATMENT'}

In [17]:

tokens = text2.split()

IOB_tags = {}

for token in tokens:

    if token in entities_dict.keys():
        IOB_tags[token] = entities_dict[token]

    else:

        IOB_tags[token] = 'O'




In [18]:
IOB_tags

{'A': 'O',
 '23-year-old': 'B-AGE',
 'white': 'B-DIAGNOSIS',
 'male': 'O',
 'with': 'O',
 'a': 'O',
 '4': 'O',
 'year': 'O',
 'history': 'O',
 'of': 'O',
 "Crohn's": 'B-HISTORY',
 'disease': 'I-HISTORY',
 'presented': 'O',
 'an': 'O',
 'acute': 'O',
 'two': 'O',
 'day': 'O',
 'malaise,': 'O',
 'fever,': 'O',
 'abdominal': 'B-SYMPTOMS',
 'pain,': 'O',
 'vomiting': 'B-SYMPTOMS',
 'and': 'O',
 'stomal': 'B-SYMPTOMS',
 'diarrhoea.': 'O',
 'He': 'O',
 'complained': 'O',
 'joint': 'B-SYMPTOMS',
 'pains': 'I-SYMPTOMS',
 'affecting': 'O',
 'the': 'O',
 'shoulders,': 'O',
 'elbows,': 'O',
 'wrists,': 'O',
 'metacarpophalangeals,': 'O',
 'knees': 'O',
 'ankles.': 'O',
 'There': 'O',
 'was': 'O',
 'also': 'O',
 'rash': 'B-SYMPTOMS',
 'on': 'O',
 'ankles': 'O',
 'feet,': 'O',
 'which': 'O',
 'began': 'O',
 'as': 'O',
 'erythematous': 'O',
 'macules': 'O',
 'evolved': 'O',
 'to': 'O',
 'vesico-pustular': 'O',
 'lesions': 'O',
 'followed': 'O',
 'by': 'O',
 'crusting.': 'O',
 'no': 'O',
 'sexual': '

In [19]:
iob_format(entities2)

{'23-year-old': 'B-AGE',
 "Crohn's": 'B-HISTORY',
 'disease': 'I-HISTORY',
 '4-year': 'B-DURATION_SYMPTOM',
 'malaise': 'B-SYMPTOMS',
 'fever': 'B-SYMPTOMS',
 'abdominal': 'B-SYMPTOMS',
 'pain': 'I-SYMPTOMS',
 'vomiting': 'B-SYMPTOMS',
 'stomal': 'B-SYMPTOMS',
 'diarrhoea': 'I-SYMPTOMS',
 'joint': 'B-SYMPTOMS',
 'pains': 'I-SYMPTOMS',
 'rash': 'B-SYMPTOMS',
 'ileo-cutaneous': 'B-DIAGNOSIS',
 'fistula': 'I-DIAGNOSIS',
 'synovitis': 'B-SYMPTOMS',
 'resting': 'B-SYMPTOMS',
 'tachycardia': 'I-SYMPTOMS',
 'haemoglobin': 'B-DIAGNOSIS',
 'white': 'B-DIAGNOSIS',
 'blood': 'I-DIAGNOSIS',
 'count': 'I-DIAGNOSIS',
 'platelets': 'B-DIAGNOSIS',
 'C-reactive': 'B-DIAGNOSIS',
 'protein': 'I-DIAGNOSIS',
 'erythrocyte': 'B-DIAGNOSIS',
 'sedimentation': 'I-DIAGNOSIS',
 'rate': 'I-DIAGNOSIS',
 'skin': 'B-DIAGNOSIS',
 'biopsy': 'I-DIAGNOSIS',
 'prednisolone': 'B-TREATMENT'}

In [20]:
import re

text = """
The patient was a 34-yr-old man who presented with complaints of fever and a chronic cough.
He was a smoker and had a history of pulmonary tuberculosis that had been treated and cured.
A computed tomographic (CT) scan revealed multiple tiny nodules in both lungs.
A thoracoscopic lung biopsy was taken from the right upper lobe.
The microscopic examination revealed a typical LCH.
The tumor cells had vesicular and grooved nuclei, and they formed small aggregations around the bronchioles (Fig.1).
The tumor cells were strongly positive for S-100 protein, vimentin, CD68 and CD1a.
There were infiltrations of lymphocytes and eosinophils around the tumor cells.
With performing additional radiologic examinations, no other organs were thought to be involved.
He quit smoking, but he received no other specific treatment.
He was well for the following one year.
After this, a follow-up CT scan was performed and it showed a 4 cm-sized mass in the left lower lobe, in addition to the multiple tiny nodules in both lungs (Fig.2).
A needle biopsy specimen revealed the possibility of a sarcoma; therefore, a lobectomy was performed.
Grossly, a 4 cm-sized poorly-circumscribed lobulated gray-white mass was found (Fig.3), and there were a few small satellite nodules around the main mass.
Microscopically, the tumor cells were aggregated in large sheets and they showed an infiltrative growth.
The cytologic features of some of the tumor cells were similar to those seen in a typical LCH.
However, many tumor cells showed overtly malignant cytologic features such as pleomorphic/hyperchromatic nuclei and prominent nucleoli (Fig.4), and multinucleated tumor giant cells were also found.
There were numerous mitotic figures ranging from 30 to 60 per 10 high power fields, and some of them were abnormal.
A few foci of typical LCH remained around the main tumor mass.
Immunohistochemically, the tumor cells were strongly positive for S-100 protein (Fig.5) and vimentin; they were also positive for CD68 (Dako N1577, Clone KPI), and focally positive for CD1a (Fig.6), and they were negative for cytokeratin, epithelial membrane antigen, CD3, CD20 and HMB45.
The ultrastructural analysis failed to demonstrate any Birbeck granules in the cytoplasm of the tumor cells.
Now, at five months after lobectomy, the patient is doing well with no significant change in the radiologic findings.
"""

# Define the entity to tag mapping
entity_mapping = {
    "34-yr-old": "B-age",
    "smoker": "B-history",
    "fever": "B-symptoms",
    "chronic cough": "I-symptoms",
    "pulmonary tuberculosis": "B-history",
    "multiple tiny nodules in both lungs": "B-history",
    "LCH": "B-diagnosis",
    "S-100 protein": "B-treatment",
    "vimentin": "I-treatment",
    "CD68": "I-treatment",
    "CD1a": "I-treatment",
    "lobectomy": "B-treatment"
}

# Split the text into sentences
sentences = re.split(r'\.\s', text)

# Initialize a list to store the IOB formatted text
iob_text = []

# Iterate through each sentence and tokenize it
for sentence in sentences:
    words = sentence.split()

    # Initialize a list to store the IOB tags for this sentence
    iob_tags = []

    for word in words:
        # Check if the word is in the entity mapping
        if word in entity_mapping:
            iob_tag = entity_mapping[word]
        else:
            iob_tag = "O"  # "O" for Outside

        iob_tags.append(iob_tag)

    iob_text.append(iob_tags)

# Print the IOB formatted text
for i, sentence_iob in enumerate(iob_text):
    print(f"Sentence {i + 1} - IOB Tags: {' '.join(sentence_iob)}")


Sentence 1 - IOB Tags: O O O O B-age O O O O O O B-symptoms O O O O
Sentence 2 - IOB Tags: O O O B-history O O O O O O O O O O O O O
Sentence 3 - IOB Tags: O O O O O O O O O O O O
Sentence 4 - IOB Tags: O O O O O O O O O O O
Sentence 5 - IOB Tags: O O O O O O B-diagnosis
Sentence 6 - IOB Tags: O O O O O O O O O O O O O O O O O
Sentence 7 - IOB Tags: O O O O O O O O O O I-treatment O I-treatment
Sentence 8 - IOB Tags: O O O O O O O O O O O
Sentence 9 - IOB Tags: O O O O O O O O O O O O O
Sentence 10 - IOB Tags: O O O O O O O O O O
Sentence 11 - IOB Tags: O O O O O O O O
Sentence 12 - IOB Tags: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
Sentence 13 - IOB Tags: O O O O O O O O O O O O B-treatment O O
Sentence 14 - IOB Tags: O O O O O O O O O O O O O O O O O O O O O O O
Sentence 15 - IOB Tags: O O O O O O O O O O O O O O O
Sentence 16 - IOB Tags: O O O O O O O O O O O O O O O O O B-diagnosis
Sentence 17 - IOB Tags: O O O O O O O O O O O O O O O O O O O O O O O O O
Senten

In [21]:
import re

text = """
The patient was a 34-yr-old man who presented with complaints of fever and a chronic cough.
He was a smoker and had a history of pulmonary tuberculosis that had been treated and cured.
A computed tomographic (CT) scan revealed multiple tiny nodules in both lungs.
A thoracoscopic lung biopsy was taken from the right upper lobe.
The microscopic examination revealed a typical LCH.
The tumor cells had vesicular and grooved nuclei, and they formed small aggregations around the bronchioles (Fig.1).
The tumor cells were strongly positive for S-100 protein, vimentin, CD68 and CD1a.
There were infiltrations of lymphocytes and eosinophils around the tumor cells.
With performing additional radiologic examinations, no other organs were thought to be involved.
He quit smoking, but he received no other specific treatment.
He was well for the following one year.
After this, a follow-up CT scan was performed and it showed a 4 cm-sized mass in the left lower lobe, in addition to the multiple tiny nodules in both lungs (Fig.2).
A needle biopsy specimen revealed the possibility of a sarcoma; therefore, a lobectomy was performed.
Grossly, a 4 cm-sized poorly-circumscribed lobulated gray-white mass was found (Fig.3), and there were a few small satellite nodules around the main mass.
Microscopically, the tumor cells were aggregated in large sheets and they showed an infiltrative growth.
The cytologic features of some of the tumor cells were similar to those seen in a typical LCH.
However, many tumor cells showed overtly malignant cytologic features such as pleomorphic/hyperchromatic nuclei and prominent nucleoli (Fig.4), and multinucleated tumor giant cells were also found.
There were numerous mitotic figures ranging from 30 to 60 per 10 high power fields, and some of them were abnormal.
A few foci of typical LCH remained around the main tumor mass.
Immunohistochemically, the tumor cells were strongly positive for S-100 protein (Fig.5) and vimentin; they were also positive for CD68 (Dako N1577, Clone KPI), and focally positive for CD1a (Fig.6), and they were negative for cytokeratin, epithelial membrane antigen, CD3, CD20 and HMB45.
The ultrastructural analysis failed to demonstrate any Birbeck granules in the cytoplasm of the tumor cells.
Now, at five months after lobectomy, the patient is doing well with no significant change in the radiologic findings.
"""

# Define the entity to tag mapping
entity_mapping = {
    "34-yr-old": "B-age",
    "smoker": "B-history",
    "fever": "B-symptoms",
    "chronic cough": "B-symptoms",
    "pulmonary tuberculosis": "B-history",
    "multiple tiny nodules in both lungs": "B-history",
    "LCH": "B-diagnosis",
    "S-100 protein": "B-treatment",
    "vimentin": "I-treatment",
    "CD68": "I-treatment",
    "CD1a": "I-treatment",
    "lobectomy": "B-treatment"
}

# # Split the text into sentences
# sentences = re.split(r'\.\s', text)

# Initialize a list to store the IOB formatted text
iob_text = []

# Iterate through each sentence and tokenize it
words = text.split()

# Initialize a list to store the IOB tags for this sentence
iob_tags = []

for word in words:
    # Check if the word is in the entity mapping
    if word in entity_mapping:
        iob_tag = entity_mapping[word]
    else:
        iob_tag = "O"  # "O" for Outside

    iob_tags.append(iob_tag)

iob_text.append(iob_tags)

# Print the IOB formatted text
# for i, sentence_iob in enumerate(iob_text):
#     print(f"Sentence {i + 1} - IOB Tags: {' '.join(sentence_iob)}")




In [22]:
for i, sentence_iob in enumerate(iob_text):
    print(f"Sentence {i + 1} - IOB Tags: {' '.join(sentence_iob)}")

Sentence 1 - IOB Tags: O O O O B-age O O O O O O B-symptoms O O O O O O O B-history O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O I-treatment O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-treatment O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O B-diagnosis O O O O O O O O O O O O O O O O O O O O O O O O I-treatment O O O O O O O O I-treatment O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O
