In [None]:
!pip install vllm
!pip install -U huggingface_hub[cli]
!pip install triton
!pip install openai

Collecting vllm
  Downloading vllm-0.6.2-cp38-abi3-manylinux1_x86_64.whl.metadata (2.4 kB)
Collecting transformers>=4.45.0 (from vllm)
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.40.0 (from vllm)
  Downloading openai-1.51.1-py3-none-any.whl.metadata (24 kB)
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.31.0-py3-none-any.whl.metadata (6.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting lm-format-enforcer==0.10.6 (from vllm)
  Downloading lm_format_enforcer-0.10.6-py3-none-any.whl.metadata (16 kB)
Collecting outlines<0.1,>=0.0.43 (from vllm)
  Down

In [None]:
!huggingface-cli login --token #write your hugging face token here

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from collections import defaultdict
from openai import OpenAI
from tqdm.notebook import tqdm
import re
import json

In [None]:
# Path to the annotated input file. Records are separated by blank lines; each
# record is an id line, the abstract text line, then zero or more
# tab-separated "start<TAB>end<TAB>mention" annotation lines.
folder_dev = "Artificial dataset DISEASE.tsv"

# Sentence boundary: whitespace that follows sentence-final punctuation.
SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def parse_abstracts(records):
    """Turn raw record strings into a ``{pid: {...}}`` mapping.

    For every record the result holds:
      * ``"text"``        - the abstract (second line, stripped),
      * ``"sentences"``   - the abstract split on ``SENTENCE_BOUNDARY``,
        with empty pieces dropped,
      * ``"annotations"`` - one ``{"span": "start:end", "mention": ...}`` dict
        per well-formed annotation line (the key is absent when a record has
        no annotation lines).

    Malformed input is reported with ``print`` and skipped instead of raising:
    records with fewer than two lines, and annotation lines that do not have
    exactly three tab-separated fields. Blank lines inside a record are ignored.

    Args:
        records: iterable of record strings (one per abstract).

    Returns:
        A nested ``defaultdict`` keyed by record id.
    """
    parsed = defaultdict(lambda: defaultdict(list))

    for record in records:
        # Drop stray newlines left over from runs of 3+ newlines or a trailing
        # newline at end of file; otherwise they show up as empty lines/ids.
        lines = record.strip("\n").split("\n")
        if len(lines) < 2:
            print(lines)
            continue

        pid, abstract = lines[0].strip(), lines[1].strip()
        entry = parsed[pid]
        entry["text"] = abstract
        entry["sentences"] = [
            piece.strip()
            for piece in SENTENCE_BOUNDARY.split(abstract)
            if piece.strip()
        ]

        for line in lines[2:]:
            if not line.strip():
                continue
            fields = line.split("\t")
            if len(fields) != 3:
                print(fields)
                continue
            start, end, mention = fields
            entry["annotations"].append({
                "span": start + ":" + end,
                "mention": mention
            })

    return parsed


with open(folder_dev, encoding="utf-8") as file:
    text = file.read()

# Whitespace-only chunks are not records.
abstracts = [chunk for chunk in text.split("\n\n") if chunk.strip()]
print(len(abstracts))

parsed_data = parse_abstracts(abstracts)

10


In [None]:
!nohup vllm serve "microsoft/Phi-3-small-8k-instruct" --dtype half --api-key noapikey --trust_remote_code &

nohup: appending output to 'nohup.out'


In [None]:
# OpenAI-compatible client pointed at the server on localhost:8000.
client = OpenAI(api_key="noapikey", base_url="http://localhost:8000/v1")

In [None]:
# Model identifier passed as `model=` in every chat-completion request below.
MODEL_NAME = "microsoft/Phi-3-small-8k-instruct"

New base prompts:

In [None]:
# Base system prompt for the "extract" task: the model is asked to return the
# disease mentions of the input text as a semicolon-separated string inside a
# JSON object. Contains one worked example.
SYSTEM_MESSAGE_EXTRACT = """
Your task is to identify and extract all disease mentions in the text provided within triple backticks. Format the output as the following JSON object:
```json:\n{"diseases": "a semicolon-separated list of specific disease names or disease classes."}```
Avoid adding any additional remarks and explanations.

### Example:
Text:
```
The tumor suppressor gene Brca1 is required for embryonic cellular proliferation in the mouse. Mutations of the BRCA1 gene in humans are associated with predisposition to breast and ovarian cancers.
```
Output:
```
json:
{"diseases": "tumor; breast and ovarian cancers"}
```
"""

# Base system prompt for the "markup" task: the model is asked to return the
# input text with every disease mention wrapped in
# <entity type=disease>...</entity> tags. Contains one worked example.
SYSTEM_MESSAGE_MARKUP = """
Your task is to identify and mark up all disease mentions in the text provided within triple backticks using the following HTML tags:
Use the <entity type=disease></entity> tag to mark all specific disease names or disease classes.
Avoid adding any additional remarks and explanations.

### Example:
Text:
```
The tumor suppressor gene Brca1 is required for embryonic cellular proliferation in the mouse. Mutations of the BRCA1 gene in humans are associated with predisposition to breast and ovarian cancers.
```
Text with markup:
```
The <entity type=disease>tumor</entity> suppressor gene Brca1 is required for embryonic cellular proliferation in the mouse. Mutations of the BRCA1 gene in humans are associated with predisposition to <entity type=disease>breast and ovarian cancers</entity>.
```
"""

New prompts with positive rules (3-shot: three examples per rule):

In [None]:
# Supplementary rules for the "extract" task, appended to
# SYSTEM_MESSAGE_EXTRACT to form SYS_MESSAGE_EXTRACT_AG. Five numbered rules,
# each illustrated with three Text/Output examples in the same JSON output
# format as the base prompt.
POSITIVE_RULES_EXTRACT_MESSAGE = """
Please follow the rules below to extract all disease mentions correctly from the input text.

### Extraction Rules:
1. Extract Multiple Disease Mentions together if cannot be separated:
### Examples:
Text:
```
The conference highlighted recent advancements in treating Alzheimer's and Lewy body dementia to improve patient quality of life.
```
Output:
```
json:
{"diseases": "Alzheimer's and Lewy body dementia"}
```

Text:
```
The patient was diagnosed with liver, kidney, and pancreatic cancer after a series of tests.
```
Output:
```
json:
{"diseases": "liver, kidney, and pancreatic cancer"}
```

Text:
```
The study investigated the prevalence of lupus and rheumatoid arthritis among women of childbearing age.
```
Output:
```
json:
{"diseases": "lupus and rheumatoid arthritis"}
```
2. Extract ONLY the Disease Mention when it modifies other concepts:
### Examples:
Text:
```
Diabetes patients often experience complications in their cardiovascular health.
```
Output:
```
json:
{"diseases": "Diabetes"}
```

Text:
```
The various forms of epilepsy can lead to distinct seizure types and management strategies.
```
Output:
```
json:
{"diseases": "epilepsy"}
```

Text:
```
Swelling, breast pain, and nipple retraction are included in the list of possible breast cancer symptoms.
```
Output:
```
json:
{"diseases": "breast cancer"}
```
3. Extract All Disease Mentions, even if repeated:
- Ensure that every instance of a disease is extracted, even if it appears multiple times in the text.
### Examples:
Text:
```
Breast cancer is often diagnosed through mammograms, and breast cancer awareness is crucial for early detection.
```
Output:
```
json:
{"diseases": "Breast cancer; breast cancer"}
```

Text:
```
Patients with diabetes must monitor their blood sugar levels, as diabetes can lead to serious complications if not managed properly.
```
Output:
```
json:
{"diseases": "diabetes; diabetes"}
```

Text:
```
The treatment options for lung cancer vary, and lung cancer research continues to advance our understanding of the disease.
```
Output:
```
json:
{"diseases": "lung cancer; lung cancer"}
```
4. Extract the Full, Specific Disease Mention:
### Examples:
Text:
```
Individuals with chronic obstructive pulmonary disease frequently experience shortness of breath and require ongoing management to maintain their quality of life.
```
Output:
```
json:
{"diseases": "chronic obstructive pulmonary disease"}
```

Text:
```
The effects of end-stage renal disease on a patient's health can be profound, necessitating dialysis or kidney transplantation.
```
Output:
```
json:
{"diseases": "end-stage renal disease"}
```

Text:
```
Individuals diagnosed with idiopathic pulmonary fibrosis often face progressive lung decline, making early intervention and specialized care critical.
```
Output:
```
json:
{"diseases": "idiopathic pulmonary fibrosis"}
```
5. Extract Disease Full Names and Abbreviations Separately:
### Examples:
Text:
```
Patients diagnosed with rheumatoid arthritis (RA) often experience chronic joint pain and inflammation that can affect their daily activities.
```
Output:
```
json:
{"diseases": "rheumatoid arthritis; RA"}
```

Text:
```
The effects of human immunodeficiency virus (HIV) on the immune system can lead to serious health complications if left untreated.
```
Output:
```
json:
{"diseases": "human immunodeficiency virus; HIV"}
```

Text:
```
Papillary thyroid carcinoma (PTC) requires a multidisciplinary approach for treatment, including surgery, radioactive iodine therapy, and ongoing monitoring for recurrence.
```
Output:
```
json:
{"diseases": "Papillary thyroid carcinoma; PTC"}
```

Please ensure that all disease entities are extracted following these rules.
"""

In [None]:
# Supplementary rules for the "markup" task, appended to SYSTEM_MESSAGE_MARKUP
# to form SYS_MESSAGE_MARKUP_AG. Five numbered rules, each illustrated with
# three Text / "Text with markup" examples that use the same
# <entity type=disease> tag as the base prompt.
POSITIVE_RULES_MARKUP_MESSAGE = """
Please follow the rules below to tag all disease mentions correctly in the input text.

### Tagging Rules:
1. Tag Multiple Disease Mentions together if cannot be separated:
### Examples:
Text:
```
The conference highlighted recent advancements in treating Alzheimer's and Lewy body dementia to improve patient quality of life.
```
Text with markup:
```
The conference highlighted recent advancements in treating <entity type=disease>Alzheimer's and Lewy body dementia</entity> to improve patient quality of life.
```

Text:
```
The patient was diagnosed with liver, kidney, and pancreatic cancer after a series of tests.
```
Text with markup:
```
The patient was diagnosed with <entity type=disease>liver, kidney, and pancreatic cancer</entity> after a series of tests.
```

Text:
```
The study investigated the prevalence of lupus and rheumatoid arthritis among women of childbearing age.
```
Text with markup:
```
The study investigated the prevalence of <entity type=disease>lupus and rheumatoid arthritis</entity> among women of childbearing age.
```
2. Tag ONLY the Disease Mention when it modifies other concepts:
### Examples:
Text:
```
Diabetes patients often experience complications in their cardiovascular health.
```
Text with markup:
```
<entity type=disease>Diabetes</entity> patients often experience complications in their cardiovascular health.
```

Text:
```
The various forms of epilepsy can lead to distinct seizure types and management strategies.
```
Text with markup:
```
The various forms of <entity type=disease>epilepsy</entity> can lead to distinct seizure types and management strategies.
```

Text:
```
Swelling, breast pain, and nipple retraction are included in the list of possible breast cancer symptoms.
```
Text with markup:
```
Swelling, breast pain, and nipple retraction are included in the list of possible <entity type=disease>breast cancer</entity> symptoms.
```
3. Tag All Disease Mentions, even if repeated:
- Ensure every instance of the disease in the text is tagged, even if it appears multiple times.
### Examples:
Text:
```
Breast cancer is often diagnosed through mammograms, and breast cancer awareness is crucial for early detection.
```
Text with markup:
```
<entity type=disease>Breast cancer</entity> is often diagnosed through mammograms, and <entity type=disease>breast cancer</entity> awareness is crucial for early detection.
```

Text:
```
Patients with diabetes must monitor their blood sugar levels, as diabetes can lead to serious complications if not managed properly.
```
Text with markup:
```
Patients with <entity type=disease>diabetes</entity> must monitor their blood sugar levels, as <entity type=disease>diabetes</entity> can lead to serious complications if not managed properly.
```

Text:
```
The treatment options for lung cancer vary, and lung cancer research continues to advance our understanding of the disease.
```
Text with markup:
```
The treatment options for <entity type=disease>lung cancer</entity> vary, and <entity type=disease>lung cancer</entity> research continues to advance our understanding of the disease.
```
4. Tag the Full, Specific Disease Mention:
### Examples:
Text:
```
Individuals with chronic obstructive pulmonary disease frequently experience shortness of breath and require ongoing management to maintain their quality of life.
```
Text with markup:
```
Individuals with <entity type=disease>chronic obstructive pulmonary disease</entity> frequently experience shortness of breath and require ongoing management to maintain their quality of life.
```

Text:
```
The effects of end-stage renal disease on a patient's health can be profound, necessitating dialysis or kidney transplantation.
```
Text with markup:
```
The effects of <entity type=disease>end-stage renal disease</entity> on a patient's health can be profound, necessitating dialysis or kidney transplantation.
```

Text:
```
Individuals diagnosed with idiopathic pulmonary fibrosis often face progressive lung decline, making early intervention and specialized care critical.
```
Text with markup:
```
Individuals diagnosed with <entity type=disease>idiopathic pulmonary fibrosis</entity> often face progressive lung decline, making early intervention and specialized care critical.
```
5. Separate Tags for Disease Full Name and its Abbreviation:
### Examples:
Text:
```
Patients diagnosed with rheumatoid arthritis (RA) often experience chronic joint pain and inflammation that can affect their daily activities.
```
Text with markup:
```
Patients diagnosed with <entity type=disease>rheumatoid arthritis</entity> (<entity type=disease>RA</entity>) often experience chronic joint pain and inflammation that can affect their daily activities.
```

Text:
```
The effects of human immunodeficiency virus (HIV) on the immune system can lead to serious health complications if left untreated.
```
Text with markup:
```
The effects of <entity type=disease>human immunodeficiency virus</entity> (<entity type=disease>HIV</entity>) on the immune system can lead to serious health complications if left untreated.
```

Text:
```
Papillary thyroid carcinoma (PTC) requires a multidisciplinary approach for treatment, including surgery, radioactive iodine therapy, and ongoing monitoring for recurrence.
```
Text with markup:
```
<entity type=disease>Papillary thyroid carcinoma</entity> (<entity type=disease>PTC</entity>) requires a multidisciplinary approach for treatment, including surgery, radioactive iodine therapy, and ongoing monitoring for recurrence.
```

Please ensure that all disease entities adhere to these rules.
"""

In [None]:
# System prompts for the rule-augmented runs: the base instruction followed
# directly by the matching positive-rules text.
SYS_MESSAGE_EXTRACT_AG = "".join((SYSTEM_MESSAGE_EXTRACT, POSITIVE_RULES_EXTRACT_MESSAGE))
SYS_MESSAGE_MARKUP_AG = "".join((SYSTEM_MESSAGE_MARKUP, POSITIVE_RULES_MARKUP_MESSAGE))

In [None]:
# Markup task, base prompt only (no extra rules): one request per sentence,
# replies collected per abstract under "response_markup_phi3_small".
for pid in tqdm(list(parsed_data)):
    record = parsed_data[pid]
    record["response_markup_phi3_small"] = []
    for sentence in record["sentences"]:
        chat = [
            {"role": "system", "content": SYSTEM_MESSAGE_MARKUP},
            {"role": "user", "content": f"\nText:\n```{sentence}```"},
        ]
        response_markup = client.chat.completions.create(
            model=MODEL_NAME, temperature=0, n=1, messages=chat
        )
        # Keep only the text of the single returned choice.
        record["response_markup_phi3_small"].append(
            response_markup.choices[0].message.content
        )

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Markup task with the rule-augmented system prompt: one request per sentence,
# replies collected per abstract under "response_markup_rules_phi3_small".
for pid in tqdm(list(parsed_data)):
    record = parsed_data[pid]
    record["response_markup_rules_phi3_small"] = []
    for sentence in record["sentences"]:
        chat = [
            {"role": "system", "content": SYS_MESSAGE_MARKUP_AG},
            {"role": "user", "content": f"\nText:\n```{sentence}```"},
        ]
        response_markup_rules = client.chat.completions.create(
            model=MODEL_NAME, temperature=0, n=1, messages=chat
        )
        # Keep only the text of the single returned choice.
        record["response_markup_rules_phi3_small"].append(
            response_markup_rules.choices[0].message.content
        )

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Extract task, base prompt only (no extra rules): one request per sentence,
# replies collected per abstract under "response_extract_phi3_small".
for pid in tqdm(list(parsed_data)):
    record = parsed_data[pid]
    record["response_extract_phi3_small"] = []
    for sentence in record["sentences"]:
        chat = [
            {"role": "system", "content": SYSTEM_MESSAGE_EXTRACT},
            {"role": "user", "content": f"\nText:\n```{sentence}```"},
        ]
        response_extract = client.chat.completions.create(
            model=MODEL_NAME, temperature=0, n=1, messages=chat
        )
        # Keep only the text of the single returned choice.
        record["response_extract_phi3_small"].append(
            response_extract.choices[0].message.content
        )

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Extract task with the rule-augmented system prompt: one request per sentence,
# replies collected per abstract under "response_extract_rules_phi3_small".
for pid in tqdm(list(parsed_data)):
    record = parsed_data[pid]
    record["response_extract_rules_phi3_small"] = []
    for sentence in record["sentences"]:
        chat = [
            {"role": "system", "content": SYS_MESSAGE_EXTRACT_AG},
            {"role": "user", "content": f"\nText:\n```{sentence}```"},
        ]
        response_extract_rules = client.chat.completions.create(
            model=MODEL_NAME, temperature=0, n=1, messages=chat
        )
        # Keep only the text of the single returned choice.
        record["response_extract_rules_phi3_small"].append(
            response_extract_rules.choices[0].message.content
        )

  0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Write the whole parsed_data mapping (inputs plus the collected model
# responses) to a JSON file with 4-space indentation.
import json

with open('phi3_small_results_rules_3_shot.json', 'w') as results_file:
    json.dump(obj=parsed_data, fp=results_file, indent=4)