In [1]:
!pip install transformers
!pip install datasets
!pip install nervaluate
!pip install accelerate -U

Collecting accelerate
  Obtaining dependency information for accelerate from https://files.pythonhosted.org/packages/d9/92/2d3aecf9f4a192968035880be3e2fc8b48d541c7128f7c936f430d6f96da/accelerate-0.23.0-py3-none-any.whl.metadata
  Downloading accelerate-0.23.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
   ---------------------------------------- 0.0/258.1 kB ? eta -:--:--
   ------ -------------------------------- 41.0/258.1 kB 991.0 kB/s eta 0:00:01
   -------------- ------------------------- 92.2/258.1 kB 1.1 MB/s eta 0:00:01
   ------------------------- -------------- 163.8/258.1 kB 1.2 MB/s eta 0:00:01
   ---------------------------------- ----- 225.3/258.1 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 258.1/258.1 kB 1.2 MB/s eta 0:00:00
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.22.0
    Uninstalling accelerate-0.22.0:
      Successful

In [2]:
from datasets import load_dataset, ClassLabel, Sequence
custom_headers = ["Id", "tags", "text"] 
dataset = load_dataset('csv', data_files={'train': ['medical_ner.tsv']}, delimiter="\t",column_names=custom_headers)


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
lebel_set = ['B-age', 'I-age', 'B-allergy_name', 'I-allergy_name', 'B-bmi', 'I-bmi', 'B-cancer', 'I-cancer', 'B-chronic_disease', 'I-chronic_disease', 'B-clinical_variable', 'I-clinical_variable', 'B-contraception_consent', 'I-contraception_consent', 'B-ethnicity', 'I-ethnicity', 'B-gender', 'I-gender', 'B-language_fluency', 'I-language_fluency', 'B-lower_bound', 'I-lower_bound', 'B-pregnancy', 'I-pregnancy', 'B-technology_access', 'I-technology_access', 'B-treatment', 'I-treatment', 'B-upper_bound', 'I-upper_bound', 'O']
label2id={label: i for i,label in enumerate(lebel_set)}
label2id

{'B-age': 0,
 'I-age': 1,
 'B-allergy_name': 2,
 'I-allergy_name': 3,
 'B-bmi': 4,
 'I-bmi': 5,
 'B-cancer': 6,
 'I-cancer': 7,
 'B-chronic_disease': 8,
 'I-chronic_disease': 9,
 'B-clinical_variable': 10,
 'I-clinical_variable': 11,
 'B-contraception_consent': 12,
 'I-contraception_consent': 13,
 'B-ethnicity': 14,
 'I-ethnicity': 15,
 'B-gender': 16,
 'I-gender': 17,
 'B-language_fluency': 18,
 'I-language_fluency': 19,
 'B-lower_bound': 20,
 'I-lower_bound': 21,
 'B-pregnancy': 22,
 'I-pregnancy': 23,
 'B-technology_access': 24,
 'I-technology_access': 25,
 'B-treatment': 26,
 'I-treatment': 27,
 'B-upper_bound': 28,
 'I-upper_bound': 29,
 'O': 30}

In [4]:
dataset['train'].num_rows

49903

In [6]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")


Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Id', 'tags', 'text'],
        num_rows: 49903
    })
})

In [8]:
from tokenizers.pre_tokenizers import Whitespace
pre_tokenizer = Whitespace()
pre_tokenizer.pre_tokenize_str("Hello! How# are$ you? I'm% &fine, thank\" you.")


[('Hello', (0, 5)),
 ('!', (5, 6)),
 ('How', (7, 10)),
 ('#', (10, 11)),
 ('are', (12, 15)),
 ('$', (15, 16)),
 ('you', (17, 20)),
 ('?', (20, 21)),
 ('I', (22, 23)),
 ("'", (23, 24)),
 ('m', (24, 25)),
 ('%', (25, 26)),
 ('&', (27, 28)),
 ('fine', (28, 32)),
 (',', (32, 33)),
 ('thank', (34, 39)),
 ('"', (39, 40)),
 ('you', (41, 44)),
 ('.', (44, 45))]

In [9]:
from tqdm import tqdm
def get_ner(position, tags):
  all_ner_positions = list(tags.keys())
  prefix='N'
  for ner_position in all_ner_positions:
    if position[0] >= ner_position[0] and position[1] <= ner_position[1]:
      if position[0] == ner_position[0]:
        prefix = 'B-'
      else:
        prefix = 'I-'
      return  prefix+tags[ner_position]
  return 'O'

all_tokens = []
all_ner_tags = []
for idx in tqdm(range(dataset['train'].num_rows)):
  # print(dataset['train'][idx]['text'])
  position_pairs = []
  tags = {}
  tokens = []
  ner_tags = []
  for position_and_tag in dataset['train'][idx]['tags'].split(','):
    start_position, end_position, tag = int(position_and_tag.split(':')[0]), int(position_and_tag.split(':')[1]), position_and_tag.split(':')[2]
    tags[(start_position-1, end_position-1)] = tag
    position_pairs.append((start_position-1, end_position-1))
    tagged_text = dataset['train'][idx]['text'][start_position-1:end_position-1]
    # print(start_position, end_position, tag, tagged_text, dataset['train'][idx]['text'][start_position-1:end_position-1])
  # print(tags)
  # print(position_pairs)
  pre_tokenized_text = pre_tokenizer.pre_tokenize_str(dataset['train'][idx]['text'])
  for (word, position) in pre_tokenized_text:
    label = get_ner(position, tags)
    tokens.append(word)
    ner_tags.append(label2id[label])
  all_tokens.append(tokens)
  all_ner_tags.append(ner_tags)

print(len(all_tokens), len(all_ner_tags), all_tokens[100], all_ner_tags[100])


  0%|          | 0/49903 [00:00<?, ?it/s][A
  0%|          | 130/49903 [00:00<00:42, 1169.06it/s][A
  1%|          | 284/49903 [00:00<00:38, 1297.66it/s][A
  1%|          | 414/49903 [00:00<00:39, 1260.58it/s][A
  1%|          | 540/49903 [00:00<00:39, 1253.65it/s][A
  1%|▏         | 671/49903 [00:00<00:39, 1232.74it/s][A
  2%|▏         | 800/49903 [00:00<00:39, 1238.43it/s][A
  2%|▏         | 924/49903 [00:00<00:40, 1209.65it/s][A
  2%|▏         | 1059/49903 [00:00<00:40, 1211.88it/s][A
  2%|▏         | 1181/49903 [00:00<00:40, 1198.05it/s][A
  3%|▎         | 1313/49903 [00:01<00:39, 1233.33it/s][A
  3%|▎         | 1437/49903 [00:01<00:39, 1217.91it/s][A
  3%|▎         | 1566/49903 [00:01<00:39, 1223.73it/s][A
  3%|▎         | 1690/49903 [00:01<00:39, 1226.33it/s][A
  4%|▎         | 1813/49903 [00:01<00:40, 1197.37it/s][A
  4%|▍         | 1967/49903 [00:01<00:37, 1272.27it/s][A
  4%|▍         | 2095/49903 [00:01<00:37, 1266.94it/s][A
  4%|▍         | 2222/49903 [00:0

 76%|███████▌  | 37861/49903 [00:30<00:13, 906.56it/s][A
 76%|███████▌  | 37955/49903 [00:30<00:13, 915.05it/s][A
 76%|███████▌  | 38048/49903 [00:30<00:13, 872.42it/s][A
 76%|███████▋  | 38138/49903 [00:30<00:13, 878.24it/s][A
 77%|███████▋  | 38227/49903 [00:30<00:13, 865.44it/s][A
 77%|███████▋  | 38316/49903 [00:30<00:13, 859.18it/s][A
 77%|███████▋  | 38403/49903 [00:30<00:14, 805.57it/s][A
 77%|███████▋  | 38509/49903 [00:30<00:13, 856.58it/s][A
 77%|███████▋  | 38600/49903 [00:31<00:13, 868.56it/s][A
 78%|███████▊  | 38719/49903 [00:31<00:11, 958.99it/s][A
 78%|███████▊  | 38833/49903 [00:31<00:10, 1010.16it/s][A
 78%|███████▊  | 38949/49903 [00:31<00:10, 1012.16it/s][A
 78%|███████▊  | 39086/49903 [00:31<00:09, 1113.59it/s][A
 79%|███████▊  | 39212/49903 [00:31<00:09, 1148.65it/s][A
 79%|███████▉  | 39328/49903 [00:31<00:09, 1127.88it/s][A
 79%|███████▉  | 39452/49903 [00:31<00:09, 1159.94it/s][A
 79%|███████▉  | 39572/49903 [00:31<00:08, 1169.34it/s][A
 80%|██

49903 49903 ['Chronic', 'hepatobiliary', 'disease', ',', 'conservatively', 'defined', 'as', 'liver', 'function', 'tests', '(', 'AST', ',', 'ALT', ',', 'alkaline', 'phosphatase', ',', 'Total', 'Bilirubin', ')', '>', '1', '.', '5', 'times', 'the', 'upper', 'limit', 'of', 'normal'] [8, 9, 9, 30, 30, 30, 30, 30, 30, 30, 30, 10, 30, 10, 30, 10, 11, 30, 10, 11, 30, 30, 20, 21, 21, 21, 21, 21, 21, 21, 21]





In [10]:
list(tags.keys())

[(4, 12)]

In [11]:
from datasets import Dataset
my_dict = {"tokens": all_tokens, "ner_tags": all_ner_tags}
dataset = Dataset.from_dict(my_dict)


In [12]:
inputs = tokenizer(dataset[10]["tokens"], is_split_into_words=True)
inputs.tokens()


['[CLS]',
 'end',
 '-',
 'stage',
 'organ',
 'disease',
 'or',
 'medical',
 'condition',
 'with',
 'subsequent',
 'vision',
 'loss',
 '(',
 'e',
 '.',
 'g',
 '.',
 ',',
 'diabetes',
 ',',
 'stroke',
 ')',
 '[SEP]']

In [13]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

inputs = tokenizer(dataset[0]["tokens"], is_split_into_words=True)
inputs.tokens()

labels = dataset[0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))


[30, 30, 30, 30, 30, 8]
[-100, 30, 30, 30, 30, 30, 8, -100]


In [14]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [15]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names,
)

Map:   0%|          | 0/49903 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [17]:
!pip install seqeval
!pip install evaluate



In [18]:
import evaluate

metric = evaluate.load("seqeval")


In [19]:
import numpy as np
from nervaluate import collect_named_entities
from nervaluate import compute_metrics as ner_compute_metrics
from nervaluate import Evaluator

label_names = lebel_set
def compute_metrics(eval_preds):

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)

    true_labels_1 = [[l for l in label] for label in true_labels]
    true_predictions_1 = [[l for l in label] for label in true_predictions]
    evaluator = Evaluator(true_labels_1, true_predictions_1, tags=[l[2:] for l in label_names if l !='O'], loader="list")
    results, results_by_tag = evaluator.evaluate()

    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
        "results": results,
        #"results_by_tag": results_by_tag,
    }


In [20]:
id2label = {i: label for i, label in enumerate(lebel_set)}
label2id = {v: k for k, v in id2label.items()}

In [21]:
from transformers import TrainingArguments

args = TrainingArguments(
    "clinical-bert-finetuned-ner",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

In [22]:
tokenized_datasets= tokenized_datasets.train_test_split(test_size=0.20)

In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 39922
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9981
    })
})

In [24]:
from transformers import AutoModelForTokenClassification

model_checkpoint = "medicalai/ClinicalBERT"
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 23427a9f-ee6b-484d-9a8e-4e1f98ab6f64)')' thrown while requesting HEAD https://huggingface.co/medicalai/ClinicalBERT/resolve/main/config.json
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at medicalai/ClinicalBERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
tokenized_datasets_1000 = tokenized_datasets.filter(lambda example, indice: indice <= 1000, with_indices=True)


Filter:   0%|          | 0/39922 [00:00<?, ? examples/s]

Filter:   0%|          | 0/9981 [00:00<?, ? examples/s]

In [26]:
tokenized_datasets_1000

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1001
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1001
    })
})

In [28]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy,Results
1,0.4491,0.369593,0.7749,0.820641,0.797115,0.876897,"{'ent_type': {'correct': 59802, 'incorrect': 5145, 'partial': 0, 'missed': 4205, 'spurious': 8287, 'possible': 69152, 'actual': 73234, 'precision': 0.8165879236420241, 'recall': 0.8647906062008329, 'f1': 0.8399983144410265}, 'partial': {'correct': 60825, 'incorrect': 0, 'partial': 4122, 'missed': 4205, 'spurious': 8287, 'possible': 69152, 'actual': 73234, 'precision': 0.8586995111560204, 'recall': 0.9093880148079593, 'f1': 0.8833171800598373}, 'strict': {'correct': 56749, 'incorrect': 8198, 'partial': 0, 'missed': 4205, 'spurious': 8287, 'possible': 69152, 'actual': 73234, 'precision': 0.7748996367807303, 'recall': 0.8206414854234151, 'f1': 0.7971148848903684}, 'exact': {'correct': 60825, 'incorrect': 4122, 'partial': 0, 'missed': 4205, 'spurious': 8287, 'possible': 69152, 'actual': 73234, 'precision': 0.8305568451812, 'recall': 0.8795841045812124, 'f1': 0.854367704690068}}"
2,0.349,0.338022,0.799501,0.824936,0.81202,0.885756,"{'ent_type': {'correct': 59842, 'incorrect': 4745, 'partial': 0, 'missed': 4565, 'spurious': 6765, 'possible': 69152, 'actual': 71352, 'precision': 0.8386870725417648, 'recall': 0.8653690421101342, 'f1': 0.8518191652906679}, 'partial': {'correct': 60821, 'incorrect': 0, 'partial': 3766, 'missed': 4565, 'spurious': 6765, 'possible': 69152, 'actual': 71352, 'precision': 0.8787980715326831, 'recall': 0.9067561314206386, 'f1': 0.8925582189830895}, 'strict': {'correct': 57046, 'incorrect': 7541, 'partial': 0, 'missed': 4565, 'spurious': 6765, 'possible': 69152, 'actual': 71352, 'precision': 0.7995010651418321, 'recall': 0.8249363720499768, 'f1': 0.8120195866309856}, 'exact': {'correct': 60821, 'incorrect': 3766, 'partial': 0, 'missed': 4565, 'spurious': 6765, 'possible': 69152, 'actual': 71352, 'precision': 0.8524077811413836, 'recall': 0.8795262609902823, 'f1': 0.8657547116096339}}"
3,0.2997,0.332872,0.802412,0.830215,0.816077,0.887881,"{'ent_type': {'correct': 60114, 'incorrect': 4741, 'partial': 0, 'missed': 4297, 'spurious': 6693, 'possible': 69152, 'actual': 71548, 'precision': 0.8401912003130765, 'recall': 0.8693024062933827, 'f1': 0.854498933901919}, 'partial': {'correct': 61198, 'incorrect': 0, 'partial': 3657, 'missed': 4297, 'spurious': 6693, 'possible': 69152, 'actual': 71548, 'precision': 0.8808981383127411, 'recall': 0.9114197709393799, 'f1': 0.8958990760483299}, 'strict': {'correct': 57411, 'incorrect': 7444, 'partial': 0, 'missed': 4297, 'spurious': 6693, 'possible': 69152, 'actual': 71548, 'precision': 0.8024123665231733, 'recall': 0.8302145997223508, 'f1': 0.8160767590618336}, 'exact': {'correct': 61198, 'incorrect': 3657, 'partial': 0, 'missed': 4297, 'spurious': 6693, 'possible': 69152, 'actual': 71548, 'precision': 0.8553418683960419, 'recall': 0.8849780194354465, 'f1': 0.869907604832978}}"


TrainOutput(global_step=3744, training_loss=0.40842092954195464, metrics={'train_runtime': 79314.1469, 'train_samples_per_second': 1.51, 'train_steps_per_second': 0.047, 'total_flos': 2621426290776948.0, 'train_loss': 0.40842092954195464, 'epoch': 3.0})

In [3]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re

# Load the CSV dataset
data = pd.read_json('combined.jsonl'  , lines=True)['text'].values
print(data)
# Specify the number of rows to process (e.g., 500)
num_rows_to_process = 325

# Filter the dataset to select the first 'num_rows_to_process' rows
#data = data.head(num_rows_to_process)

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("clinical-bert-finetuned-ner/checkpoint-1248")
model = AutoModelForTokenClassification.from_pretrained("clinical-bert-finetuned-ner/checkpoint-1248")

# Function to extract token indices
def extract_token_indices(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Get token logits
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Predict token labels
    predictions = torch.argmax(logits, dim=2)
    
    # Convert token ids to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    
    # Combine tokens and labels
    combined_tokens = []
    current_token = ""
    current_label = ""
    
    for token, label in zip(tokens, predicted_token_class):
        if token.startswith("##"):
            current_token += token[2:]  # Append '##' tokens to the previous token
        else:
            if current_token:
                combined_tokens.append((current_token, current_label))
            current_token = token
            current_label = label
    
    if current_token:
        combined_tokens.append((current_token, current_label))
    tokens_to_find = [token for token, label in combined_tokens if label != 'O']
    cleaned_sentence = re.sub(r'[^\w\s]', '', text.lower())
    
    # Find token indices
    token_indices = []
    start_index = 0
    
    current_word = ""
    
    for token in tokens_to_find:
#         print("Cleaned_sentence", cleaned_sentence)
        start = text.lower().find(token)
        end = start + len(token)
        label = None
        for t, l in combined_tokens:
            if t == token:
                label = l
                break
        token_indices.append({
            "label": label,
            "start_index": start,
            "end_index": end,
            "token": token
        })
        start_index = end + 1
    
    return token_indices

# Process each row in the dataset
result_list = []

for index, row in data.iterrows():
    text = row['cleaned_sent']  # Replace 'cleaned_sent' with the actual column name in your CSV
    token_indices = extract_token_indices(text)
    result_list.append(token_indices)
print(result_list)
# Print the results
# for result in result_list:
#     print(result)


['NCT00351611 Insufficient response to pregabalin in the treatment of partial seizure'
 'NCT00862446 Anticipated TPN treatment for at least one month'
 'NCT00862446 TPN cholestasis of at least 2.5 mg/dl'
 'NCT01175044 Patients scheduled to undergo revision total knee arthroplasty for infectious reasons'
 'NCT01175044 Any condition requiring antibiotics 14 days prior to arriving for surgery'
 'NCT01446094 Severe lung disease (active wheezing) Severe bradycardia (heart rate < 40 beats/min) Second- or third-degree atrioventricular heart block Sick sinus syndrome History of Long QT syndrome Severe hypotension (systolic BP < 80 mm Hg) Decompensated heart failure'
 'NCT01446094 Inability to lie flat for 20-30 minutes (the anticipated amount of time to complete the MRI procedure)'
 'NCT01446094 Stage 4 or 5 chronic kidney disease (eGFR < 30 ml/min/1.73 m2) Known allergy to GBCA'
 'NCT01581749 history of an invasive malignancy (other than this prostate cancer,or basal or squamous skin cancers)

AttributeError: 'numpy.ndarray' object has no attribute 'iterrows'

In [4]:
import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification
import re

# Load the CSV dataset
data = pd.read_json('combined.jsonl', lines=True)['text'].values

# Convert the NumPy array to a Pandas DataFrame
df = pd.DataFrame(data, columns=['text'])

# Specify the number of rows to process (e.g., 500)
num_rows_to_process = 325

# Filter the dataset to select the first 'num_rows_to_process' rows
df = df.head(num_rows_to_process)

# Initialize the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("clinical-bert-finetuned-ner/checkpoint-1248")
model = AutoModelForTokenClassification.from_pretrained("clinical-bert-finetuned-ner/checkpoint-1248")

# Function to extract token indices
def extract_token_indices(text):
    # Tokenize the text
    inputs = tokenizer(text, return_tensors="pt")
    
    # Get token logits
    with torch.no_grad():
        logits = model(**inputs).logits
    
    # Predict token labels
    predictions = torch.argmax(logits, dim=2)
    
    # Convert token ids to tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    
    # Combine tokens and labels
    combined_tokens = []
    current_token = ""
    current_label = ""
    
    for token, label in zip(tokens, predicted_token_class):
        if token.startswith("##"):
            current_token += token[2:]  # Append '##' tokens to the previous token
        else:
            if current_token:
                combined_tokens.append((current_token, current_label))
            current_token = token
            current_label = label
    
    if current_token:
        combined_tokens.append((current_token, current_label))
    tokens_to_find = [token for token, label in combined_tokens if label != 'O']
    cleaned_sentence = re.sub(r'[^\w\s]', '', text.lower())
    
    # Find token indices
    token_indices = []
    start_index = 0
    
    current_word = ""
    
    for token in tokens_to_find:
#         print("Cleaned_sentence", cleaned_sentence)
        start = text.lower().find(token)
        end = start + len(token)
        label = None
        for t, l in combined_tokens:
            if t == token:
                label = l
                break
        token_indices.append({
            "label": label,
            "start_index": start,
            "end_index": end,
            "token": token
        })
        start_index = end + 1
    
    return token_indices

# Process each row in the dataset
result_list = []

for index, row in df.iterrows():
    text = row['text']  # Replace 'text' with the actual column name in your DataFrame
    token_indices = extract_token_indices(text)
    result_list.append(token_indices)

# Print the results
for result in result_list:
    print(result)


[{'label': 'B-treatment', 'start_index': 0, 'end_index': 11, 'token': 'nct00351611'}, {'label': 'B-treatment', 'start_index': 37, 'end_index': 47, 'token': 'pregabalin'}, {'label': 'B-treatment', 'start_index': 55, 'end_index': 64, 'token': 'treatment'}, {'label': 'I-chronic_disease', 'start_index': 76, 'end_index': 83, 'token': 'seizure'}]
[{'label': 'B-treatment', 'start_index': 0, 'end_index': 11, 'token': 'nct00862446'}, {'label': 'B-treatment', 'start_index': 24, 'end_index': 27, 'token': 'tpn'}, {'label': 'I-treatment', 'start_index': 28, 'end_index': 37, 'token': 'treatment'}, {'label': 'B-lower_bound', 'start_index': 51, 'end_index': 54, 'token': 'one'}, {'label': 'I-lower_bound', 'start_index': 55, 'end_index': 60, 'token': 'month'}]
[{'label': 'B-treatment', 'start_index': 0, 'end_index': 11, 'token': 'nct00862446'}, {'label': 'B-clinical_variable', 'start_index': 12, 'end_index': 15, 'token': 'tpn'}, {'label': 'I-clinical_variable', 'start_index': 16, 'end_index': 27, 'token

In [5]:
filtered_clinical_result = []

for result in result_list:
    clinical_result = []
    for sub in result:
        if sub['start_index'] >= 0 and sub['label'] != 'O':
            print(sub['label'])
            label = sub['label'].split('-')[1]  # Remove 'B-' and 'I-' prefixes
            clinical_result.append({
                'label': label,
                'start_index': sub['start_index'],
                'end_index': sub['end_index'],
                'token': sub['token']
            })
    filtered_clinical_result.append(clinical_result)

print(filtered_clinical_result)


B-treatment
B-treatment
B-treatment
I-chronic_disease
B-treatment
B-treatment
I-treatment
B-lower_bound
I-lower_bound
B-treatment
B-clinical_variable
I-clinical_variable
B-lower_bound
I-lower_bound
I-lower_bound
I-lower_bound
I-lower_bound
I-lower_bound
B-treatment
B-treatment
B-treatment
I-treatment
I-treatment
B-treatment
B-treatment
B-upper_bound
I-upper_bound
I-upper_bound
B-treatment
B-treatment
B-chronic_disease
B-chronic_disease
I-chronic_disease
I-chronic_disease
B-chronic_disease
B-clinical_variable
I-clinical_variable
B-upper_bound
I-upper_bound
I-upper_bound
I-upper_bound
I-chronic_disease
I-chronic_disease
B-clinical_variable
I-chronic_disease
B-chronic_disease
I-chronic_disease
I-chronic_disease
B-chronic_disease
I-chronic_disease
I-chronic_disease
I-chronic_disease
B-clinical_variable
I-clinical_variable
B-upper_bound
I-upper_bound
I-upper_bound
B-clinical_variable
I-chronic_disease
B-treatment
B-lower_bound
B-upper_bound
I-upper_bound
B-treatment
B-treatment
B-chronic_di

In [6]:
filtered_clinical_result

[[{'label': 'treatment',
   'start_index': 0,
   'end_index': 11,
   'token': 'nct00351611'},
  {'label': 'treatment',
   'start_index': 37,
   'end_index': 47,
   'token': 'pregabalin'},
  {'label': 'treatment',
   'start_index': 55,
   'end_index': 64,
   'token': 'treatment'},
  {'label': 'chronic_disease',
   'start_index': 76,
   'end_index': 83,
   'token': 'seizure'}],
 [{'label': 'treatment',
   'start_index': 0,
   'end_index': 11,
   'token': 'nct00862446'},
  {'label': 'treatment', 'start_index': 24, 'end_index': 27, 'token': 'tpn'},
  {'label': 'treatment',
   'start_index': 28,
   'end_index': 37,
   'token': 'treatment'},
  {'label': 'lower_bound', 'start_index': 51, 'end_index': 54, 'token': 'one'},
  {'label': 'lower_bound',
   'start_index': 55,
   'end_index': 60,
   'token': 'month'}],
 [{'label': 'treatment',
   'start_index': 0,
   'end_index': 11,
   'token': 'nct00862446'},
  {'label': 'clinical_variable',
   'start_index': 12,
   'end_index': 15,
   'token': 'tp

In [7]:
import json
json_file_path = "ClinicalBert_output.json"
with open(json_file_path, "w") as json_file:
    json.dump(filtered_clinical_result, json_file)

In [8]:
import os
import pandas as pd
import openai
import time
import warnings
from copy import deepcopy
warnings.filterwarnings("ignore")
from typing import List, Tuple, Dict, Optional, Union, Sequence
from nervaluate import Evaluator
openai.api_key = "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"

In [9]:
df = pd.read_csv("All_purpose_test_data.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)
df.head(500)


Unnamed: 0,cleaned_sent,actaul_index,Formatted
0,Subject is suitable for oral administration of...,"[{'label': 'treatment', 'start': 24, 'end': 57...",- Subject is suitable for [oral administration...
1,Current pregnancy,"[{'label': 'pregnancy', 'start': 0, 'end': 17,...","- [Current pregnancy]{""entity"":""pregnancy""}"
2,Singleton pregnancy,"[{'label': 'pregnancy', 'start': 10, 'end': 19...","- Singleton [pregnancy]{""entity"":""pregnancy""}"
3,All patients currently taking steroids at the ...,"[{'label': 'treatment', 'start': 30, 'end': 38...","- All patients currently taking [steroids]{""en..."
4,rheumatic valve disease since this produces a ...,"[{'label': 'chronic_disease', 'start': 0, 'end...","- [rheumatic valve disease]{""entity"":""chronic_..."
...,...,...,...
495,Men must agree to use adequate contraception p...,"[{'label': 'gender', 'start': 0, 'end': 3, 'va...","- [Men]{""entity"":""gender""} must [agree to use ..."
496,Serum creatinine 1.5 x ULN,"[{'label': 'clinical_variable', 'start': 0, 'e...","- [Serum creatinine]{""entity"":""clinical_variab..."
497,History of seizure disorder,"[{'label': 'chronic_disease', 'start': 11, 'en...","- History of [seizure disorder]{""entity"":""chro..."
498,Participants who have received any other inves...,"[{'label': 'treatment', 'start': 41, 'end': 63...",- Participants who have received any other [in...


In [10]:
df_test = df.iloc[:500]
sentences_to_predict = list(df_test.cleaned_sent.values)
sentences_to_predict

['Subject is suitable for oral administration of study drug',
 'Current pregnancy',
 'Singleton pregnancy',
 'All patients currently taking steroids at the time of surgery or during the six-week recovery period as well as patients with betamethasone hypersensitivity',
 'rheumatic valve disease since this produces a unique AF phenotype',
 'Severe liver dysfunction LFT 3X upper limit of normal',
 'Serum M-protein >= 1.0 g/dL',
 'Males or females',
 'Negative for malignancy for past 5 years',
 'available for up to 6 hrs every day for two 2-week test periods with an intervening break of 7- 10 days each',
 'Diagnosis of recurrent and/or metastatic thyroid cancer',
 'No other investigational or commercial therapeutic agents may be given concurrently with the paclitaxel',
 'Mean arterial pressure < 65',
 'White blood cells WBC > 15,000 cells/mcL at screening',
 'Acute medical illness',
 'At least one prior chemotherapy',
 'History of prior thromboembolism with known thrombophilia',
 'Inabilit

In [4]:
input_cost_per_1k_tokens = 0.0015
output_cost_per_1k_tokens = 0.002
total_cost =  0
doller_to_rs = 83.19


def calculate_gpt_cost(result:Dict, input_cost_per_1k_tokens:float, output_cost_per_1k_tokens:float)->float:
    
    """
    Args:
    result: Response json from the gpt output
    input_cost_per_1k_tokens: The model usage cost for input tokens as mentioned by openai for model being used
    output_cost_per_1k_tokens: The model usage cost for output tokens as mentioned by openai for model being used
    
    Returns
    Total cost for GPT model in the Session
    """
    
    prompt_tokens = result["usage"]['prompt_tokens']
    completion_tokens = result["usage"]['completion_tokens']

    total_input_cost = (prompt_tokens / 1000) * input_cost_per_1k_tokens
    total_output_cost = (completion_tokens / 1000) * output_cost_per_1k_tokens
    total_cost = total_input_cost + total_output_cost

    return total_cost

In [5]:
instructions = """ Identify and extract entities from the inclusion/exclusion criteria of a clinical trial. Categorize the extracted entities based on the provided tags. If there are any numerical values, categorize them with appropriate lower_bound or upper_bound tags as needed.

Categories and sample entities for each tag:

- technology_access: ["working cellphone or landline", "cell phone", "touch-tone telephone keypad"]
- bmi: ["Body Mass Index", "BMI"]
- chronic_disease: ["metabolic bone disorder", "hypertension", "GI disease", "class IV heart failure"]
- gender: ["Female participants", "Men or women", "male", "Women"]
- contraception_consent: ["Agree to practice effective barrier contraception", "agree to use adequate contraception"]
- age: ["age"]
- Cancer: ["cervical cancer", "acute lymphoblastic leukemia ALL", "lymphoblastic lymphoma LL", "cancer"]
- language_fluency: ["Lack of English fluency", "not English speaking", "must be English speaking"]
- treatment: ["structured barium esophagram", "therapy", "radiotherapy", "investigational agents"]
- ethnicity: ["European ancestry", "Caucasian", "Black", "Latino"]
- clinical_variable: ["fundus autofluorescence", "Gleason score", "Hemoglobin", "eGFR", "Heart rate"]
- pregnancy: ["currently pregnant", "Pregnant", "unable to become pregnant"]
- allergy_name: ["stainless steel", "polyester", "abiraterone acetate"]

- For each sentence, extract and categorize the entities.
- If there are numerical values, use the format lower_bound/upper_bound as appropriate.
  for example, less than gives upper_bound, greater than gives lower_bound  

Examples 1:
Input Sentence: 
[1. "Subject has a history of hospitalization for acute illness in the previous 3 months"
2. "age less than 30"
3. "Auto-immune disease, acute stage (e.g., rheumatoid arthritis)"
4. "Singleton pregnancy"
5. "Current pregnancy"]

Output:
{"senttence_1":[{"label": "treatment","value": "hospitalization"},{"label": "chronic_disease", "value": "acute illness"},{"label": "upper_bound", "value": "previous 3 months"}],
"senttence_2":[{"label": "age", "value": "age"},{"label": "upper_bound", "value": "30"}],
"senttence_3":[{"label": "chronic_disease", "value": "Auto-immune disease"}, {"label": "clinical_variable", "value": "acute stage"}, {"label": "clinical_variable", "value": "rheumatoid arthritis"}],
"senttence_4" :[{"label": "pregnancy", "value": "Singleton pregnancy"}}],
"senttence_5" :[{"label": "pregnancy", "value": "Current pregnancy"}}]
}

it very very important to Always follow below Instruction 
## Give the reult only in the mentioned format that is Dict[str,List[Dict[str,str]]]and do not give the sentences
### Extract for the below-mentioned  5 sentences:

"""

In [6]:
def prompt_maker(instructions:str , sentences_to_predict:str)->str:
    
    """
    Args:
    This function takes the input as instructions to the model and the sentences to predict and returns prompt
    instructions: Common Instructions to the model
    sentences_to_predict: The senetnces/words to be predicted
    
    Returns:
    Final prompt 
    """
    
    prompt = instructions + sentences_to_predict
    
    return prompt

In [7]:
def entity_extractor(prompt:str)-> Sequence[[str, float]]:
    
    """
    Args 
    prompt : prompt containing the insturnctions and sentences from which entities to be extracted
    
    Returns
    result : The extracted ENtities from the sentences
    cost : Cost for the GPT model for the Session
    """
    
    messages = [
        {"role": "system", "content":"You are a helpful assistant who is helping in extracting entites from the sentences"},
        {"role": "user", "content": prompt}
    ] 
    response  = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=messages,
      max_tokens=1000,
      temperature=0
    )
    
    cost = calculate_gpt_cost(response,input_cost_per_1k_tokens, output_cost_per_1k_tokens )
    result = response["choices"][0].message['content']
    
    return result , cost 

In [8]:
batch_size= 5
start_index =  0
end_index = batch_size
total_batches = int(len(sentences_to_predict)/batch_size)
final_result_gpt = []
error_results =[]
for i in range(total_batches):
    print(f"Current Iteration :  {i} ")
    sentences_to_extract = sentences_to_predict[start_index:end_index]
    print(f"Sentences to predict {sentences_to_extract}")
    start_index= end_index
    end_index = end_index+batch_size
    
    sent = []
    num_of_sentences = len(sentences_to_extract)
    for j in range(num_of_sentences):
        sent.append(f'{j+1}. "{sentences_to_extract[j]}"')
        
    extract = "\n".join(sent)
    
    prompt = prompt_maker(instructions, extract)
    result , cost = entity_extractor(prompt)
    total_cost = total_cost + cost
    try:
        result_dict = eval(result)
        if len(result_dict.keys())==batch_size:
            for key in result_dict.keys():
                final_result_gpt.append(result_dict[key])
        else:
            for k in range(batch_size):
                final_result_gpt.append("None")
                
    except Exception as e:
        print(f"Error {e}")
        print(f"The result is {result}")
        error_results.append(result)
        for jk in range(batch_size):
            final_result_gpt.append("None")
            
    print(f"iteration {i} completed and complted for total {len(final_result_gpt)} sentences")
    print("-"*100)

Current Iteration :  0 
Sentences to predict ['Subject is suitable for oral administration of study drug', 'Current pregnancy', 'Singleton pregnancy', 'All patients currently taking steroids at the time of surgery or during the six-week recovery period as well as patients with betamethasone hypersensitivity', 'rheumatic valve disease since this produces a unique AF phenotype']
iteration 0 completed and complted for total 5 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  1 
Sentences to predict ['Severe liver dysfunction LFT 3X upper limit of normal', 'Serum M-protein >= 1.0 g/dL', 'Males or females', 'Negative for malignancy for past 5 years', 'available for up to 6 hrs every day for two 2-week test periods with an intervening break of 7- 10 days each']
iteration 1 completed and complted for total 10 sentences
----------------------------------------------------------------------------------------------

iteration 14 completed and complted for total 75 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  15 
Sentences to predict ['at least 6 times duration of the reported half life or minimum four 4 months for other depot or long-acting antipsychotics', 'Known positive Hepatitis B antigen HBs Ag unless positive test can be attributed to receipt of Hepatitis B vaccination in childhood or Hepatitis C viral antibody HCV with evidence of active hepatitis i.e., AST ALT greater than two times the ULN', 'Sexually active males must use a condom during intercourse', 'Active, uncontrolled infection and/or human immunodeficiency virus HIV positive constitute progressive disease', 'a contraceptive implant']
Error unmatched '}' (<string>, line 2)
The result is {"sentence_1": [{"label": "clinical_variable", "value": "duration"}, {"label": "clinical_variable", "value": "reported half life"}, {"label": "lower_bound", "value

iteration 26 completed and complted for total 135 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  27 
Sentences to predict ['Bilirubin 2 x the ULN', 'History of organ allograft including corneal transplant', 'Autologous stem cell transplantation SCT within 100 days prior to study drug or any prior allogeneic SCT or solid organ transplantation', 'Diagnosis of type 2 diabetes', 'History of an active malignancy within the last 3 years']
iteration 27 completed and complted for total 140 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  28 
Sentences to predict ['All adverse events Grade > 1 related to prior therapies chemotherapy radiotherapy and/or surgery must be resolved, except for alopecia', 'Pre-bronchodilator 50% FEV1 of < 85 % of the predicted normal value for the patient after withholding bronchodilators at both Visit

iteration 37 completed and complted for total 190 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  38 
Sentences to predict ['uncontrolled bacterial viral or fungal infection currently taking medication and with progression or no clinical improvement at time of enrollment', 'Women of childbearing potential must have a negative pregnancy test', 'Patient must consent to a biopsy of a site of disease unless the only site of disease is lung/pleura, bone, or deemed unsafe by the principal investigator', 'No distant metastases', 'Insulin requirement of > 1.0 U/kg/day or, > 60 U/day total or < 15 U/day']
iteration 38 completed and complted for total 195 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  39 
Sentences to predict ['Residual symptoms will be defined as a total score of 110 and 60 of PANSS per Visit 1 evaluations', '10

iteration 51 completed and complted for total 260 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  52 
Sentences to predict ['Bilirubin < 3x AST <3x, Serum creatinine < 2x upper limit of normal Hgb > 8.0 plts > 20', 'have a diagnosis of mTBI based upon Veterans Health Administration VHA /Department of Defense DoD criteria with persisting symptoms > 3 months post injury', 'Must be optimally medicated at the start of the study. This means that there should be no change in mediction type or dosage in 3 months prior to enrolling in the study. The medications should not be causing significant or serious advese effects', 'women who are breast feeding', 'uncontrolled coagulation blood disorders like haemophilia malignant tumors and fear of needles']
iteration 52 completed and complted for total 265 sentences
----------------------------------------------------------------------------------------------------
Cur

iteration 63 completed and complted for total 320 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  64 
Sentences to predict ['Acute leukemia in 1st or 2nd CR', 'Absolute CD4 count > 100 cells/uL', 'Patients with underlying abnormal brain pathology e.g. mass or bleed as the potential cause of the migraine', 'Clinical or radiologic evidence of untreated and/or progressive brain metastases', 'Albumin >= 2.5 g/dL']
iteration 64 completed and complted for total 325 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  65 
Sentences to predict ['History of allergic/hypersensitivity reaction to any substance having required hospitalization and/or treatment with IV steroids epinephrine', 'Uncontrolled angina within 3 months of Screening visit', 'Female participants who are breast feeding must agree to stop breast feeding', 'at least on

Error unmatched '}' (<string>, line 2)
The result is {"sentence_1": [{"label": "treatment", "value": "prostacyclin therapy"}, {"label": "treatment", "value": "Epoprostenol Flolan"}, {"label": "treatment", "value": "Veltri"}, {"label": "treatment", "value": "Iloprost Ventavis"}, {"label": "treatment", "value": "Treprostinil Orenitram oral Remodulin"}, {"label": "treatment", "value": "IV or SC"}]},
"sentence_2": [{"label": "treatment", "value": "chemotherapy"}, {"label": "treatment", "value": "immunotherapy"}, {"label": "treatment", "value": "hematopoietic stem cell transplantation"}]},
"sentence_3": [{"label": "treatment", "value": "Systemic corticosteroids"}, {"label": "clinical_variable", "value": "physiologic doses"}, {"label": "clinical_variable", "value": "not to exceed 10 mg/day of prednisone or equivalent"}]},
"sentence_4": [{"label": "clinical_variable", "value": "autoimmune neurological disease"}, {"label": "clinical_variable", "value": "neurological evaluation"}]},
"sentence_5

iteration 84 completed and complted for total 425 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  85 
Sentences to predict ['Breath alcohol level > 0.01 participants with a positive screen will be allowed to re-screen once', 'Total bilirubin 1.5 x ULN upper limit of normal', 'ALT and AST 2.5 X upper limit of normal ULN', 'Patients must not have an uncontrolled bacterial fungal or viral infection defined as progressive symptoms despite therapy at the time of the CD8+ memory T-cell infusion', 'Commit to using birth control during the study all participants']
iteration 85 completed and complted for total 430 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  86 
Sentences to predict ['Recurrent ovarian cancer', 'Be able to read, write, and speak English', 'Prior whole-lung or hemi-thorax irradiation of greater than 12 Gy recei

iteration 98 completed and complted for total 495 sentences
----------------------------------------------------------------------------------------------------
Current Iteration :  99 
Sentences to predict ['Men must agree to use adequate contraception prior to enrollment, for the duration of study participation and for at least 3 months thereafter', 'Serum creatinine 1.5 x ULN', 'History of seizure disorder', 'Participants who have received any other investigational agents within the 4 weeks prior to enrollment; concurrent radiation therapy', 'ALT or BUN > 2.0 ULN or estimated creatinine > 1.5 X ULN for age or any other laboratory abnormality considered by the Investigator to be clinically significant within 14 days before Screening']
iteration 99 completed and complted for total 500 sentences
----------------------------------------------------------------------------------------------------


In [9]:
final_result_gpt

[[{'label': 'treatment', 'value': 'oral administration'},
  {'label': 'treatment', 'value': 'study drug'}],
 [{'label': 'pregnancy', 'value': 'Current pregnancy'}],
 [{'label': 'pregnancy', 'value': 'Singleton pregnancy'}],
 [{'label': 'clinical_variable', 'value': 'steroids'},
  {'label': 'clinical_variable', 'value': 'surgery'},
  {'label': 'clinical_variable', 'value': 'recovery period'},
  {'label': 'clinical_variable', 'value': 'betamethasone hypersensitivity'}],
 [{'label': 'clinical_variable', 'value': 'rheumatic valve disease'},
  {'label': 'clinical_variable', 'value': 'AF phenotype'}],
 [{'label': 'chronic_disease', 'value': 'Severe liver dysfunction'},
  {'label': 'clinical_variable', 'value': 'LFT'},
  {'label': 'upper_bound', 'value': '3X upper limit of normal'}],
 [{'label': 'clinical_variable', 'value': 'Serum M-protein'},
  {'label': 'lower_bound', 'value': '1.0 g/dL'}],
 [{'label': 'gender', 'value': 'Males or females'}],
 [{'label': 'Cancer', 'value': 'Negative for ma

In [10]:
print(f"Total cost in for {total_batches*batch_size} sentences is : $ {round(total_cost,4)}")
print(f"Total cost in for {total_batches*batch_size} sentences is : Rs. {round(total_cost*doller_to_rs, 4)}")

Total cost in for 500 sentences is : $ 0.1811
Total cost in for 500 sentences is : Rs. 15.0661


In [11]:
# This function finds the indexes of the entites identifed by the GPT Model
def find_entities_in_sentence(entities : List, sentence:str)->List:
    """
    This Function finds the start index, end index for the entites from the given sentence
    Args
    entities: List of Entities for which indexes to be found
    sentence : Respective sentence from which index will be found
    
    Returns
    List of entities with start index, end index
    """
    
    matched_entities = []
    
    for entity in entities:
        label = entity['label']
        value = entity['value']
        start_idx = sentence.find(value)
        
        while start_idx != -1:
            end_idx = start_idx + len(value)
            matched_entities.append({
                'label': label,
                'start': start_idx,
                'end': end_idx,
                'value': value
            })
            
            start_idx = sentence.find(value, end_idx)
    
    return matched_entities

In [12]:
final_result_gpt

[[{'label': 'treatment', 'value': 'oral administration'},
  {'label': 'treatment', 'value': 'study drug'}],
 [{'label': 'pregnancy', 'value': 'Current pregnancy'}],
 [{'label': 'pregnancy', 'value': 'Singleton pregnancy'}],
 [{'label': 'clinical_variable', 'value': 'steroids'},
  {'label': 'clinical_variable', 'value': 'surgery'},
  {'label': 'clinical_variable', 'value': 'recovery period'},
  {'label': 'clinical_variable', 'value': 'betamethasone hypersensitivity'}],
 [{'label': 'clinical_variable', 'value': 'rheumatic valve disease'},
  {'label': 'clinical_variable', 'value': 'AF phenotype'}],
 [{'label': 'chronic_disease', 'value': 'Severe liver dysfunction'},
  {'label': 'clinical_variable', 'value': 'LFT'},
  {'label': 'upper_bound', 'value': '3X upper limit of normal'}],
 [{'label': 'clinical_variable', 'value': 'Serum M-protein'},
  {'label': 'lower_bound', 'value': '1.0 g/dL'}],
 [{'label': 'gender', 'value': 'Males or females'}],
 [{'label': 'Cancer', 'value': 'Negative for ma

In [13]:
gpt_res_with_indexes = []
for entity, sentence in zip(final_result_gpt, sentences_to_predict):
    if entity is not "None":
        print(entity)
        gpt_res_with_indexes.append(find_entities_in_sentence(entity, sentence))

[{'label': 'treatment', 'value': 'oral administration'}, {'label': 'treatment', 'value': 'study drug'}]
[{'label': 'pregnancy', 'value': 'Current pregnancy'}]
[{'label': 'pregnancy', 'value': 'Singleton pregnancy'}]
[{'label': 'clinical_variable', 'value': 'steroids'}, {'label': 'clinical_variable', 'value': 'surgery'}, {'label': 'clinical_variable', 'value': 'recovery period'}, {'label': 'clinical_variable', 'value': 'betamethasone hypersensitivity'}]
[{'label': 'clinical_variable', 'value': 'rheumatic valve disease'}, {'label': 'clinical_variable', 'value': 'AF phenotype'}]
[{'label': 'chronic_disease', 'value': 'Severe liver dysfunction'}, {'label': 'clinical_variable', 'value': 'LFT'}, {'label': 'upper_bound', 'value': '3X upper limit of normal'}]
[{'label': 'clinical_variable', 'value': 'Serum M-protein'}, {'label': 'lower_bound', 'value': '1.0 g/dL'}]
[{'label': 'gender', 'value': 'Males or females'}]
[{'label': 'Cancer', 'value': 'Negative for malignancy'}, {'label': 'clinical_v

In [14]:
gpt_res_with_indexes

[[{'label': 'treatment',
   'start': 24,
   'end': 43,
   'value': 'oral administration'},
  {'label': 'treatment', 'start': 47, 'end': 57, 'value': 'study drug'}],
 [{'label': 'pregnancy', 'start': 0, 'end': 17, 'value': 'Current pregnancy'}],
 [{'label': 'pregnancy',
   'start': 0,
   'end': 19,
   'value': 'Singleton pregnancy'}],
 [{'label': 'clinical_variable', 'start': 30, 'end': 38, 'value': 'steroids'},
  {'label': 'clinical_variable', 'start': 54, 'end': 61, 'value': 'surgery'},
  {'label': 'clinical_variable',
   'start': 85,
   'end': 100,
   'value': 'recovery period'},
  {'label': 'clinical_variable',
   'start': 126,
   'end': 156,
   'value': 'betamethasone hypersensitivity'}],
 [{'label': 'clinical_variable',
   'start': 0,
   'end': 23,
   'value': 'rheumatic valve disease'},
  {'label': 'clinical_variable',
   'start': 53,
   'end': 65,
   'value': 'AF phenotype'}],
 [{'label': 'chronic_disease',
   'start': 0,
   'end': 24,
   'value': 'Severe liver dysfunction'},
  

In [15]:
sentences_to_predict[1]

'Current pregnancy'

In [16]:
# making the result of GPT in same format of Rasa
for res in gpt_res_with_indexes:
    for ent in res:
        ent.pop("value")

In [17]:
gpt_res_with_indexes[0]

[{'label': 'treatment', 'start': 24, 'end': 43},
 {'label': 'treatment', 'start': 47, 'end': 57}]

In [18]:
sentences_to_predict[0][47:57]

'study drug'

In [31]:
!pip install requests




In [32]:
import requests



# Define the endpoint URL of the Rasa server
rasa_endpoint = "http://192.168.29.197:5005" 

In [33]:
conversation_outcome = []

# Send API requests for each message and store the response
i = 1
for sent in sentences_to_predict:
    print(f"iteration {i}")
    i+=1
    payload = {
        "text": sent,
        "message_id": None
    }
    response = requests.post(rasa_endpoint, json=payload)
    conversation_outcome.append(response.json())

iteration 1


ConnectionError: HTTPConnectionPool(host='192.168.29.197', port=5005): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000211536442D0>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))