## Imports

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!python -m spacy download
# maybe try en_core_sci_lg that is biomedical ? https://allenai.github.io/scispacy/

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import pickle

from torch.utils.data import DataLoader
import random
import numpy as np
import pandas as pd

from tqdm import tqdm

from spacy.training import Example
import random

import spacy
import os
import datetime
from google.colab import userdata, runtime
import re

import warnings

import nltk
from nltk.corpus import stopwords
import string

import spacy
from spacy.training import Example
import random
from tqdm import tqdm
import copy

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load Dataset & Model

In [13]:
DATASET_DIR = '/content/drive/My Drive/_collaborations_/Synda_Health_Mile2/Dataset/' # data set, e.g. free text for deployment
GENERATIONS_DIR = '/content/drive/My Drive/_collaborations_/Synda_Health_Mile2/EVALUATION/Medical_Generations/'
SAVE_DIR = '/content/drive/My Drive/_collaborations_/Synda_Health_Mile2/EVALUATION/Downstream_Results/Medical/'

training_set = pd.read_csv(DATASET_DIR + 'train.csv')
# generations = pd.read_csv('path/to/generations')
validation_set = pd.read_csv(DATASET_DIR + 'valid.csv')
test_set = pd.read_csv(DATASET_DIR + 'test.csv')

MASKING_RATIO = 0.5
MODEL_NAME = 'roberta-large'

REAL_EVALUATE = True
GENERATIONS_EVALUATE = True

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

RUN_NAME = ["roberta-large - Run - 02-11--14:40", "BiomedNLP-BiomedBERT-large-uncased-abstract - Run Medical - 03-02--18:56"][1]

In [14]:
nlp_sci = spacy.load("/content/drive/My Drive/_collaborations_/Synda_Health_Mile2/NER_Models/en_core_sci_lg")



In [15]:
data_train = training_set['Clinical Letters']
data_valid = validation_set['Clinical Letters']
data_test = test_set['Clinical Letters']

## Get Entities & Train the Model

In [16]:
data_train

0      Please shower daily including washing incision...
1      * Increasing pain * Fever * Inability to eat o...
2      You were admitted to the hospital with a react...
3      Dear Mr. Known lastname , You were admitted to...
4      Please note: you have a mm nodule that was not...
                             ...                        
202    Please call the Hospital Clinic at Telephone/F...
203    Dr. Known lastname , it was a pleasure to part...
204    please call the Transplant Office Telephone/Fa...
205                                          To Hospital
206    # You were admitted to the hospital for shortn...
Name: Clinical Letters, Length: 207, dtype: object

In [18]:
def get_entities(data):
    all_entities = []

    for doc in nlp_sci.pipe(data):
        entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        all_entities.append((doc.text, {'entities': entities}))

    return all_entities


# if len(generations.keys()) < 10:
#     warnings.warn(f"generations keys length: {len(generations.keys())}")

entities = dict()

# entities['real'] = get_entities(data_train)
generations_dict = dict()

# with open(GENERATIONS_DIR + RUN_NAME + ".pkl", "rb") as f:
#     model_generations = pickle.load(f)

# for option in model_generations.keys():
#     if option == 'random':
#         for key in model_generations[option].keys():
#             generations = model_generations[option][key]
#             generations_dict[key] = generations
#     else:
#         generations = model_generations[option]
#         generations_dict[option] = generations

# for key in generations_dict.keys():
#     entities[key] = get_entities(list(generations_dict[key].values()))

entities['train'] = get_entities(data_train)
# entities['valid'] = get_entities(data_valid)
# entities['test'] = get_entities(data_test)

In [21]:
entities.keys()

dict_keys(['train'])

In [33]:
def evaluate_model(model, validation_data):
    losses = {}
    for text, annotations in validation_data:
        doc = model.make_doc(text)
        example = Example.from_dict(doc, annotations)
        model.update([example], drop=0.0, losses=losses, sgd=None)

    return losses['ner']

EPOCHS = 3

def train_model(training_data, validation_data):
    model = spacy.load("en_core_web_lg")

    if "ner" not in model.pipe_names:
        ner = model.create_pipe("ner")
        model.add_pipe(ner, last=True)
    else:
        ner = model.get_pipe("ner")

    ner.add_label('ENTITY')
    other_pipes = [pipe for pipe in model.pipe_names if pipe != "ner"]

    with model.disable_pipes(*other_pipes):
        optimizer = model.resume_training()
        best_loss = float('inf')
        best_model = None

        for epoch in range(EPOCHS): # setup epoch num
            random.shuffle(training_data)
            losses = {}

            for text, annotations in training_data:
                doc = model.make_doc(text) #model to be trained
                example = Example.from_dict(doc, annotations) # pack predication and expected/gold labels
                model.update([example], drop=0.5, losses=losses, sgd=optimizer) # the loss to be minimised

            val_loss = evaluate_model(model, validation_data) # validation - not really need in some situration

            if val_loss < best_loss:
                best_loss = val_loss
                best_model = copy.deepcopy(model)

    return best_model # return the best model that can be deployed.


models = dict()

for key in tqdm(list(entities.keys())):
    if key in ['valid', 'test']:
        continue

    models[key] = train_model(training_data=entities[key], validation_data=entities['train'])

100%|██████████| 1/1 [01:45<00:00, 105.02s/it]


## Evaluate The Model

You were admitted with severe pancreatitis from gallstones. You were treated supportively during this and you improved. You required intubation twice to support your breathing. You were treated for pneumonia, acute kidney failure and aspiration. You are now improving and being sent to an acute rehabilitation facility. In the future it will be important for you to have a cholecystectomy in - months, once you have completely recovered from this hospitalization. You will also need to have follow up imaging of your lungs in months given the pulmonary nodules found during this admission. Some of your medications have changed: We have stopped actos and metformin. These were changed to insulin while you were sick. Once you go home you can restart these. We have stopped amlodipine. We have halved your lisinopril. We have started trazodone, simethicone, senna, colace, bisacodyl, insulin and lidocaine patch.

In [32]:
def evaluate_model(model):
    correct = 0
    predicted_total = 0
    actual_total = 0

    for text, original_annotation in zip(training_set['Clinical Letters'], entities['train']):
        doc = model(text) # in real setting, this returns the labels of input text by deploying saved best model.
        predicted_entities = set((ent.text, ent.label_) for ent in doc.ents) # extracting entities attached to text
        original_entities_set = set((text[start:end], label) for start, end, label in original_annotation[1]['entities']) # this gold label for evaluation purpose, not needed for deployment
        correct += len(predicted_entities.intersection(original_entities_set))
        predicted_total += len(predicted_entities)
        actual_total += len(original_entities_set)

    precision = correct / predicted_total if predicted_total > 0 else 0
    recall = correct / actual_total if actual_total > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    return f1


results = dict()

for key in tqdm(models.keys()):
    results[key] = evaluate_model(models[key])

results

100%|██████████| 1/1 [00:02<00:00,  2.17s/it]

Precision: 0.030669285390542377
Recall: 0.03789434384916931
F1 Score: 0.033901135604542415





{'train': 0.033901135604542415}

In [35]:
doc = models['train']("You were admitted with severe pancreatitis from gallstones. You were treated supportively during this and you improved. You required intubation twice to support your breathing. You were treated for pneumonia, acute kidney failure and aspiration. You are now improving and being sent to an acute rehabilitation facility. In the future it will be important for you to have a cholecystectomy in - months, once you have completely recovered from this hospitalization. You will also need to have follow up imaging of your lungs in months given the pulmonary nodules found during this admission. Some of your medications have changed: We have stopped actos and metformin. These were changed to insulin while you were sick. Once you go home you can restart these. We have stopped amlodipine. We have halved your lisinopril. We have started trazodone, simethicone, senna, colace, bisacodyl, insulin and lidocaine patch.")

In [37]:
list((ent.text, ent.label_) for ent in doc.ents)

[('admitted with', 'ENTITY'),
 ('severe', 'ENTITY'),
 ('pancreatitis', 'ENTITY'),
 ('gallstones', 'ENTITY'),
 ('treated supportively', 'ENTITY'),
 ('improved', 'ENTITY'),
 ('intubation', 'ENTITY'),
 ('breathing', 'ENTITY'),
 ('treated', 'ENTITY'),
 ('pneumonia', 'ENTITY'),
 ('acute kidney failure', 'ENTITY'),
 ('aspiration', 'ENTITY'),
 ('improving', 'ENTITY'),
 ('acute rehabilitation facility', 'ENTITY'),
 ('cholecystectomy', 'ENTITY'),
 ('months', 'ENTITY'),
 ('recovered', 'ENTITY'),
 ('hospitalization', 'ENTITY'),
 ('follow up', 'ENTITY'),
 ('imaging', 'ENTITY'),
 ('lungs', 'ENTITY'),
 ('months', 'ENTITY'),
 ('pulmonary nodules', 'ENTITY'),
 ('admission', 'ENTITY'),
 ('medications', 'ENTITY'),
 ('actos', 'ENTITY'),
 ('metformin', 'ENTITY'),
 ('insulin', 'ENTITY'),
 ('sick', 'ENTITY'),
 ('restart', 'ENTITY'),
 ('amlodipine', 'ENTITY'),
 ('halved', 'ENTITY'),
 ('lisinopril', 'ENTITY'),
 ('trazodone', 'ENTITY'),
 ('simethicone', 'ENTITY'),
 ('senna', 'ENTITY'),
 ('colace', 'ENTITY'),
 

In [38]:
list((ent.text) for ent in doc.ents)

['admitted with',
 'severe',
 'pancreatitis',
 'gallstones',
 'treated supportively',
 'improved',
 'intubation',
 'breathing',
 'treated',
 'pneumonia',
 'acute kidney failure',
 'aspiration',
 'improving',
 'acute rehabilitation facility',
 'cholecystectomy',
 'months',
 'recovered',
 'hospitalization',
 'follow up',
 'imaging',
 'lungs',
 'months',
 'pulmonary nodules',
 'admission',
 'medications',
 'actos',
 'metformin',
 'insulin',
 'sick',
 'restart',
 'amlodipine',
 'halved',
 'lisinopril',
 'trazodone',
 'simethicone',
 'senna',
 'colace',
 'bisacodyl',
 'insulin',
 'lidocaine patch']

## Save Results

In [None]:
with open(SAVE_DIR + f'{RUN_NAME}_movie_evaluation_results.pkl', 'wb') as f:
    pickle.dump(results, f)

with open(SAVE_DIR + f'{RUN_NAME}_movie_evaluation_results.txt', 'w') as f:
    f.write(str(results))

In [None]:
from google.colab import runtime
runtime.unassign()