### Set Up

In [None]:
# installing Med7 (GloVe and RoBERTa embeddings) and its related libraries
# !python -m pip install jedi --quiet
# !python -m pip install -U wheel pip setuptools spacy==3.4.4 spacy-transformers==1.1.9 --quiet
# !python -m pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl --quiet
# !python -m pip install https://huggingface.co/kormilitzin/en_core_med7_trf/resolve/main/en_core_med7_trf-any-py3-none-any.whl --quiet

In [None]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Add the project folder to the module search path (this does not change the
# working directory). The path is relative, so it resolves against the current
# working directory — presumably /content on Colab; TODO confirm.
import sys
sys.path.append('drive/MyDrive/SynDa_Health/DownstreamNERTask')

Mounted at /content/drive


In [None]:
import pandas as pd
import pickle
import ast

from tqdm import tqdm

import spacy

from Modules.label_parser import ner_parser
from Modules.extract_ner_datasets import extract_data
from Modules.finetune_spacy import load_valid_data, load_test_data, train_ner, evaluate_ner

In [None]:
# Tags iterated by train_evaluate(); each tag is embedded in the result CSV file name.
# input_paths = ["Real", "LT3", "T5"]
input_paths = ["RealSynthetic"]
# Module-level default; no cell visible in this notebook reads this variable
# (the functions below use their own `specific_path` parameter instead).
specific_path = None
# Prefix for result CSVs. train_evaluate() appends to this string directly,
# without inserting a "/", so files are named "Results<param>FinalSpacy...csv".
# NOTE(review): confirm "Results" was not meant to be a directory.
OUT_PATH = "drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations/Results"

# Number of training epochs passed to train_evaluate() by the driver cell.
EPOCHS = 10

### Train Model

In [None]:
# def prepare_dataset(specific_path=None):
#     global input_paths

#     if specific_path:
#         input_paths = [specific_path]

#     for input_path in input_paths:

#         path = "drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations" + input_path

#         for split_path in ['train', 'valid', 'test']:
#             df = read_text_into_dataframe(path + f"/{split_path}_descriptions.txt")

#             parsed_data = pd.concat([ner_parser(instruction) for instruction in df['instructions']])
#             parsed_data.reset_index(drop=True, inplace=True)
#             df_with_entities = pd.concat([df, parsed_data], axis=1)

#             data = extract_data(df_with_entities)

#             with open(path + f'/{split_path}_data.pkl', 'wb') as f:
#                 pickle.dump(data, f)

In [None]:
def prepare_dataset(specific_path=None):
    """
    Parses the real training descriptions and pickles the extracted NER data.

    Reads "Real/train_descriptions.txt" line by line (each line keeps its
    trailing newline), runs ner_parser on every line, joins the parsed columns
    onto the raw instructions and pickles the output of extract_data to
    "Real/5xGenerationsReal_data.pkl".

    Parameters:
    - specific_path: When truthy, replaces the module-level input_paths with
      [specific_path]. It does not change which folder is processed here; the
      loop is fixed to "Real".

    Returns:
    - None. The result is written to disk.

    NOTE(review): a later cell defines another prepare_dataset; whichever
    definition was executed last is the one a subsequent prepare_dataset()
    call runs.
    """
    global input_paths

    if specific_path:
        input_paths = [specific_path]

    generations_root = "drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations/"

    for dataset_name in ["Real"]:
        dataset_dir = generations_root + dataset_name

        # One instruction per line of the text file.
        with open(dataset_dir + "/train_descriptions.txt") as src:
            raw_lines = src.readlines()

        instructions_df = pd.DataFrame({'instructions': raw_lines})

        # Parse each instruction, then stack the per-instruction frames.
        parsed_frames = []
        for instruction in instructions_df['instructions']:
            parsed_frames.append(ner_parser(instruction))
        entities_df = pd.concat(parsed_frames)
        entities_df.reset_index(drop=True, inplace=True)

        # Place the parsed columns next to the raw text, row by row.
        combined_df = pd.concat([instructions_df, entities_df], axis=1)
        ner_records = extract_data(combined_df)

        with open(dataset_dir + '/5xGenerationsReal_data.pkl', 'wb') as sink:
            pickle.dump(ner_records, sink)

In [None]:
def _build_ner_pickle(instructions, output_file):
    """Parse instruction strings with ner_parser and pickle extract_data's output to output_file."""
    if not instructions:
        # pd.concat([]) would otherwise fail with an opaque "No objects to concatenate".
        raise ValueError(f"No instructions to parse for {output_file}")

    df = pd.DataFrame(instructions, columns=["instructions"])
    parsed_data = pd.concat([ner_parser(instruction) for instruction in df['instructions']])
    parsed_data.reset_index(drop=True, inplace=True)
    df_with_entities = pd.concat([df, parsed_data], axis=1)

    data = extract_data(df_with_entities)
    with open(output_file, 'wb') as f:
        pickle.dump(data, f)


def prepare_dataset(specific_path=None):
    """
    Builds the pickled NER datasets that train_evaluate loads.

    This definition replaces any prepare_dataset defined in an earlier cell, so
    it is the one that runs on "Restart & Run All". It therefore covers both
    inputs:

    - Real: if "Real/5xGenerationsReal_data.pkl" does not exist yet, it is built
      from the lines of "Real/train_descriptions.txt". An existing pickle is
      left untouched.
    - LT3 (synthetic): "LT3/LT3-Testing-5xGenerations.txt" is read as a Python
      literal mapping keys to lists of instructions; the lists are flattened and
      written to "LT3/5xGenerationsSynthetic_data.pkl" on every call.

    Parameters:
    - specific_path: When truthy, replaces the module-level input_paths with
      [specific_path]. It does not change which folders are processed here.

    Returns:
    - None. The results are written to disk.

    Raises:
    - ValueError: If an input file yields no instructions.
    """
    global input_paths
    import os

    if specific_path:
        input_paths = [specific_path]

    root = "drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations/"

    # train_evaluate needs the Real pickle as well; create it when it is missing
    # so a fresh run does not depend on a shadowed cell having been run by hand.
    real_pickle = root + "Real" + '/5xGenerationsReal_data.pkl'
    if not os.path.exists(real_pickle):
        with open(root + "Real" + "/train_descriptions.txt") as f:
            real_instructions = f.readlines()
        _build_ner_pickle(real_instructions, real_pickle)

    for input_path in ["LT3"]:

        path = root + input_path
        # literal_eval only accepts Python literals, so the file is not executed as code.
        with open(path + f"/{input_path}-Testing-5xGenerations.txt", 'r') as fp:
            medications = ast.literal_eval(fp.read())

        # Flatten {key: [instruction, ...]} into one list of instructions.
        instructions = [value for key in medications for value in medications[key]]

        _build_ner_pickle(instructions, path + '/5xGenerationsSynthetic_data.pkl')

In [None]:
# Build the pickled NER data by calling the most recently defined prepare_dataset().
if __name__ == "__main__":
    prepare_dataset()

In [None]:
def filter_data(data, keep_count=1):
    """
    Filters the data based on the first word of the first element of the tuple.
    Keeps at most 'keep_count' entries for tuples with the same starting word,
    in their original order.

    Parameters:
    - data: A list of tuples whose first element is a text string.
    - keep_count: Number of entries to keep for tuples with the same starting word.
      A value of 0 (or less) keeps nothing.

    Returns:
    - A filtered list of tuples.
    """
    filtered_data = []
    seen = {}

    for item in data:
        # str.split() returns [] for empty or whitespace-only text; group such
        # items under "" instead of raising IndexError.
        words = item[0].split()
        starting_word = words[0] if words else ""

        # The same test is applied to first and later occurrences, so the first
        # occurrence is no longer kept unconditionally when keep_count is 0.
        seen[starting_word] = seen.get(starting_word, 0) + 1
        if seen[starting_word] <= keep_count:
            filtered_data.append(item)

    return filtered_data

In [None]:
def filter_data_proportional(data, proportion=0.2):
    """
    Filters the data based on the first word of the first element of the tuple.
    Keeps a proportional number of entries for tuples with the same starting word.

    For each starting word, int(count * proportion) entries are kept (the
    product is truncated), but never fewer than one. The earliest entries are
    the ones kept, and the original order is preserved.

    Parameters:
    - data: A list of tuples whose first element is a text string.
    - proportion: Proportion of entries to keep for tuples with the same starting word.

    Returns:
    - A filtered list of tuples.
    """
    # Compute each item's starting word once. str.split() returns [] for empty
    # or whitespace-only text, so group such items under "" instead of raising
    # IndexError.
    starting_words = []
    for item in data:
        words = item[0].split()
        starting_words.append(words[0] if words else "")

    # Count the occurrences of each starting word
    word_count = {}
    for word in starting_words:
        word_count[word] = word_count.get(word, 0) + 1

    # Number of entries to keep per starting word, with a floor of one
    keep_count = {word: max(1, int(count * proportion)) for word, count in word_count.items()}

    filtered_data = []
    seen = {}
    for item, word in zip(data, starting_words):
        seen[word] = seen.get(word, 0) + 1
        # keep_count is at least 1, so the first occurrence always passes.
        if seen[word] <= keep_count[word]:
            filtered_data.append(item)

    return filtered_data

In [None]:
def oversample_data(data, oversampling_factor):
    """
    Repeats the data back to back.

    Parameters:
    - data: The items to repeat.
    - oversampling_factor: How many passes over the data to concatenate. A value
      of 0 or less yields an empty list.

    Returns:
    - A new list containing the items of data, once per pass.
    """
    return [entry for _ in range(oversampling_factor) for entry in data]

In [None]:
def train_evaluate(epochs=1, param=0.2, specific_path=None, squeeze=False, oversampling_factor=1):
    """
    Trains spaCy NER on real plus filtered synthetic data and evaluates it on the test split.

    The training set is the Real pickle concatenated with the LT3 synthetic
    pickle after filter_data_proportional(synthetic, param), optionally repeated
    oversampling_factor times. The evaluation results are transposed into a
    DataFrame, written to a CSV whose name starts with OUT_PATH, and returned.

    Parameters:
    - epochs: Number of epochs passed to train_ner.
    - param: Proportion passed to filter_data_proportional. When it is not
      greater than 0, no synthetic data is used. Also embedded in the CSV name.
    - specific_path: Tag to use instead of the module-level input_paths, for this
      call only (the global list is no longer overwritten).
    - squeeze: Only affects the CSV file name (adds "squeeze5").
    - oversampling_factor: Number of times the combined training data is
      repeated; values of 1 or less leave it unchanged.

    Returns:
    - The results DataFrame for the first tag, or None if there are no tags.
    """
    # Resolve the tags locally: assigning to the global input_paths would make
    # one call's specific_path leak into every later call.
    run_tags = [specific_path] if specific_path else input_paths

    # These two values only influence the output file name.
    keep_count = 5
    proportional = True

    for input_path in run_tags:

        path = "drive/MyDrive/SynDa_Health/Dataset/NER_Splits/"

        # Load real and synthetic data
        real_data = load_valid_data("drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations", "/Real/5xGenerationsReal_data.pkl")
        synthetic_data = load_valid_data("drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations", "/LT3/5xGenerationsSynthetic_data.pkl")

        # Use filter_data_proportional to filter synthetic data
        if param > 0:
            filtered_synthetic_data = filter_data_proportional(synthetic_data, param)
        else:
            filtered_synthetic_data = []

        # Concatenate real and filtered synthetic data
        valid_data = real_data + filtered_synthetic_data

        # Oversample the data
        if oversampling_factor > 1:
            valid_data = oversample_data(valid_data, oversampling_factor)

        test_data = load_test_data(path)

        # Start every run from a freshly loaded pipeline.
        nlp = spacy.load('en_core_web_sm')
        ner = nlp.get_pipe("ner")

        train_ner(nlp, ner, valid_data, epochs=epochs)
        results = evaluate_ner(nlp, test_data)

        df = pd.DataFrame(results).T

        # NOTE(review): OUT_PATH is joined without a "/", so the CSV lands next
        # to (not inside) a "Results" folder — confirm this is intended.
        squeeze_tag = f"squeeze{keep_count}" if squeeze else ""
        proportional_tag = f"{param}proportional" if proportional else ""
        df.to_csv(OUT_PATH + str(param) + "FinalSpacyRealSynthetic" + input_path + squeeze_tag + proportional_tag + ".csv")

        # NOTE(review): returning here means only the first tag is ever
        # processed; kept because the driver cell expects a single DataFrame.
        return df

In [None]:
if __name__ == "__main__":
    # One full train/evaluate run per value of `proportion`; each run's results
    # DataFrame is collected in `dfs`.
    dfs = []
    # NOTE(review): range(5) yields the integers 0..4. 0 disables synthetic data;
    # for 1..4, filter_data_proportional keeps int(count * proportion) >= count
    # entries per starting word, i.e. all of them, so those four runs train on
    # the same data. Confirm that fractional proportions were not intended.
    for proportion in tqdm(range(5)):
        dfs.append(train_evaluate(epochs=EPOCHS, squeeze=False, param=proportion))

    # Persist all result tables in one pickle on Drive.
    with open("drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations/dfsRealSynthetic.pkl", "wb") as pickle_file:
        pickle.dump(dfs, pickle_file)

# Presumably releases the Colab runtime once the results are saved, so later
# cells need a fresh session — TODO confirm.
# NOTE(review): this import sits mid-notebook rather than in the imports cell.
from google.colab import runtime
runtime.unassign()

  0%|          | 0/5 [00:00<?, ?it/s]
" with entities "[(0, 9, 'DRUG')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

 10%|█         | 1/10 [01:33<14:00, 93.37s/it][A
 20%|██        | 2/10 [03:25<13:56, 104.61s/it][A
 30%|███       | 3/10 [05:20<12:44, 109.15s/it][A
 40%|████      | 4/10 [07:14<11:07, 111.18s/it][A
 50%|█████     | 5/10 [09:09<09:22, 112.59s/it][A
 60%|██████    | 6/10 [11:05<07:34, 113.55s/it][A
 70%|███████   | 7/10 [13:00<05:42, 114.03s/it][A
 80%|████████  | 8/10 [14:54<03:48, 114.09s/it][A
 90%|█████████ | 9/10 [16:49<01:54, 114.46s/it][A
100%|██████████| 10/10 [18:44<00:00, 112.50s/it]
 20%|██        | 1/5 [19:05<1:16:23, 1145.83s/it]
" with entities "[(0, 9, 'DRUG')]". Use `spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)` to check the alignment. Misaligned entities ('-') will be ignored during training.

 10%|█         | 1/

In [None]:
# Load a list of result tables saved by an earlier run.
# NOTE(review): this reads dfsSynthetic.pkl, whereas the training cell in this
# notebook writes dfsRealSynthetic.pkl — confirm which file is meant.
# pickle.load can execute arbitrary code from the file; only load pickles you created.
with open("drive/MyDrive/SynDa_Health/DatasetAnalysis/Generations/dfsSynthetic.pkl", "rb") as pickle_file:
    loaded_dfs = pickle.load(pickle_file)

In [None]:
# NOTE(review): a notebook cell only echoes its final expression, so the bare
# `loaded_dfs` line is not displayed when this cell runs; the saved output below
# looks like a results table and may be stale — confirm.
loaded_dfs
[i*100 for i in range(1, 6)]

Unnamed: 0,Precision,Recall,F1 Score
FREQUENCY,0.905547,0.927091,0.916193
FORM,0.990735,0.991347,0.991041
ROUTE,0.988322,0.998758,0.993513
DRUG,0.990191,0.995071,0.992625
STRENGTH,0.886115,0.942478,0.913428
overall,0.952387,0.972566,0.96237


In [None]:
# "F1 Score" value of the "overall" row in the first loaded results table.
loaded_dfs[0]['F1 Score']['overall']

0.962370478094892