In [None]:
# Mount Google Drive at /content/drive so the project files referenced by the
# later cells (model, dataset, configs, logs) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Directory of the custom NER model on the mounted Drive; it is passed to
# spacy.load() in a later cell.
model_path = '/content/drive/MyDrive/MSc_Project/Models/en_ner_bc5cdr_md/'
# test model path
import os
# Report whether the model directory is reachable before anything tries to load it.
model_path_status = "model path exists" if os.path.exists(model_path) else "model path doesn't exist"
print(f"{model_path_status}: {model_path}")

model path exists: /content/drive/MyDrive/MSc_Project/Models/en_ner_bc5cdr_md/


In [None]:
import os
import pandas as pd

# Project root on the mounted Drive.
# NOTE(review): this is a relative path, while other cells use absolute
# '/content/drive/...' paths — it only resolves when the working directory is
# /content; confirm, or make it absolute for consistency.
REPO_PATH = 'drive/MyDrive/MSc_Project'
# CSV holding the letters used for the downstream NER experiment.
DATASET_PATH = os.path.join(REPO_PATH, 'Results/downstream/results_41lines_0.3ramdom.csv')

# check if a path exists
def check_path_and_file_details(path):
    """Print whether ``path`` exists and, for CSV files, its shape and columns.

    Args:
        path: File-system path (string) to inspect.

    Returns:
        None. Every finding is reported with ``print``; a CSV that cannot be
        parsed is reported rather than raised.
    """
    if not os.path.exists(path):
        print(f"path doesn't exist: {path}")
        return

    print(f"path exists: {path}")

    # Compare the extension case-insensitively so '.CSV' files are inspected too.
    if not path.lower().endswith('.csv'):
        print("the path is not a csv document.")
        return

    try:
        # Reading CSV Files
        df = pd.read_csv(path)
    except Exception as e:
        # Deliberately broad: this is a diagnostic helper, so any load failure
        # (parse error, encoding, permissions, ...) is reported, not propagated.
        print(f"the file can't be loaded: {e}")
    else:
        num_rows, num_cols = df.shape
        col_names = df.columns.tolist()
        print(f"the file has {num_rows} rows and {num_cols} cols")
        print(f"the name of columns are: {col_names}")

# Sanity-check that the dataset CSV is reachable and print its shape and column
# names before it is loaded for annotation.
check_path_and_file_details(DATASET_PATH)


path exists: drive/MyDrive/MSc_Project/Results/downstream/results_41lines_0.3ramdom.csv
the file has 204 rows and 5 cols
the name of columns are: ['Note ID', 'Original Letters', 'Processed Original Letters', 'Masked Letters', 'Generated Letters']


In [None]:
# Install/upgrade spaCy and download the small English pipeline.
# NOTE(review): the install is unpinned, so the spaCy version can drift between
# runs; consider pinning a version and using %pip so the install targets the
# kernel's environment. The recorded output asks for a runtime restart afterwards.
!pip install -U spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m107.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Small general-purpose English pipeline.
# NOTE(review): `nlp` does not appear to be used by any other visible cell —
# confirm whether this load is still needed.
nlp = spacy.load("en_core_web_sm")

# Load the custom NER model from model_path; generate_annotation() uses it to
# produce the entity annotations.
nlp_scispacy = spacy.load(model_path)
print("The custom NER model has been loaded successfully.")

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


The custom NER model has been loaded successfully.


## Entity extraction with SciSpacy

In [None]:
def generate_annotation(texts):
    """Annotate each text with the entities found by ``nlp_scispacy``.

    Args:
        texts: Iterable of strings to run through the NER pipeline.

    Returns:
        A list with one dict per input text, in input order:
        ``{'text': <text>, 'entities': [(start_char, end_char, label, entity_text), ...]}``.
    """
    def _annotate(text):
        # One pipeline call per text; each entity is flattened to a plain tuple.
        parsed = nlp_scispacy(text)
        found = [
            (entity.start_char, entity.end_char, entity.label_, entity.text)
            for entity in parsed.ents
        ]
        return {'text': text, 'entities': found}

    return [_annotate(text) for text in texts]

In [None]:
def split_train_valid_test(samples, train_end=0.70, valid_end=0.80):
    """Cut ``samples`` sequentially into train / validation / test slices.

    Args:
        samples: Sliceable sequence (e.g. a list of annotation dicts).
        train_end: Fraction of ``len(samples)`` at which the train slice ends.
        valid_end: Fraction of ``len(samples)`` at which the validation slice ends.

    Returns:
        ``(train, valid, test)`` — no shuffling, input order is preserved.
    """
    train_idx = int(train_end * len(samples))
    valid_idx = int(valid_end * len(samples))
    return samples[:train_idx], samples[train_idx:valid_idx], samples[valid_idx:]


df = pd.read_csv(DATASET_PATH)

# Generate annotations using scispacy
original_annotations = generate_annotation(df['Original Letters'].tolist())
synthetic_annotations = generate_annotation(df['Generated Letters'].tolist())

# Split in Train, Validation and Testing sets (70/10/20).
# Both lists come from the same rows of `df`, so the same positional cut keeps
# each original letter and its generated counterpart in the same split.
(original_annotations_train,
 original_annotations_valid,
 original_annotations_test) = split_train_valid_test(original_annotations)

(synthetic_annotations_train,
 synthetic_annotations_valid,
 synthetic_annotations_test) = split_train_valid_test(synthetic_annotations)

In [None]:
# Report how many annotated letters ended up in each split
for size_label, split_items in (
    ("Original Annotations Train Size:", original_annotations_train),
    ("Original Annotations Validation Size:", original_annotations_valid),
    ("Original Annotations Test Size:", original_annotations_test),
    ("Synthetic Annotations Train Size:", synthetic_annotations_train),
    ("Synthetic Annotations Validation Size:", synthetic_annotations_valid),
    ("Synthetic Annotations Test Size:", synthetic_annotations_test),
):
    print(size_label, len(split_items))


Original Annotations Train Size: 142
Original Annotations Validation Size: 21
Original Annotations Test Size: 41
Synthetic Annotations Train Size: 142
Synthetic Annotations Validation Size: 21
Synthetic Annotations Test Size: 41


### Fine-tune Spacy on Original vs Synthetic letters

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans
# Blank English pipeline; passed to training_data_to_docBin() below, which only
# calls its make_doc() to tokenise the texts.
nlp_blank = spacy.blank('en')



In [None]:
def training_data_to_docBin(training_data, nlp):
    """Pack annotated samples into a spaCy ``DocBin``.

    Args:
        training_data: Iterable of dicts with a ``'text'`` string and an
            ``'entities'`` list of ``(start_char, end_char, label, entity_text)``.
        nlp: Pipeline whose ``make_doc`` is used to tokenise each text.

    Returns:
        A ``DocBin`` containing one ``Doc`` per sample, with ``doc.ents`` set.
    """
    packed = DocBin()
    for record in tqdm(training_data):
        raw_text = record['text']
        char_entities = record['entities']
        doc = nlp.make_doc(raw_text)
        # Map character offsets onto token spans ("contract" alignment);
        # offsets for which char_span returns None are silently skipped.
        candidate_spans = (
            doc.char_span(begin, stop, label=tag, alignment_mode="contract")
            for begin, stop, tag, _ in char_entities
        )
        usable_spans = [span for span in candidate_spans if span is not None]
        # doc.ents cannot hold overlapping spans, so filter them first.
        doc.ents = filter_spans(usable_spans)
        packed.add(doc)
    return packed


In [None]:
# Convert every split into a DocBin, tokenising with the blank English pipeline.
original_train_docbin, original_valid_docbin, original_test_docbin = [
    training_data_to_docBin(split, nlp_blank)
    for split in (
        original_annotations_train,
        original_annotations_valid,
        original_annotations_test,
    )
]

synthetic_train_docbin, synthetic_valid_docbin, synthetic_test_docbin = [
    training_data_to_docBin(split, nlp_blank)
    for split in (
        synthetic_annotations_train,
        synthetic_annotations_valid,
        synthetic_annotations_test,
    )
]


100%|██████████| 142/142 [00:04<00:00, 34.55it/s]
100%|██████████| 21/21 [00:00<00:00, 36.43it/s]
100%|██████████| 41/41 [00:01<00:00, 34.78it/s]
100%|██████████| 142/142 [00:05<00:00, 28.14it/s]
100%|██████████| 21/21 [00:00<00:00, 41.73it/s]
100%|██████████| 41/41 [00:01<00:00, 37.83it/s]


In [None]:
# Serialise each DocBin into the working directory; the train/valid files are
# the ones handed to `spacy train` via --paths.train / --paths.dev.
for split_docbin, spacy_file in (
    (original_train_docbin, "original_annotations_train.spacy"),
    (original_valid_docbin, "original_annotations_valid.spacy"),
    (original_test_docbin, "original_annotations_test.spacy"),
    (synthetic_train_docbin, "synthetic_annotations_train.spacy"),
    (synthetic_valid_docbin, "synthetic_annotations_valid.spacy"),
    (synthetic_test_docbin, "synthetic_annotations_test.spacy"),
):
    split_docbin.to_disk(spacy_file)


#### On original letters

In [None]:
# Location of the base spaCy training config inside the project folder.
BASE_CONFIG_PATH = f'{REPO_PATH}/SpacyConfig/base_config.cfg'

In [None]:
# Generate a base config for an English NER-only pipeline.
# NOTE(review): without --force the command refuses to overwrite an existing
# file (see the recorded output), so on re-runs this cell changes nothing.
!python -m spacy init config --lang en --pipeline ner {BASE_CONFIG_PATH}


[38;5;1m✘ The provided output file already exists. To force overwriting the
config file, set the --force or -F flag.[0m



In [None]:
# Full path of the completed training config written by `fill-config`.
OUTPUT_CONFIG_PATH = f'{REPO_PATH}/SpacyConfig/full_config.cfg'

# Fill in any missing settings of the base config and save the result to
# OUTPUT_CONFIG_PATH.
!python -m spacy init fill-config {BASE_CONFIG_PATH} {OUTPUT_CONFIG_PATH}


[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
drive/MyDrive/MSc_Project/SpacyConfig/full_config.cfg
You can now add your data and train your pipeline:
python -m spacy train full_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train the NER pipeline on the original-letter annotations, validating on the
# original validation split; outputs go to ./spacy_ft/on_original.
# The recorded run used the CPU; per its log, pass --gpu-id 0 to use a GPU.
!python -m spacy train {OUTPUT_CONFIG_PATH} --output ./spacy_ft/on_original --paths.train ./original_annotations_train.spacy --paths.dev ./original_annotations_valid.spacy


[38;5;2m✔ Created output directory: spacy_ft/on_original[0m
[38;5;4mℹ Saving to output directory: spacy_ft/on_original[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    819.40    1.02    1.28    0.85    0.01
  1     200       5744.58  36326.39   80.17   82.57   77.90    0.80
  2     400        999.68   8361.98   84.58   86.90   82.38    0.85
  4     600       2371.48   5931.46   86.70   87.58   85.84    0.87
  5     800       1027.28   4382.35   87.51   90.19   84.98    0.88
  7    1000       1349.77   3829.09   85.34   86.09   84.62    0.85
  8    1200       1072.11   3081.98   86.35   87.51   85.23    0.86
  9    1400       1084.74   2754.67   87.28 

In [None]:
# Load the `model-best` pipeline from the output directory of the training run
# on the original letters.
nlp_spacy_on_original = spacy.load("./spacy_ft/on_original/model-best")


In [None]:
from spacy.training import Example

# Pair the model's prediction for every test letter with its reference
# annotation. The reference entities are the (start, end, label) part of the
# tuples produced by generate_annotation(); the trailing entity text is dropped.
original_examples = [
    Example.from_dict(
        nlp_spacy_on_original(sample['text']),
        {'entities': [entity[:3] for entity in sample['entities']]},
    )
    for sample in original_annotations_test
]

In [None]:
from spacy.scorer import Scorer

scorer = Scorer()

# Score every paired test example in one pass
original_scores = scorer.score(original_examples)

# Keep only the entity-level F-score / precision / recall, rounded to 3 decimals
original_scores = {
    report_name: round(original_scores[score_key], 3)
    for report_name, score_key in (
        ('f-score', 'ents_f'),
        ('precision', 'ents_p'),
        ('recall', 'ents_r'),
    )
}

print('Original scores:', original_scores)


Original scores: {'f-score': 0.855, 'precision': 0.865, 'recall': 0.846}


In [None]:
import os

# Create the log directory on Drive if it does not exist yet.
log_dir = '/content/drive/MyDrive/MSc_Project/logs'
os.makedirs(log_dir, exist_ok=True)

# Text file that the evaluation results are appended to.
log_file_path = os.path.join(log_dir, 'best_model_test_results.txt')


In [None]:
# Append the original-letter metrics to the results log: a header line followed
# by one tab-indented `name=value` entry per metric.
with open(log_file_path, 'a') as log_file:
    log_file.write('Downstream NER (Spacy trained on Original letters):\n')
    metric_lines = (f'\t{name}={score}' for name, score in original_scores.items())
    log_file.write(',\n'.join(metric_lines) + '.\n')


#### On synthetic letters

In [None]:
# Full path of the completed training config written by `fill-config`.
# NOTE(review): this cell repeats the earlier fill-config cell with the same
# base config and output path, so it rewrites the same file.
OUTPUT_CONFIG_PATH = f'{REPO_PATH}/SpacyConfig/full_config.cfg'

# Fill in any missing settings of the base config and save the result to
# OUTPUT_CONFIG_PATH.
!python -m spacy init fill-config {BASE_CONFIG_PATH} {OUTPUT_CONFIG_PATH}


[38;5;3m⚠ Nothing to auto-fill: base config is already complete[0m
[38;5;2m✔ Saved config[0m
drive/MyDrive/MSc_Project/SpacyConfig/full_config.cfg
You can now add your data and train your pipeline:
python -m spacy train full_config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
# Train the NER pipeline on the synthetic-letter annotations, validating on the
# synthetic validation split; outputs go to ./spacy_ft/on_synthetic.
# The recorded run used the CPU; per its log, pass --gpu-id 0 to use a GPU.
!python -m spacy train {OUTPUT_CONFIG_PATH} --output ./spacy_ft/on_synthetic --paths.train ./synthetic_annotations_train.spacy --paths.dev ./synthetic_annotations_valid.spacy


[38;5;2m✔ Created output directory: spacy_ft/on_synthetic[0m
[38;5;4mℹ Saving to output directory: spacy_ft/on_synthetic[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    824.00    0.78    0.50    1.68    0.01
  1     200       6807.06  36114.96   81.25   84.68   78.09    0.81
  2     400        960.92   8501.67   84.82   86.90   82.84    0.85
  4     600       1004.50   5895.38   86.41   88.12   84.77    0.86
  5     800       1012.97   4491.71   87.06   88.49   85.67    0.87
  7    1000        855.55   3728.69   86.51   89.11   84.07    0.87
  8    1200       1172.10   3058.01   87.22   89.14   85.38    0.87
  9    1400        907.69   2732.14   87.1

In [None]:
# Load the `model-best` pipeline from the output directory of the training run
# on the synthetic letters.
nlp_spacy_on_synthetic = spacy.load("./spacy_ft/on_synthetic/model-best")


In [None]:
from spacy.training import Example

# Pair the model's prediction for every test letter with its reference
# annotation. The reference entities are the (start, end, label) part of the
# tuples produced by generate_annotation(); the trailing entity text is dropped.
# NOTE(review): this model is scored on the synthetic test split, while the
# original-trained model is scored on the original test split, so the two
# results come from different test sets — confirm this is the intended comparison.
synthetic_examples = [
    Example.from_dict(
        nlp_spacy_on_synthetic(sample['text']),
        {'entities': [entity[:3] for entity in sample['entities']]},
    )
    for sample in synthetic_annotations_test
]

In [None]:
from spacy.scorer import Scorer

scorer = Scorer()

# Score every paired test example in one pass
synthetic_scores = scorer.score(synthetic_examples)

# Keep only the entity-level F-score / precision / recall, rounded to 3 decimals
synthetic_scores = {
    report_name: round(synthetic_scores[score_key], 3)
    for report_name, score_key in (
        ('f-score', 'ents_f'),
        ('precision', 'ents_p'),
        ('recall', 'ents_r'),
    )
}

print('synthetic scores:', synthetic_scores)


synthetic scores: {'f-score': 0.853, 'precision': 0.863, 'recall': 0.843}


In [None]:
import os

# Create the log directory on Drive if it does not exist yet.
# NOTE(review): same directory and file as the earlier log-setup cell; this
# re-definition is redundant when the notebook is run top to bottom.
log_dir = '/content/drive/MyDrive/MSc_Project/logs'
os.makedirs(log_dir, exist_ok=True)

# Text file that the evaluation results are appended to.
log_file_path = os.path.join(log_dir, 'best_model_test_results.txt')


In [None]:
# Append the synthetic-letter metrics to the results log: a header line followed
# by one tab-indented `name=value` entry per metric.
with open(log_file_path, 'a') as log_file:
    log_file.write('Downstream NER (Spacy trained on synthetic letters):\n')
    metric_lines = (f'\t{name}={score}' for name, score in synthetic_scores.items())
    log_file.write(',\n'.join(metric_lines) + '.\n')


# Close runtime (save compute units)

In [None]:
# Release the Google Colab runtime once everything has finished, to save credits.
# Nothing placed after this call should be expected to run.
from google.colab import runtime
runtime.unassign()