### Setup

In [None]:
# Mount Google Drive. force_remount=True is passed so that re-running this cell
# remounts instead of reusing an existing mount.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Make the project packages stored on Drive importable.
# Absolute paths (rooted at the mount point above) are used so the imports keep
# working even if the notebook's working directory is no longer /content.
import sys
for project_dir in (
    '/content/drive/MyDrive/SynDa_Health/LT3',
    '/content/drive/MyDrive/SynDa_Health/LT3/Models',
):
    # Skip entries that are already present so re-running the cell is idempotent.
    if project_dir not in sys.path:
        sys.path.append(project_dir)

Mounted at /content/drive


In [None]:
!pip install datasets evaluate rouge_score bert_score transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m16.7 MB/s[0

In [None]:
import torch

from Pipeline.FullPipeline import FullPipeline
from Modules.DataHelp import load_and_preprocess_data

### Prepare & Tokenize Data

In [None]:
from transformers import BertTokenizer

# Cased BERT tokenizer; its vocabulary size is reused for the model's "vocab_size" below.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Load the dataset splits from Drive and tokenize them.
# The path is absolute (rooted at the /content/drive mount point) so the cell does
# not depend on the notebook's current working directory.
# NOTE(review): max_input_length/max_output_length (20/140) differ from
# keyword_max_length/description_max_length (24/155) in the hyperparams cell --
# confirm this is intentional.
tokenized_dataset = load_and_preprocess_data(
    data_path="/content/drive/MyDrive/SynDa_Health/Dataset/Splits",
    method="Split Training",
    tokenizer=tokenizer,
    max_input_length=20,
    max_output_length=140,
)

Map:   0%|          | 0/2544 [00:00<?, ? examples/s]

Map:   0%|          | 0/157 [00:00<?, ? examples/s]

In [None]:
# Sanity check: print the first tokenized training example
# (raw keyword/description text plus the padded input_ids and labels).
train_split = tokenized_dataset["train"]
first_train_example = train_split[0]
print(first_train_example)

{'keywords': 'phenytoin', 'descriptions': 'phenytoin 125 mg/5 mL Suspension Sig: One [**Age over 90 **]y Five (125) mg PO TID (3 times a day).', 'input_ids': [185, 10436, 25669, 8136, 1179, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [101, 185, 10436, 25669, 8136, 1179, 8347, 17713, 120, 126, 182, 2162, 15463, 20080, 5026, 1988, 14159, 1403, 131, 1448, 164, 115, 115, 4936, 1166, 3078, 115, 115, 166, 194, 4222, 113, 8347, 114, 17713, 153, 2346, 157, 9949, 113, 124, 1551, 170, 1285, 114, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:
# Display the DatasetDict summary: available splits, their feature names and row counts.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['keywords', 'descriptions', 'input_ids', 'labels'],
        num_rows: 2544
    })
    validation: Dataset({
        features: ['keywords', 'descriptions', 'input_ids', 'labels'],
        num_rows: 157
    })
})

### Pipeline

Notes:

1.   Depending on the tokenizer, the preprocessing function (in DataHelp) and the postprocessing function (in Generator) might need to change.
2.   Depending on the tokenizer, the weak/strong tokens for the n-gram repeat penalty (in As_BeamTranslator) might need to change.
3.   Depending on the model/task, the weighted combination function used for model selection (in FullPipeline > select_best_model function) might need to change.

In [None]:
# Model and training configuration handed to FullPipeline.run.
hyperparams = {
    # --- sequence lengths / vocabulary ---
    # NOTE(review): the data was tokenized with lengths 20/140 above; confirm the
    # larger 24/155 limits here are intentional.
    "keyword_max_length": 24,
    "description_max_length": 155,
    "vocab_size": len(tokenizer.vocab),
    "pad_idx": 0,
    # --- architecture ---
    "d_model": 515,
    "d_v": 64,
    "d_hid": 2038,
    "n_head": 5,
    "n_layers": 2,
    # --- optimisation / regularisation ---
    "learning_rate": 0.0004,
    "dropout": 0.20,
    "weight_decay": 0.02,
    # --- component selection (names resolved inside the pipeline) ---
    "tokenizer": "BERT",
    "embedder": "DefaultEmbeddingLayers",
    "translator": "A*_Beam",
    # --- training schedule ---
    "n_epochs": 2,
    "batch_size": 53,
    "checkpoint_per_epoch": 2 / 2,  # evaluates to 1.0
    # NOTE(review): "criterion" holds an optimizer name ("AdamW"); the key name is
    # kept as-is because the pipeline reads it -- confirm against FullPipeline.
    "criterion": "AdamW",
    "loss_function": "Cross-Entropy",
}

# Decoding configuration for the translator; lengths and pad index are taken
# from hyperparams so the two dicts cannot drift apart.
translator_params = {
    "max_input_length": hyperparams["keyword_max_length"],
    "max_seq_length": hyperparams["description_max_length"],
    "beam_size": 4,
    "maximal_step_probability_difference": 1,
    "alpha": 0.6,
    "tree_length_product": 3,
    "nrp_length": 4,
    "pad_idx": hyperparams["pad_idx"],
    # 101 / 102 match the first token and the last non-padding token of the
    # 'labels' sequence printed above.
    "sos_idx": 101,
    "eos_idx": 102,
    "nb_output_multiplier": 2,
}

# The run is pinned to CPU.
device = torch.device('cpu')

In [None]:
# Report the tokenizer vocabulary size (the value used for "vocab_size" in hyperparams).
vocab_length = len(tokenizer.vocab)
print("Vocab length:", vocab_length)

Vocab length: 28996


### Pipeline run

In [None]:
# Build the pipeline with logging and model saving enabled.
# NOTE(review): the first argument looks like a run name / output location -- confirm in FullPipeline.
pipeline = FullPipeline("TEST-run-9/TO_DELETE", logging=True, save_model=True)

# Run the pipeline and keep the object it returns.
model = pipeline.run(
    hyperparams,
    tokenized_dataset,
    translator_params,
    device,
    tokenizer,
)

Model training is starting...
Batch: 53. Batch Training Loss: 10.473698616027832.
Batch: 106. Batch Training Loss: 9.328630447387695.
Batch: 159. Batch Training Loss: 8.644444465637207.
Batch: 212. Batch Training Loss: 8.139925003051758.
Batch: 265. Batch Training Loss: 8.066410064697266.
Batch: 318. Batch Training Loss: 7.540621757507324.
Batch: 371. Batch Training Loss: 7.096751689910889.
Batch: 424. Batch Training Loss: 6.666026592254639.
Batch: 477. Batch Training Loss: 6.2041239738464355.
Batch: 530. Batch Training Loss: 5.947202205657959.
Batch: 583. Batch Training Loss: 5.8095173835754395.
Batch: 636. Batch Training Loss: 5.309929370880127.
Batch: 689. Batch Training Loss: 5.288061618804932.
Batch: 742. Batch Training Loss: 4.78812313079834.
Batch: 795. Batch Training Loss: 5.699391841888428.
Batch: 848. Batch Training Loss: 4.9210968017578125.
Batch: 901. Batch Training Loss: 4.548069953918457.
Batch: 954. Batch Training Loss: 3.835689067840576.
Batch: 1007. Batch Training Loss

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Checkpoint evaluation results: {'BLEU': 39.28, 'ROUGE-1': 43.83, 'ROUGE-2': 32.72, 'ROUGE-L': 43.22, 'BERTScore': 0.22}
Evaluating checkpoint 2 out of 2...


In [None]:
# Release the Colab runtime once the cells above have finished, so the VM is not
# left running idle. Any cell placed after this one will not execute in this session.
from google.colab import runtime
runtime.unassign()

### Useful results