In [1]:
import pandas as pd
import tarfile
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

device = torch.device("mps" if getattr(torch,'has_mps',False) else "cuda:0" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm
2023-08-09 21:32:53.533506: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-09 21:32:53.592174: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


device(type='cuda', index=0)

## Load data

In [2]:
!rm -r khresmoi-summary-test-set
!wget https://www.statmt.org/wmt14/medical-task/khresmoi-summary-test-set.tgz
tar = tarfile.open('khresmoi-summary-test-set.tgz', 'r')
for item in tar:
    tar.extract(item, '')
!rm khresmoi-summary-test-set.tgz

--2023-08-09 21:32:56--  https://www.statmt.org/wmt14/medical-task/khresmoi-summary-test-set.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.197.184
Connecting to www.statmt.org (www.statmt.org)|129.215.197.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 653377 (638K) [application/x-gzip]
Saving to: ‘khresmoi-summary-test-set.tgz’


2023-08-09 21:32:57 (920 KB/s) - ‘khresmoi-summary-test-set.tgz’ saved [653377/653377]



In [3]:
f = open("khresmoi-summary-test-set/khresmoi-summary-test.en", "r")
train_eng = [x.replace('\n', '') for x in f]
f = open("khresmoi-summary-test-set/khresmoi-summary-test.fr", "r")
train_fr = [x.replace('\n', '') for x in f]

f = open("khresmoi-summary-test-set/khresmoi-summary-dev.en", "r")
test_eng = [x.replace('\n', '') for x in f]
f = open("khresmoi-summary-test-set/khresmoi-summary-dev.fr", "r")
test_fr = [x.replace('\n', '') for x in f]

data = {'eng': train_eng + test_eng, 'fr': train_fr + test_fr}
data = pd.DataFrame(data)
print('Length: ', len(data))
data.head(10)

Length:  1500


Unnamed: 0,eng,fr
0,The aim of this study was to investigate the e...,Le but de cette étude était d’étudier l’effet ...
1,Cardiac arrests are sometimes referred to as c...,Les arrêts cardiaques sont parfois appelés arr...
2,"It’s a long, hollow tube at the end of your di...",C’est un long tube creux à la fin de votre tub...
3,About 5 percent of people with ulcerative coli...,Environ 5 % des personnes souffrant de colite ...
4,Post-transplant cancers which are not virus-in...,Les cancers post-greffe qui ne sont pas causés...
5,Soft tissue injectables and fillers are a non-...,Le remplissage des tissus mous est une option ...
6,We will investigate if there is a change in yo...,Nous étudions s’il y a un changement dans vos ...
7,Patients with type 1 and type 2 diabetes melli...,Les patients souffrant de diabète de type 1 et...
8,The first 6 months will be a wash in period an...,Les 6 premiers mois seront une période de lava...
9,Polyp-like varices are shown here in the gastr...,Les varices semblables aux polypes sont présen...


In [4]:
data = Dataset.from_pandas(data)
data = data.train_test_split(test_size=0.2, shuffle=True, seed=42)
data

DatasetDict({
    train: Dataset({
        features: ['eng', 'fr'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['eng', 'fr'],
        num_rows: 300
    })
})

## Data Preprocessing

In [5]:
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-fr-en')

prefix = "translate french to english: "

def preprocess_func(examples):
    inputs = [prefix + example for example in examples['fr']]
    targets = [example for example in examples["eng"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=300, truncation=True)
    return model_inputs

tokenized_data = data.map(preprocess_func, batched=True, remove_columns=data['train'].column_names)
tokenized_data

Map: 100%|██████████| 1200/1200 [00:00<00:00, 3223.17 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 3234.57 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [6]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model='Helsinki-NLP/opus-mt-fr-en')

## Training

In [7]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

training_args = Seq2SeqTrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=25,
    per_device_eval_batch_size=25,
    weight_decay=0.01,
    num_train_epochs=3,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,0.928344,49.7263,27.23
2,No log,0.915203,50.1324,27.1767
3,No log,0.912549,50.1425,27.3


TrainOutput(global_step=144, training_loss=0.9055769178602431, metrics={'train_runtime': 57.7272, 'train_samples_per_second': 62.362, 'train_steps_per_second': 2.494, 'total_flos': 61943965286400.0, 'train_loss': 0.9055769178602431, 'epoch': 3.0})

In [8]:
trainer.evaluate(tokenized_data["test"])

{'eval_loss': 0.9125491380691528,
 'eval_bleu': 50.1425,
 'eval_gen_len': 27.3,
 'eval_runtime': 13.1701,
 'eval_samples_per_second': 22.779,
 'eval_steps_per_second': 0.911,
 'epoch': 3.0}

In [9]:
trainer.save_model("medical-fr-en")