In [1]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt", use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import pandas as pd
df = pd.read_csv('../data/df_xy.csv', index_col=False)

In [4]:
df = df[["text"]]

In [5]:
df["text"][0]

'INDICATIONS FOR PROCEDURE: Ongoing reflux symptoms. Extent of Exam:  D1  FINDINGS: Columnar lined oesophagus is present. The segment looks flat. Some areas of vascular abnormalities are seen. No abnormal pit pattern is seen. NA'

In [6]:
from datasets import Dataset

In [7]:
ds = Dataset.from_pandas(df)

In [8]:
ds

Dataset({
    features: ['text'],
    num_rows: 37635
})

In [9]:
def tokenize(batch):
    return tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

In [10]:
ds = ds.map(tokenize, num_proc=4, batched=True)

                                                                                

In [11]:
ds = ds.remove_columns(["text"])

In [12]:
tts_ds = ds.train_test_split(test_size=0.3)

In [13]:
tts_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 26344
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 11291
    })
})

In [14]:
block_size = 128

In [15]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [16]:
lm_datasets = tts_ds.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

                                                                                

In [17]:
tokenizer.decode(lm_datasets["train"][17]["input_ids"])

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad></s>INDICATIONS FOR PROCEDURE: Other- Extent of'

In [25]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("../notebooks/models/checkpoint-10000").to('cuda')

loading configuration file ../notebooks/models/checkpoint-10000/config.json
Model config BioGptConfig {
  "_name_or_path": "../notebooks/models/checkpoint-10000",
  "activation_dropout": 0.0,
  "architectures": [
    "BioGptForCausalLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "layerdrop": 0.0,
  "max_position_embeddings": 1024,
  "model_type": "biogpt",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "scale_embedding": true,
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "use_cache": true,
  "vocab_size": 42384
}

loading weights file ../notebooks/models/checkpoint-10000/pytorch_model.bin
Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 2,
  "pad_token_id": 1,
  "transformers_version": "4.26.1"
}

All mode

In [28]:
from transformers import Trainer, TrainingArguments

In [29]:
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    output_dir="models",
    report_to=None
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [21]:
#import wandb

In [22]:
#wandb.init(mode="disabled")

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],

)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 36370
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13641
  Number of trainable parameters = 346763264


Epoch,Training Loss,Validation Loss
1,3.9309,4.564204


Saving model checkpoint to models/checkpoint-500
Configuration saved in models/checkpoint-500/config.json
Configuration saved in models/checkpoint-500/generation_config.json
Model weights saved in models/checkpoint-500/pytorch_model.bin
Saving model checkpoint to models/checkpoint-1000
Configuration saved in models/checkpoint-1000/config.json
Configuration saved in models/checkpoint-1000/generation_config.json
Model weights saved in models/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to models/checkpoint-1500
Configuration saved in models/checkpoint-1500/config.json
Configuration saved in models/checkpoint-1500/generation_config.json
Model weights saved in models/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to models/checkpoint-2000
Configuration saved in models/checkpoint-2000/config.json
Configuration saved in models/checkpoint-2000/generation_config.json
Model weights saved in models/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to models/checkp

In [None]:
import torch
torch.cuda.is_available()