In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt", use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
df = pd.read_csv('../data/df_xy.csv', index_col=False)

In [3]:
df = df[["text"]]
df = df.head(100)

In [4]:
from datasets import Dataset

In [5]:
ds = Dataset.from_pandas(df)

In [6]:
ds

Dataset({
    features: ['text'],
    num_rows: 100
})

In [7]:
def tokenize(batch):
    return tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

In [8]:
ds = ds.map(tokenize, num_proc=4, batched=True)

                                                             

In [9]:
ds = ds.remove_columns(["text"])

In [10]:
tts_ds = ds.train_test_split(test_size=0.3)

In [11]:
tts_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 70
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 30
    })
})

In [12]:
block_size = 128

In [13]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [14]:
lm_datasets = tts_ds.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

                                                   

In [15]:
tokenizer.decode(lm_datasets["train"][17]["input_ids"])

': The patient should avoid drinking carbonated beverages, as these can worsen reflux symptoms. RECOMMENDATION: A barium swallow study may be ordered to assess the extent of the inflammation. </s>Other- persumed per DU managed conservativelyPrevious persumed DU ulcer managed conservatively. D1 Normal gastroscopy to the duodenum. FOLLOW UP: The patient should avoid stress and maintain good mental health, as stress can delay the healing of the ulcer. RECOMMENDATION: Follow-up endoscopy should be performed in 4-6 weeks to assess for healing and to repeat biopsies if necessary. <pad><pad><pad><pad><pad><pad><pad>'

In [16]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt").to('cuda')

In [17]:
from transformers import Trainer, TrainingArguments

In [18]:
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    output_dir="models",
    report_to=None
)

In [19]:
import wandb

In [20]:
#wandb.init(mode="disabled")

In [21]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],

)

In [22]:
trainer.train()

***** Running training *****
  Num examples = 126
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 48
  Number of trainable parameters = 346763264
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


AttributeError: module 'wandb' has no attribute 'run'

In [None]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
import torch
torch.cuda.is_available()