In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt", use_fast=True)

In [3]:
import pandas as pd
df = pd.read_csv('../data/df_xy.csv', index_col=False)

In [4]:
df = df[["text"]]
df = df.head(1000)

In [5]:
from datasets import Dataset

In [6]:
ds = Dataset.from_pandas(df)

In [7]:
ds

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [8]:
def tokenize(batch):
    return tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True)

In [9]:
ds = ds.map(tokenize, num_proc=4, batched=True)

                                                                

In [10]:
ds = ds.remove_columns(["text"])

In [11]:
tts_ds = ds.train_test_split(test_size=0.3)

In [12]:
tts_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 700
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 300
    })
})

In [13]:
block_size = 128

In [14]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [15]:
lm_datasets = tts_ds.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

                                                               

In [16]:
tokenizer.decode(lm_datasets["train"][17]["input_ids"])

'time to monitor the growth of the polyp. <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [17]:
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("microsoft/biogpt").to('cuda')

In [18]:
from transformers import Trainer, TrainingArguments

In [19]:
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    output_dir="models"
)

In [20]:
import wandb

In [21]:
wandb.init(mode="disabled")

AttributeError: module 'wandb' has no attribute 'init'

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],

)

In [25]:
trainer.train()

***** Running training *****
  Num examples = 1664
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 624
  Number of trainable parameters = 346763264


Epoch,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 714
  Batch size = 8


AttributeError: module 'wandb' has no attribute 'log'

In [24]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [None]:
import torch
torch.cuda.is_available()