In [1]:
from model import Model
from transformers import AdamW,PretrainedConfig,AutoTokenizer
from tqdm.auto import tqdm
from datasets import load_dataset
import numpy as np
from transformers import Trainer,TrainingArguments, DefaultDataCollator
import torch
import transformers
from torch.utils.tensorboard import SummaryWriter

In [2]:
# Load the three WikiText-103 ("wikitext-103-v1" config) splits from the Hugging Face Hub.
# `trust_remote_code` is deliberately not passed: that flag opts in to executing Python
# shipped inside the dataset repository, which loading these splits should not require.
# NOTE(review): assumes the repository serves plain data files (no loading script) — confirm.
# Only the train split is prepared with several worker processes.
WIKITEXT_REPO = "Salesforce/wikitext"
WIKITEXT_CONFIG = "wikitext-103-v1"
ds_train = load_dataset(WIKITEXT_REPO, WIKITEXT_CONFIG, split="train", num_proc=8)
ds_validate = load_dataset(WIKITEXT_REPO, WIKITEXT_CONFIG, split="validation", num_proc=1)
ds_test = load_dataset(WIKITEXT_REPO, WIKITEXT_CONFIG, split="test", num_proc=1)

In [3]:
# Keep only the training rows whose "text" field is truthy (drops empty strings).
ds_train = ds_train.filter(function=lambda row: bool(row["text"]))

Filter:   0%|          | 0/1801350 [00:00<?, ? examples/s]

In [4]:
# Keep only the test rows whose "text" field is truthy (drops empty strings).
ds_test = ds_test.filter(function=lambda row: bool(row["text"]))

Filter:   0%|          | 0/4358 [00:00<?, ? examples/s]

In [5]:
# Fetch the tokenizer published under the GPT-2 checkpoint name on the Hugging Face Hub.
GPT2_CHECKPOINT = "openai-community/gpt2"
gpt2_tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)

In [6]:
# Expose the GPT-2 tokenizer under the generic name `tokenizer` used by later cells
# (this is an alias, so the settings below also apply to `gpt2_tokenizer`).
tokenizer = gpt2_tokenizer
# Pad on the right-hand side, reusing the end-of-sequence token as the pad token;
# `preprocess` requests padding="max_length", so a pad token has to be set.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

In [7]:
# Instantiate the project-local `Model` from a bare PretrainedConfig and place it on the
# GPU when one is available; otherwise stay on the CPU instead of failing on "cuda".
# NOTE(review): max_length/min_length are 256 here while `preprocess` pads to 768 tokens —
# confirm how `Model` uses these config values.
model_config = PretrainedConfig(max_length=256, min_length=256)
model = Model(model_config).to("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Select the GPU when CUDA is available, otherwise the CPU, and move the model to the
# device that was actually selected (rather than to a hard-coded "cuda").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# NOTE(review): this optimizer is not handed to the Trainer built later in the notebook,
# and its lr (1e-5) differs from TrainingArguments.learning_rate (2e-5) — confirm whether
# it is still needed.
optimizer = AdamW(model.parameters(), lr=1e-5)



In [9]:
def preprocess(examples):
    """Tokenize the "text" entry of a batch with the module-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry; when used with
            `Dataset.map(..., batched=True)` this is a batch of raw strings.

    Returns:
        The tokenizer's output for those texts, requested with truncation and
        padding to a fixed length of 768 tokens.
    """
    tokenizer_options = {
        "max_length": 768,
        "padding": "max_length",
        "truncation": True,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [10]:
# Tokenize the filtered training split; batched=True hands `preprocess` whole batches.
tokeniezed_train_ds = ds_train.map(function=preprocess, batched=True)

Map:   0%|          | 0/1165029 [00:00<?, ? examples/s]

In [11]:
# Tokenize the filtered test split; batched=True hands `preprocess` whole batches.
tokeniezed_test_ds = ds_test.map(function=preprocess, batched=True)

Map:   0%|          | 0/2891 [00:00<?, ? examples/s]

In [13]:
# Hyper-parameters and bookkeeping options for the Hugging Face Trainer run.
training_args = TrainingArguments(
    # Where checkpoints and logs are written; nothing is pushed to the Hub.
    output_dir="my_model",
    logging_dir="log",
    push_to_hub=False,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_steps=10000,  # NOTE(review): presumably only consulted when save_strategy="steps" — confirm
    load_best_model_at_end=True,
    # Hand dataset columns to the collator untouched, and log every 50 steps.
    remove_unused_columns=False,
    logging_steps=50,
)

In [14]:
# Reduce each tokenized split to its "input_ids" column. `select_columns` keeps a Dataset
# whose rows are {"input_ids": [...]} mappings, the per-example form DefaultDataCollator
# batches; indexing with ["input_ids"] would instead hand the Trainer bare lists of ints.
# NOTE(review): assumes Model.forward accepts `input_ids` alone and returns a loss — confirm.
# NOTE(review): evaluation (and best-checkpoint selection) runs on the test split while
# `ds_validate` is loaded but unused — confirm this is intended.
train_inputs = tokeniezed_train_ds.select_columns(["input_ids"])
eval_inputs = tokeniezed_test_ds.select_columns(["input_ids"])

# TensorBoardCallback is a Trainer callback, not a metrics function, so it is registered
# through `callbacks` rather than `compute_metrics`.
tensorboard_callback = transformers.integrations.TensorBoardCallback(SummaryWriter("./tensorboard"))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs,
    eval_dataset=eval_inputs,
    tokenizer=tokenizer,
    data_collator=DefaultDataCollator(),
    callbacks=[tensorboard_callback],
)

# Run training, then write the final model into the current working directory.
trainer.train()
trainer.save_model("./")
        