In [5]:
import pandas as pd
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForCausalLM

from src.config import settings
from src.preparation import load_training_data
from src.training import tokenize_and_prepare_labels

In [ ]:
# Preview the raw training file as a DataFrame (inspection only).
training_data_df = pd.read_json(path_or_buf=settings.TRAINING_DATA_FILE)
training_data_df

In [6]:
# Loading the base model and its matching tokenizer.
# The checkpoint id is defined once so the tokenizer and the model can never
# end up pointing at different checkpoints.
BASE_MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
# Reuse the end-of-sequence token as the padding token so that padding has a
# token to use.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)

In [7]:
# Fetch the training examples through the project helper
# (presumably a list of strings -- confirm against src.preparation).
training_data = load_training_data()

# Wrap the examples in a Dataset with a single "text" column, then run the
# project's tokenization/label helper over it in batched mode.
dataset = Dataset.from_dict({"text": training_data})
tokenized_dataset = dataset.map(
    function=tokenize_and_prepare_labels,
    batched=True,
)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [8]:
# Setting up training arguments and trainer.
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_TRAIN_BATCH_SIZE = 2

# Estimate the number of optimizer steps for the run (ceil division written
# with integer arithmetic). This assumes a single device and no gradient
# accumulation; with more devices the real step count is lower.
steps_per_epoch = -(-len(tokenized_dataset) // PER_DEVICE_TRAIN_BATCH_SIZE)
total_train_steps = NUM_TRAIN_EPOCHS * steps_per_epoch

training_args = TrainingArguments(
    output_dir=settings.MODEL_FOLDER,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    # Warm up over roughly the first 10% of the run. A fixed 500-step warmup
    # is longer than the whole run for a dataset of a few hundred examples
    # (e.g. 300 examples -> ~450 steps), so the learning rate would never
    # reach its configured value.
    warmup_steps=max(1, total_train_steps // 10),
    weight_decay=0.01,
    logging_dir=settings.LOG_PATH,
    # The library default (every 500 steps) would log nothing for a run this
    # short; log often enough to get a usable training-loss curve.
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

In [ ]:
# Running training with the Trainer built in the previous cell.
# The call is left as the cell's last expression so its return value is
# displayed as the cell output.
trainer.train()

Step,Training Loss


In [ ]:
# Persist the model and its tokenizer side by side in the same directory.
model.save_pretrained(save_directory=settings.MODEL_WEIGHT_PATH)
tokenizer.save_pretrained(save_directory=settings.MODEL_WEIGHT_PATH)