<a href="https://colab.research.google.com/github/Hydrochilli/LLM-Training/blob/main/GPT2__invoice_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install the required libraries
!pip install transformers datasets wandb

# Import required libraries
import wandb
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset

# Log in to W&B
wandb.login()

# Load the model and tokenizer
model_name = "distilgpt2"  # Using a smaller model for training
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as the padding token

print(f"Loaded model: {model_name}")

# Create an updated JSONL dataset with clear delimiters
with open("sample_invoices.jsonl", "w") as file:
    file.write('{"prompt": "Extract the customer number: \'Invoice number: 100-13, Customer number: 2, Total: 744.00 GBP\' | Answer:", "completion": "2"}\n')
    file.write('{"prompt": "What is the invoice date? \'Invoice date: 24/08/2021, Due date: 23/09/2021\' | Answer:", "completion": "24/08/2021"}\n')
    file.write('{"prompt": "Identify the invoice number: \'Invoice number: 100-14, Total: 525.00 GBP\' | Answer:", "completion": "100-14"}\n')
    file.write('{"prompt": "Find the total amount due: \'Total excl. VAT: 435.00 GBP, VAT 20%: 90.00 GBP, Total: 525.00 GBP\' | Answer:", "completion": "525.00 GBP"}\n')
    file.write('{"prompt": "Extract the VAT amount: \'Total excl. VAT: 620.00 GBP, VAT 20%: 124.00 GBP, Total: 744.00 GBP\' | Answer:", "completion": "124.00 GBP"}\n')

# Load the dataset
dataset = load_dataset("json", data_files="sample_invoices.jsonl")
train_dataset = dataset["train"]

# Tokenize the dataset
def tokenize_function(example):
    tokenized = tokenizer(example["prompt"], padding="max_length", truncation=True, max_length=64)
    with tokenizer.as_target_tokenizer():  # Ensure labels use the same tokenizer
        tokenized["labels"] = tokenizer(example["completion"], padding="max_length", truncation=True, max_length=64)["input_ids"]
    return tokenized

tokenized_datasets = train_dataset.map(tokenize_function, batched=True)

# Set up training arguments with W&B integration
training_args = TrainingArguments(
    output_dir="./results",                # Directory for model checkpoints
    per_device_train_batch_size=1,         # Small batch size for limited resources
    num_train_epochs=3,                    # Number of training epochs
    logging_dir="./logs",                  # Directory for logs
    logging_steps=5,                       # Log every 5 steps
    save_steps=10,                         # Save the model every 10 steps
    gradient_checkpointing=True,           # Reduces memory usage
    report_to="wandb",                     # Enable W&B integration
    run_name="invoice-field-extraction",   # Name of the W&B run
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

# Train the model
trainer.train()

# Test the trained model
prompt = "Extract the customer number: 'Invoice number: 100-15, Customer number: 3, Total: 600.00 GBP' | Answer:"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate output with explicit control over the length
outputs = model.generate(**inputs, max_new_tokens=5)  # Limit generation to 5 new tokens
print("Generated Output:", tokenizer.decode(outputs[0], skip_special_tokens=True))






Loaded model: distilgpt2


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



Step,Training Loss
5,4.0197
10,0.5613
15,0.4769


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Output: Extract the customer number: 'Invoice number: 100-15, Customer number: 3, Total: 600.00 GBP' | Answer:


