# Model Training & Evaluation

This notebook covers fine-tuning DistilGPT-2 on the GSM8K dataset, hyperparameter tuning, and evaluation using BLEU, F1, and qualitative tests.

# Imports and Data Loading

In [12]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, TFAutoModelForCausalLM, DataCollatorForLanguageModeling, create_optimizer
from transformers import TextDataset, TrainingArguments, Trainer
import tensorflow as tf

# Load preprocessed data
train_df = pd.read_csv('../data/main_train_clean.csv').iloc[:1000]
test_df = pd.read_csv('../data/main_test_clean.csv').iloc[:200]

## Prepare Data for DistilGPT-2

We will concatenate each question and answer as a single string for training, separated by a special token.

In [13]:
SEP_TOKEN = "<|sep|>"

train_texts = (train_df['question'] + f" {SEP_TOKEN} " + train_df['answer']).tolist()
test_texts = (test_df['question'] + f" {SEP_TOKEN} " + test_df['answer']).tolist()

# Save to .txt files for Hugging Face TextDataset
with open('../data/train_gpt2.txt', 'w', encoding='utf-8') as f:
    for line in train_texts:
        f.write(line + '\n')

with open('../data/test_gpt2.txt', 'w', encoding='utf-8') as f:
    for line in test_texts:
        f.write(line + '\n')

## Load Tokenizer and Model

In [14]:
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
tokenizer.add_special_tokens({'additional_special_tokens': [SEP_TOKEN]})

tokenizer.pad_token = tokenizer.eos_token
model = TFAutoModelForCausalLM.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.eos_token_id

model = TFAutoModelForCausalLM.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


<keras.src.layers.core.embedding.Embedding at 0x28dc185ada0>

## Encode Data for Training

We will encode the data as input IDs and attention masks for TensorFlow training.

In [15]:
def encode_examples(texts, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    for text in texts:
        encodings = tokenizer(
            text,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='tf'
        )
        input_ids.append(encodings['input_ids'][0])
        attention_masks.append(encodings['attention_mask'][0])
    return tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': tf.stack(input_ids),
            'attention_mask': tf.stack(attention_masks)
        },
        tf.stack(input_ids)
    ))

max_length = 128
train_dataset = encode_examples(train_texts, tokenizer, max_length).batch(8)
test_dataset = encode_examples(test_texts, tokenizer, max_length).batch(8)

## Fine-Tune DistilGPT-2

We will use TensorFlow's Keras API for training.

In [16]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss)

# Train for 1 epoch for demo; increase epochs for better results
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=1
)
model.save_pretrained('../model/distilgpt2-finetuned')
tokenizer.save_pretrained('../model/distilgpt2-finetuned')



('../model/distilgpt2-finetuned\\tokenizer_config.json',
 '../model/distilgpt2-finetuned\\special_tokens_map.json',
 '../model/distilgpt2-finetuned\\vocab.json',
 '../model/distilgpt2-finetuned\\merges.txt',
 '../model/distilgpt2-finetuned\\added_tokens.json',
 '../model/distilgpt2-finetuned\\tokenizer.json')

## Evaluation

We will evaluate the model using BLEU, F1, and qualitative tests.

In [18]:
def generate_answer(question, max_new_tokens=50):
    input_ids = tokenizer.encode(question + f" {SEP_TOKEN}", return_tensors='tf')
    output = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    answer = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract only the answer part
    if SEP_TOKEN in answer:
        answer = answer.split(SEP_TOKEN)[-1].strip()
    return answer

# BLEU and F1 on a small sample
bleu_scores = []
f1_scores = []
for i in range(10):  # Evaluate on 10 samples for speed
    q = test_df.iloc[i]['question']
    true_a = test_df.iloc[i]['answer']
    pred_a = generate_answer(q)
    bleu = sentence_bleu([true_a.split()], pred_a.split())
    bleu_scores.append(bleu)
    # F1 (token-level, simple)
    true_tokens = set(true_a.split())
    pred_tokens = set(pred_a.split())
    common = true_tokens & pred_tokens
    if len(true_tokens) > 0:
        f1 = 2 * len(common) / (len(true_tokens) + len(pred_tokens))
    else:
        f1 = 0
    f1_scores.append(f1)
print('Average BLEU:', np.mean(bleu_scores))
print('Average F1:', np.mean(f1_scores))

Average BLEU: 0.0419808595443527
Average F1: 0.3747262145001923


In [19]:
for i in range(5):
    q = test_df.iloc[i]['question']
    print(f"Q: {q}")
    print(f"Model: {generate_answer(q)}")
    print(f"True: {test_df.iloc[i]['answer']}")
    print('-'*40)

Q: janet’s ducks lay 16 eggs per day. she eats three for breakfast every morning and bakes muffins for her friends every day with four. she sells the remainder at the farmers' market daily for $2 per fresh duck egg. how much in dollars does she make every day at the farmers' market?
Model: janet’s ducks lay 16 eggs per day. she eats three for breakfast every morning and bakes muffins for her friends every day with four. she sells the remainder at the farmers' market daily for $2 per fresh duck egg. how much in dollars does she make every day at the farmers' market? 
True: janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
she makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
----------------------------------------
Q: a robe takes 2 bolts of blue fiber and half that much white fiber.  how many bolts in total does it take?
Model: a robe takes 2 bolts of blue fiber and half that much white fiber.  how many bolts in total does it take? 
True: it takes 2/2=<<2/2=1>