In [None]:
from datasets import load_dataset

# Load the TruthfulQA dataset ('generation' configuration) and show its splits
dataset = load_dataset('truthful_qa', 'generation')
print(dataset)

# Carve an 80/20 train/test partition out of the 'validation' split.
# train_test_split shuffles before splitting; pinning the seed keeps the
# held-out test set identical across notebook re-runs, so evaluation numbers
# from different runs are comparable.
train_test = dataset['validation'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test['train']
test_dataset = train_test['test']

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset

# GPT-2 small: both the model weights and the tokenizer come from the 'gpt2' checkpoint
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so padded batches can be built
tokenizer.pad_token = tokenizer.eos_token

# Build the language-modeling text for one record: question followed by its best answer
def format_example(example):
    """Return a single-key dict whose 'text' value is the training prompt.

    Reads ``example['question']`` and ``example['best_answer']`` and places
    them on two lines, prefixed with "Question: " and "Answer: " respectively.
    """
    question = example['question']
    best_answer = example['best_answer']
    prompt = "Question: {}\nAnswer: {}".format(question, best_answer)
    return {'text': prompt}

# Add the 'text' column to both splits (train first, then test)
train_formatted, test_formatted = (
    split.map(format_example) for split in (train_dataset, test_dataset)
)

# Tokenize
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples['text']``.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encodings produced here have the same length.
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 128,
    }
    texts = examples['text']
    return tokenizer(texts, **encoding_options)

def add_causal_lm_labels(batch):
    """Attach a 'labels' column for causal language modeling.

    The labels are a copy of ``batch['input_ids']`` in which every position
    whose ``batch['attention_mask']`` entry is 0 (padding) is replaced by
    -100, the index the loss ignores. The one-token shift between inputs and
    targets is applied inside the model, so it must not be done here.

    Padding is detected through the attention mask rather than by comparing
    token ids, because the pad token is the EOS token in this notebook and an
    id comparison would also mask genuine EOS tokens.
    """
    return {
        'labels': [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(batch['input_ids'], batch['attention_mask'])
        ]
    }


train_tokenized = train_formatted.map(tokenize_function, batched=True, remove_columns=['text'])
test_tokenized = test_formatted.map(tokenize_function, batched=True, remove_columns=['text'])

# Labels must be token ids, not the 0/1 attention mask; 'attention_mask' is
# kept as its own column so the model can still ignore padded positions.
train_tokenized = train_tokenized.map(add_causal_lm_labels, batched=True)
test_tokenized = test_tokenized.map(add_causal_lm_labels, batched=True)