In [1]:
!pip install opendatasets datasets trl



In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

In [19]:
batch_size = 4
num_workers = os.cpu_count()
max_steps = 6000
bf16 = False
fp16 = True
gradient_accumulation_steps = 2
context_length = 512
logging_steps = 500
save_steps = 500
learning_rate = 0.0001
model_name = 'openai-community/gpt2'
out_dir = 'outputs/gpt2-finetuned'

In [4]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
124,439,808 total parameters.
124,439,808 training parameters.


In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
dataset = load_dataset('JM-Lee/Phi-3-mini-128k-instruct_instruction')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['system', 'instruction', 'response'],
        num_rows: 61135
    })
})


In [7]:
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 58078
})
Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 3057
})


In [8]:
def preprocess_function(example):
    """
    Formatting function returning a list of samples (kind of necessary for SFT API).
    """
    text = f"### Instruction:\n{example['system']}\n\n### Input:\n{example['instruction']}\n\n### Response:\n{example['response']}"
    return text

In [20]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='steps',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    max_steps=max_steps,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break


<Constitutions>
1. You are a helpful and informative AI assistant.
2. You should not respond with false information.
3. When you solve the problem, you should think step by step.

Make sure you follow the rules.

### Input:
Detailed Instructions: You are given an original reference as well as a system reference. Your task is to judge the quality of the system reference. If the utterance is grammatically correct and fluent output 1, else output 0.
Problem:System Reference: what part of the city are you looking for?
Original Reference: in what area are you looking for a hotel?.
Solution:

### Response:
<|start_markup_id|>[[[1]]]<|end_markup_id|>
<|start_physics_explanation_id|>The system reference provided is grammatically correct and fluent. It effectively conveys the same meaning as the original reference, asking about the specific area or location within the city where the user is seeking a hotel. Therefore, the system reference meets the criteria for a high-quality output. <|end_phy

In [22]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,1.5808,1.681918
1000,1.5551,1.672241
1500,1.5487,1.669227
2000,1.5567,1.667471
2500,1.5613,1.665127
3000,1.5385,1.6385
3500,1.6995,1.623
4000,1.7035,1.61668
4500,1.7114,1.610285
5000,1.6956,1.602721


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


In [23]:
model.save_pretrained(f"{out_dir}/best_model")
tokenizer.save_pretrained(f"{out_dir}/best_model")

('outputs/gpt2-finetuned/best_model/tokenizer_config.json',
 'outputs/gpt2-finetuned/best_model/special_tokens_map.json',
 'outputs/gpt2-finetuned/best_model/vocab.json',
 'outputs/gpt2-finetuned/best_model/merges.txt',
 'outputs/gpt2-finetuned/best_model/added_tokens.json')

# Inference

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=512, # Prompt + new tokens to generate.
    device_map=device
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [29]:
template = """### Instruction:
{}
### Input:
{}
### Response:
{}"""

instructions = 'Write three tips to study well for exam.'
inputs = ''
response = ''
prompt = template.format(instructions, inputs, response)

In [30]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

### Instruction:
Write three tips to study well for exam.
### Input:

### Response:
1. **Understanding Your Learning Style**: Begin by understanding the structure of your exams and the key topics. This will help you understand how they are designed, the learning style, and the overall learning experience.

2. **Set Boundaries and Expectations**: Make sure that your answers align with your learning style. This includes understanding your subject matter, understanding the topics, and the objectives of each topic.
   - **Understand Your Topic**: Learn about different types of questions and explain why they are relevant to your learning style.
   - **Follow the Process**: Follow the process of setting up the questions and providing them with clear instructions.
   - **Review and Adjust**: Once everything is set in place, review your answers and adjust your schedule as needed.

3. **Practice and Experiment**: Continuously practice and experiment with different subjects to see how you learn.