In [1]:
!pip install opendatasets datasets trl

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Downloading trl-0.10.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.1/280.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading tyro-0.8.10-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.7/105.7 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, opendatasets, trl
Successfully installed opendatasets-0.1.22 shtab-1.7.1 trl-0.10.1 tyro-0.8.10


In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

In [3]:
batch_size = 4
num_workers = os.cpu_count()
max_steps = 6000
bf16 = False
fp16 = True
gradient_accumulation_steps = 2
context_length = 512
logging_steps = 500
save_steps = 500
learning_rate = 0.0001
model_name = 'openai-community/gpt2-medium'
out_dir = 'outputs/gpt2-finetuned'

In [4]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)
354,823,168 total parameters.
354,823,168 training parameters.


In [5]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [6]:
dataset = load_dataset('JM-Lee/Phi-3-mini-128k-instruct_instruction')
print(dataset)

Downloading readme:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/68.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61135 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['system', 'instruction', 'response'],
        num_rows: 61135
    })
})


In [7]:
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 58078
})
Dataset({
    features: ['system', 'instruction', 'response'],
    num_rows: 3057
})


In [8]:
def preprocess_function(example):
    """
    Formatting function returning a list of samples (kind of necessary for SFT API).
    """
    text = f"### Instruction:\n{example['system']}\n\n### Input:\n{example['instruction']}\n\n### Response:\n{example['response']}"
    return text

In [9]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='steps',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    max_steps=max_steps,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

In [10]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True
)

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1399 > 1024). Running this sequence through the model will result in indexing errors


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


# Sample

In [11]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

 = \sum_{i=0}^{2} nums[i] = 12$.
\end{document}
```

However, this approach is not recommended because it's not practical to compute the sum directly in LaTeX. It's better to calculate the sum in your programming language and then typeset the result in LaTeX.<|endoftext|>### Instruction:

You are helpful and informative ai assistant.

<Constitutions>
1. You are a helpful and informative AI assistant.
2. You should not respond with false information.
3. When you solve the problem, you should think step by step.

Make sure you follow the rules.

### Input:
instruction:
In this task, you are given a sentence in the English language and your task is to convert it into the Japanese language. In translation, keep numbers as it is and make it sentence case (capitalize only the first word of each sentence and noun).
question:
Before the act, he posted a suicide note on his website.
answer:
自殺する前に、彼はウェブサイトに遺書を公表した。


question:
The train was heading from the northeastern town of Bijelo Polje to 

# Training

In [12]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,1.7153,1.586389
1000,1.6364,1.548085
1500,1.6054,1.51155
2000,1.5574,1.487412
2500,1.5374,1.47286
3000,1.5493,1.459531
3500,1.5129,1.449006
4000,1.509,1.435389
4500,1.4965,1.428405
5000,1.489,1.421715


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


# Inference

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=512, # Prompt + new tokens to generate.
    device_map=device
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [16]:
template = """### Instruction:
{}
### Input:
{}
### Response:
{}"""

instructions = 'Write three tips to learn computer science.'
inputs = ''
response = ''
prompt = template.format(instructions, inputs, response)

In [17]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

### Instruction:
Write three tips to learn computer science.
### Input:

### Response:
1. **Learn Python Programming Language**: Python is a free, open-source, and widely used programming language that can be used for web development, web application development, and more. It's a powerful tool for beginners, as it's easy to understand and versatile for different projects.

2. **Learn Java Programming Language**: Java is a popular choice for developing web applications, but it's also known as the "language of the browser" due to its simplicity and ease of use.
   - Learn Java syntax, syntax error handling, and object-oriented concepts.

3. **Learn JavaScript Programming Language**: JavaScript is a JavaScript-based scripting language that can be used in various programming languages. It's known for its simplicity and efficiency, making it a great choice for beginners.
