In [1]:
!pip install opendatasets datasets trl

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Collecting trl
  Downloading trl-0.10.1-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.10-py3-none-any.whl.metadata (8.4 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Downloading trl-0.10.1-py3-none-any.whl (280 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.1/280.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.8.10-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.7/105.7 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.7.1-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, opendatasets, trl
Successfully installed opendatasets-0.1.22 shtab-1.7.1 trl-0.10.1 tyro-0.8.10


In [38]:
import os
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    TrainingArguments,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    logging,
)
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")

In [14]:
batch_size = 16
num_workers = os.cpu_count()
max_steps = 6000
bf16 = False
fp16 = True
gradient_accumulation_steps = 2
context_length = 256
logging_steps = 500
save_steps = 500
learning_rate = 0.0001
model_name = 'openai-community/gpt2'
out_dir = 'outputs/gpt2-as-Controller'

In [15]:
if bf16:
    model = AutoModelForCausalLM.from_pretrained(model_name).to(dtype=torch.bfloat16)
else:
    model = AutoModelForCausalLM.from_pretrained(model_name)
print(model)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)
124,439,808 total parameters.
124,439,808 training parameters.


In [16]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

In [17]:
dataset = load_dataset('NathanGavenski/LunarLander-v2')
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['obs', 'actions', 'rewards', 'episode_starts'],
        num_rows: 383994
    })
})


In [18]:
full_dataset = dataset['train'].train_test_split(test_size=0.05, shuffle=True)
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['obs', 'actions', 'rewards', 'episode_starts'],
    num_rows: 364794
})
Dataset({
    features: ['obs', 'actions', 'rewards', 'episode_starts'],
    num_rows: 19200
})


In [19]:
def preprocess_function(example):
    """
    Formatting function returning a list of samples (kind of necessary for SFT API).
    """
    text = f"### Instruction:\n Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander\n\n### Input:\n{example['obs']}\n\n### Response:\n{example['actions']}"
    return text

In [20]:
training_args = TrainingArguments(
    output_dir=f"{out_dir}/logs",
    evaluation_strategy='steps',
    weight_decay=0.01,
    load_best_model_at_end=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_strategy='steps',
    save_strategy='steps',
    logging_steps=logging_steps,
    save_steps=save_steps,
    save_total_limit=2,
    bf16=bf16,
    fp16=fp16,
    report_to='tensorboard',
    max_steps=max_steps,
    dataloader_num_workers=num_workers,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type='constant',
)

In [21]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_valid,
    max_seq_length=context_length,
    tokenizer=tokenizer,
    args=training_args,
    formatting_func=preprocess_function,
    packing=True
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


# Sample format

In [22]:
dataloader = trainer.get_train_dataloader()
for i, sample in enumerate(dataloader):
    print(tokenizer.decode(sample['input_ids'][0]))
    print('#'*50)
    if i == 5:
        break

:
[0.03406524658203125, 0.04434901103377342, 0.08408885449171066, -0.16637031733989716, 0.023665497079491615, 0.016206054016947746, 0.0, 0.0]

### Response:
2<|endoftext|>### Instruction:
 Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander

### Input:
[-0.01038217544555664, -0.0006284856935963035, -3.273901165812276e-05, 1.6760782983737954e-08, -0.006565684452652931, 9.54290953814052e-06, 1.0, 1.0]

### Response:
0<|endoftext|>### Instruction:
 Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to
##################################################
 the action for control the lunar lander

### Input:
[-0.10898733139038086, 0.3769143223762512, 0.05202445387840271, -0.12600871920585632, -0.021979112178087234, -0.03903788700699806, 0.0, 0.0]

### Response:
1<|endoftext|>### Instruction:

# Training

In [23]:
history = trainer.train()

Step,Training Loss,Validation Loss
500,1.9031,1.833427
1000,1.8426,1.812069
1500,1.8257,1.802644
2000,1.8151,1.794165
2500,1.8079,1.789648
3000,1.8024,1.78322
3500,1.7947,1.78164
4000,1.7904,1.77572
4500,1.7852,1.769558
5000,1.7794,1.761404


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


# Evaluation

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    max_length=context_length, # Prompt + new tokens to generate.
    device_map=device
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [26]:
template = """### Instruction:
{}
### Input:
{}
### Response:
{}"""

instructions = """
                Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander.
"""
inputs = dataset_train['obs'][0]
response = ''
prompt = template.format(instructions, inputs, response)

In [27]:
prompt

'### Instruction:\n\n                Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander.\n\n### Input:\n[-0.06165885925292969, 1.2601277828216553, -0.5038522481918335, -0.5353798270225525, 0.008095242083072662, -0.04861641302704811, 0.0, 0.0]\n### Response:\n'

In [28]:
outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
)
print(outputs[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Instruction:

                Assume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander.

### Input:
[-0.06165885925292969, 1.2601277828216553, -0.5038522481918335, -0.5353798270225525, 0.008095242083072662, -0.04861641302704811, 0.0, 0.0]
### Response:
2


In [29]:
int(outputs[0]['generated_text'].split('\n')[-1])

2

## Prediction

In [34]:
import time
start_time = time.time()

predicted_actions = []

for state in dataset_valid['obs']:
    # Generate response from GPT-2 model
    prompt = f"### Instruction:\nAssume You are required for control lunar lander. So you want to control the lunar lander efficiently. The task that is you want to predict the action for control the lunar lander.\n\n### Input:\n{state}\n### Response:"
    
    # Generate output from the model
    outputs = pipe(
        prompt,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
    )
    
    # Extract predicted action (after the 'Response:')
    generated_text = outputs[0]['generated_text'].split('Response:')[-1].strip()
    
    # Check if the prediction is empty, if so, default to 0
    try:
        predicted_action = int(generated_text)
    except ValueError:
        predicted_action = 0  # In case the output is not a valid integer

    # Append predicted action to the list
    predicted_actions.append(predicted_action)

end_time = time.time()
print(f"Time took : {end_time - start_time}")

Time took : 944.0735626220703


In [40]:
actual_actions = dataset_valid['actions']
actual_actions = np.array(actual_actions)

predicted_actions = np.array(predicted_actions)

In [42]:
from sklearn.metrics import confusion_matrix, accuracy_score

conf_matrix = confusion_matrix(actual_actions, predicted_actions)
accuracy = accuracy_score(actual_actions, predicted_actions)

print("Confusion Matrix:")
print(conf_matrix)
print(f"Accuracy: {accuracy * 100:.2f}%")

Confusion Matrix:
[[3850  425  655  759]
 [ 237 1295  360  151]
 [ 141  197 8777  310]
 [ 246   29  509 1259]]
Accuracy: 79.07%


In [43]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [44]:
model.push_to_hub("ErnestBeckham/GPT-2-as-Controller")

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 50, 'do_sample': True}


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ErnestBeckham/GPT-2-as-Controller/commit/261b35caa24eceea1197a9def7e4c679f3e1b2a7', commit_message='Upload model', commit_description='', oid='261b35caa24eceea1197a9def7e4c679f3e1b2a7', pr_url=None, pr_revision=None, pr_num=None)

In [45]:
tokenizer.push_to_hub("ErnestBeckham/GPT-2-as-Controller")

CommitInfo(commit_url='https://huggingface.co/ErnestBeckham/GPT-2-as-Controller/commit/05089b18addb0abb2f1ec638ca8ce565bf85d589', commit_message='Upload tokenizer', commit_description='', oid='05089b18addb0abb2f1ec638ca8ce565bf85d589', pr_url=None, pr_revision=None, pr_num=None)