In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments

In [2]:
model_name  = "EleutherAI/pythia-410m"

In [3]:
data_files = {
    "train": [
        # "data/datascience_2000.jsonl",
        "data/datascience_1000_multistep.jsonl",
        # "data/datascience_1000_errors.jsonl"
    ]
}

In [4]:
raw_ds = load_dataset("json", data_files=data_files, split="train")
splits = raw_ds.train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
def format_example(ex):
    prompt = f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}"
    prompt += tokenizer.eos_token
    return {"text": prompt}

train_ds = train_ds.map(format_example, remove_columns=train_ds.column_names)
eval_ds  = eval_ds.map(format_example,  remove_columns=eval_ds.column_names)

In [7]:
def tokenize_fn(ex):
    return tokenizer(ex["text"], truncation=True, max_length=512, padding="max_length")
train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=["text"])
eval_tok  = eval_ds.map(tokenize_fn,  batched=True, remove_columns=["text"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

In [11]:
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (query_key_value): Linear4bit(in_features=1024, out_features=3072, bias=True)
          (dense): Linear4bit(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear4bit(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear4bit(in_features=4096, out_features=1024, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((102

In [12]:
instruction = "Filter df for rows where 'category' contains 'A', then encode 'city' with one-hot, and train/test split with 80-20%."

In [13]:
inputs = tokenizer(instruction, return_tensors="pt").to("cuda")
tokens = model.generate(**inputs, max_new_tokens=512, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(tokens[0]))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Filter df for rows where 'category' contains 'A', then encode 'city' with one-hot, and train/test split with 80-20%.

I am not sure if this is the best way to go about it.

A:

I think this is the best way to go about it.
I am not sure if this is the best way to go about it.

I think it is.  I would recommend using a one-hot encoding.  One-hot encoding is a way to encode a vector of one-hot values.  One-hot values are one-hot, but not necessarily one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-one means that the value is one-to-one.  One-to-o

In [14]:
# model.gradient_checkpointing_enable()
# model.enable_input_require_grads()
# model.config.use_cache = False

In [15]:
lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

In [16]:
model = get_peft_model(model, lora_cfg)

In [17]:
training_args = TrainingArguments(
    output_dir="./pythia-lora",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
    learning_rate=3e-4,
    fp16=True,
    logging_steps=10,
    save_total_limit=2,
    report_to="none",
)

In [18]:
print(model.get_memory_footprint()/1e6)

360.82896


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [20]:
trainer.train()

Step,Training Loss
10,1.5518
20,0.8939
30,0.5219
40,0.3004
50,0.1791
60,0.1281
70,0.1031
80,0.0916
90,0.0871
100,0.0817


TrainOutput(global_step=150, training_loss=0.28791256388028463, metrics={'train_runtime': 47644.2927, 'train_samples_per_second': 0.189, 'train_steps_per_second': 0.003, 'total_flos': 9804233834496000.0, 'train_loss': 0.28791256388028463, 'epoch': 10.0})

In [32]:
model.save_pretrained("./pythia-lora-final")
tokenizer.save_pretrained("./pythia-lora-final")

('./pythia-lora-final\\tokenizer_config.json',
 './pythia-lora-final\\special_tokens_map.json',
 './pythia-lora-final\\tokenizer.json')

In [22]:
"FINISHED"

'FINISHED'

In [23]:
prompt_template = f"### Instruction:\n{instruction}\n\n### Response:\n"

In [31]:
inputs = tokenizer(prompt_template, return_tensors="pt").to("cuda")
tokens = model.generate(**inputs, max_new_tokens=512, eos_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(tokens[0]))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


### Instruction:
Filter df for rows where 'category' contains 'A', then encode 'city' with one-hot, and train/test split with 80-20%.

### Response:
import pandas as pd
from sklearn.model_selection import train_test_split
df = df[df['category'].str.contains('A', na=False)]
df_ohe = pd.get_dummies(df, columns=['city'])
train, test = train_test_split(df_ohe, test_size=0.2, random_state=42)
print(train.shape, test.shape)
df_ohe.to_dict(orient='records')
df_ohe.to_csv('test.csv', index=False)
df_ohe.drop('city', 1)
df_ohe.to_csv('city.csv', index=False)

### Response:
df_ohe = df_ohe.to_dict(orient='records')
df_ohe.to_csv('city.csv', index=False)
df_ohe.drop('city', 1)
df_ohe.to_csv('city.csv', index=False)

### Response:
df_ohe = df_ohe.to_dict(orient='records')
df_ohe = pd.get_dummies(df_ohe, columns=['city'])
df_ohe.to_csv('city.csv', index=False)
df_ohe.drop('city', 1)
df_ohe.to_csv('city.csv', index=False)

### Response:
df_ohe = df_ohe.to_dict(orient='records')
df_ohe = pd.get_dummi