In [2]:
import os
os.environ['TRANSFORMERS_CACHE'] = "/media/data/flowers/OpenELM/models"
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "Salesforce/codegen-350M-mono" # mono is better
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

Downloading (…)okenizer_config.json: 100%|██████████| 240/240 [00:00<00:00, 1.85MB/s]
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:00<00:00, 8.34MB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 1.86MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 2.11M/2.11M [00:00<00:00, 21.9MB/s]
Downloading (…)in/added_tokens.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 8.36MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 749kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 999/999 [00:00<00:00, 6.31MB/s]



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /media/data/flowers/conda/envs/codegpt/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /media/data/flowers/conda/envs/codegpt/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /media/data/flowers/conda/envs/codegpt/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Downloading pytorch_model.bin: 100%|██████████| 797M/797M [00:07<00:00, 109MB/s]  


In [3]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [4]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [5]:
model

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-19): 20 x CodeGenBlock(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear4bit(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear4bit(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear4bit(in_features=1024, out_features=4096, bias=True)
          (fc_out): Linear4bit(in_features=4096, out_features=1024, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=51200, bi

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["qkv_proj"],#["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 655360 || all params: 231538688 || trainable%: 0.2830455703368242


In [7]:
from datasets import load_dataset
path= "/media/data/flowers/OpenELM/logs/elm/23-08-16_16:29/step_9/save_all.json"
squad_it_dataset = load_dataset("json", data_files=path)


In [8]:
path_test = "/media/data/flowers/OpenELM/preprocess_p3.json" # trainset from P3
test_set = load_dataset("json", data_files=path_test)

In [9]:
def filter_correct_puzz(example):
    return (
        example["fitness"] >0.5
    )


In [10]:
correct_dataset = squad_it_dataset.filter(filter_correct_puzz)


In [11]:
# def tokenize(element):
#     context_length=2048
#     outputs = tokenizer(
#         element["program_str"],
#         truncation=True,
#         max_length=context_length,
#         # return_length=True,
#     )
#     input_batch = []
#     input_ids = outputs["input_ids"]
#     input_batch.append(input_ids)
#     # for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
#     #     if length == context_length:
#     #         input_batch.append(input_ids)
#     return {"input_ids": input_batch}


# tokenized_datasets = correct_dataset.map(
#     tokenize, batched=True, remove_columns=correct_dataset["train"].column_names
# )
tokenized_datasets = correct_dataset.map(lambda samples: tokenizer(samples["program_str"]), batched=True)
tok_test_set = test_set.map(lambda samples: tokenizer(samples["program_str"]), batched=True)

Map: 100%|██████████| 536/536 [00:00<00:00, 11610.23 examples/s]


In [12]:
import transformers

# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    eval_dataset = tok_test_set,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="epoch", #"steps",
        gradient_accumulation_steps=1,
        # warmup_steps=2,
        warmup_ratio=0.1,
        num_train_epochs = 5,
        # max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjulien_p[0m ([33mflowers-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/50 [00:00<?, ?it/s]You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  2%|▏         | 1/50 [00:00<00:44,  1.11it/s]

{'loss': 1.3192, 'learning_rate': 4e-05, 'epoch': 0.1}


  4%|▍         | 2/50 [00:01<00:29,  1.65it/s]

{'loss': 1.2797, 'learning_rate': 8e-05, 'epoch': 0.2}


  6%|▌         | 3/50 [00:01<00:22,  2.08it/s]

{'loss': 1.2789, 'learning_rate': 0.00012, 'epoch': 0.3}


  8%|▊         | 4/50 [00:02<00:26,  1.70it/s]

{'loss': 1.0197, 'learning_rate': 0.00016, 'epoch': 0.4}


 10%|█         | 5/50 [00:03<00:37,  1.20it/s]

{'loss': 0.9057, 'learning_rate': 0.0002, 'epoch': 0.5}


 12%|█▏        | 6/50 [00:04<00:30,  1.45it/s]

{'loss': 1.21, 'learning_rate': 0.00019555555555555556, 'epoch': 0.6}


 14%|█▍        | 7/50 [00:04<00:30,  1.40it/s]

{'loss': 0.9978, 'learning_rate': 0.00019111111111111114, 'epoch': 0.7}


 16%|█▌        | 8/50 [00:05<00:26,  1.58it/s]

{'loss': 0.9828, 'learning_rate': 0.0001866666666666667, 'epoch': 0.8}


 20%|██        | 10/50 [00:05<00:18,  2.16it/s]

{'loss': 1.065, 'learning_rate': 0.00018222222222222224, 'epoch': 0.9}
{'loss': 0.9693, 'learning_rate': 0.00017777777777777779, 'epoch': 1.0}


                                               
 20%|██        | 10/50 [00:19<00:18,  2.16it/s]

{'eval_train_loss': 1.660199522972107, 'eval_train_runtime': 13.1991, 'eval_train_samples_per_second': 40.609, 'eval_train_steps_per_second': 5.076, 'epoch': 1.0}


 22%|██▏       | 11/50 [00:20<03:04,  4.73s/it]

{'loss': 0.7039, 'learning_rate': 0.00017333333333333334, 'epoch': 1.1}


 24%|██▍       | 12/50 [00:20<02:08,  3.39s/it]

{'loss': 0.8582, 'learning_rate': 0.00016888888888888889, 'epoch': 1.2}


 26%|██▌       | 13/50 [00:21<01:35,  2.59s/it]

{'loss': 0.7853, 'learning_rate': 0.00016444444444444444, 'epoch': 1.3}


 28%|██▊       | 14/50 [00:21<01:10,  1.96s/it]

{'loss': 0.8413, 'learning_rate': 0.00016, 'epoch': 1.4}


 30%|███       | 15/50 [00:22<00:53,  1.52s/it]

{'loss': 0.7625, 'learning_rate': 0.00015555555555555556, 'epoch': 1.5}


 32%|███▏      | 16/50 [00:22<00:40,  1.20s/it]

{'loss': 0.6985, 'learning_rate': 0.0001511111111111111, 'epoch': 1.6}


 34%|███▍      | 17/50 [00:23<00:32,  1.02it/s]

{'loss': 0.7867, 'learning_rate': 0.00014666666666666666, 'epoch': 1.7}


 36%|███▌      | 18/50 [00:23<00:26,  1.23it/s]

{'loss': 0.6317, 'learning_rate': 0.00014222222222222224, 'epoch': 1.8}


 38%|███▊      | 19/50 [00:24<00:21,  1.42it/s]

{'loss': 0.7655, 'learning_rate': 0.0001377777777777778, 'epoch': 1.9}


 40%|████      | 20/50 [00:24<00:17,  1.76it/s]

{'loss': 0.6709, 'learning_rate': 0.00013333333333333334, 'epoch': 2.0}


                                               
 40%|████      | 20/50 [00:37<00:17,  1.76it/s]

{'eval_train_loss': 1.5563864707946777, 'eval_train_runtime': 13.0779, 'eval_train_samples_per_second': 40.985, 'eval_train_steps_per_second': 5.123, 'epoch': 2.0}


 42%|████▏     | 21/50 [00:38<02:12,  4.55s/it]

{'loss': 0.6116, 'learning_rate': 0.00012888888888888892, 'epoch': 2.1}


 44%|████▍     | 22/50 [00:38<01:33,  3.34s/it]

{'loss': 0.652, 'learning_rate': 0.00012444444444444444, 'epoch': 2.2}


 46%|████▌     | 23/50 [00:40<01:12,  2.70s/it]

{'loss': 0.4927, 'learning_rate': 0.00012, 'epoch': 2.3}


 48%|████▊     | 24/50 [00:40<00:51,  2.00s/it]

{'loss': 0.6166, 'learning_rate': 0.00011555555555555555, 'epoch': 2.4}


 50%|█████     | 25/50 [00:41<00:40,  1.62s/it]

{'loss': 0.54, 'learning_rate': 0.00011111111111111112, 'epoch': 2.5}


 52%|█████▏    | 26/50 [00:41<00:30,  1.26s/it]

{'loss': 0.5373, 'learning_rate': 0.00010666666666666667, 'epoch': 2.6}


 54%|█████▍    | 27/50 [00:42<00:23,  1.02s/it]

{'loss': 0.5697, 'learning_rate': 0.00010222222222222222, 'epoch': 2.7}


 56%|█████▌    | 28/50 [00:42<00:18,  1.21it/s]

{'loss': 0.5329, 'learning_rate': 9.777777777777778e-05, 'epoch': 2.8}


 60%|██████    | 30/50 [00:43<00:10,  1.83it/s]

{'loss': 0.5462, 'learning_rate': 9.333333333333334e-05, 'epoch': 2.9}
{'loss': 0.5312, 'learning_rate': 8.888888888888889e-05, 'epoch': 3.0}


                                               
 60%|██████    | 30/50 [00:56<00:10,  1.83it/s]

{'eval_train_loss': 1.5240408182144165, 'eval_train_runtime': 13.1262, 'eval_train_samples_per_second': 40.834, 'eval_train_steps_per_second': 5.104, 'epoch': 3.0}


 62%|██████▏   | 31/50 [00:56<01:24,  4.42s/it]

{'loss': 0.6012, 'learning_rate': 8.444444444444444e-05, 'epoch': 3.1}


 64%|██████▍   | 32/50 [00:57<01:02,  3.46s/it]

{'loss': 0.3749, 'learning_rate': 8e-05, 'epoch': 3.2}


 66%|██████▌   | 33/50 [00:58<00:42,  2.53s/it]

{'loss': 0.4471, 'learning_rate': 7.555555555555556e-05, 'epoch': 3.3}


 68%|██████▊   | 34/50 [00:58<00:30,  1.91s/it]

{'loss': 0.5525, 'learning_rate': 7.111111111111112e-05, 'epoch': 3.4}


 70%|███████   | 35/50 [00:59<00:22,  1.49s/it]

{'loss': 0.4666, 'learning_rate': 6.666666666666667e-05, 'epoch': 3.5}


 72%|███████▏  | 36/50 [00:59<00:17,  1.26s/it]

{'loss': 0.5554, 'learning_rate': 6.222222222222222e-05, 'epoch': 3.6}


 74%|███████▍  | 37/50 [01:00<00:13,  1.00s/it]

{'loss': 0.4748, 'learning_rate': 5.7777777777777776e-05, 'epoch': 3.7}


 76%|███████▌  | 38/50 [01:00<00:10,  1.19it/s]

{'loss': 0.4387, 'learning_rate': 5.333333333333333e-05, 'epoch': 3.8}


 80%|████████  | 40/50 [01:01<00:05,  1.78it/s]

{'loss': 0.4716, 'learning_rate': 4.888888888888889e-05, 'epoch': 3.9}
{'loss': 0.5042, 'learning_rate': 4.4444444444444447e-05, 'epoch': 4.0}


                                               
 80%|████████  | 40/50 [01:14<00:05,  1.78it/s]

{'eval_train_loss': 1.519470453262329, 'eval_train_runtime': 13.2192, 'eval_train_samples_per_second': 40.547, 'eval_train_steps_per_second': 5.068, 'epoch': 4.0}


 82%|████████▏ | 41/50 [01:15<00:41,  4.59s/it]

{'loss': 0.4714, 'learning_rate': 4e-05, 'epoch': 4.1}


 84%|████████▍ | 42/50 [01:15<00:26,  3.37s/it]

{'loss': 0.4744, 'learning_rate': 3.555555555555556e-05, 'epoch': 4.2}


 86%|████████▌ | 43/50 [01:16<00:17,  2.47s/it]

{'loss': 0.4963, 'learning_rate': 3.111111111111111e-05, 'epoch': 4.3}


 88%|████████▊ | 44/50 [01:16<00:11,  1.85s/it]

{'loss': 0.4534, 'learning_rate': 2.6666666666666667e-05, 'epoch': 4.4}


 90%|█████████ | 45/50 [01:17<00:07,  1.52s/it]

{'loss': 0.4502, 'learning_rate': 2.2222222222222223e-05, 'epoch': 4.5}


 92%|█████████▏| 46/50 [01:17<00:04,  1.21s/it]

{'loss': 0.4648, 'learning_rate': 1.777777777777778e-05, 'epoch': 4.6}


 94%|█████████▍| 47/50 [01:18<00:03,  1.21s/it]

{'loss': 0.391, 'learning_rate': 1.3333333333333333e-05, 'epoch': 4.7}


 96%|█████████▌| 48/50 [01:19<00:01,  1.00it/s]

{'loss': 0.389, 'learning_rate': 8.88888888888889e-06, 'epoch': 4.8}


100%|██████████| 50/50 [01:19<00:00,  1.65it/s]

{'loss': 0.4062, 'learning_rate': 4.444444444444445e-06, 'epoch': 4.9}
{'loss': 0.2851, 'learning_rate': 0.0, 'epoch': 5.0}


                                               
100%|██████████| 50/50 [01:33<00:00,  1.86s/it]

{'eval_train_loss': 1.5257525444030762, 'eval_train_runtime': 13.205, 'eval_train_samples_per_second': 40.591, 'eval_train_steps_per_second': 5.074, 'epoch': 5.0}
{'train_runtime': 100.1006, 'train_samples_per_second': 3.696, 'train_steps_per_second': 0.499, 'train_loss': 0.6666267824172973, 'epoch': 5.0}





TrainOutput(global_step=50, training_loss=0.6666267824172973, metrics={'train_runtime': 100.1006, 'train_samples_per_second': 3.696, 'train_steps_per_second': 0.499, 'train_loss': 0.6666267824172973, 'epoch': 5.0})