training_conf = Namespace(rng_seed=2703368087, learning_rate='5e-5', gradient_checkpointing=False, gradient_accumulation_steps=2, per_device_train_batch_size=4, per_device_eval_batch_size=4, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon='1e-12', weight_decay=0.0, warmup_steps=600, eval_steps=200, save_strategy='steps', save_steps=1000, max_length=512, val_max_length=None, num_train_epochs=3, logging_steps=10, max_grad_norm=2.0, save_total_limit=4, dtype='fp16', eval_accumulation_steps=None, freeze_layer=None, datasets=['webgpt'], datasets_extra=[], cache_dir='.cache', loss_fn='CrossEntropyLoss', eval_size=None, log_dir='base', quantization=False, seq2seqmodel=False, poly_eps=1.0, fuse_gelu=True, log_wandb=False, samples_mixing=False, verbose=False, output_dir='saved_model', use_custom_sampler=False, random_offset_probability=0.8, label_masking=True, residual_dropout=0.0, use_flash_attention=False, sort_by_length=False, use_system_prefix=False, system_prefix='You are Joi, a large language model trained by Open-Assistant. Answer as concisely as possible.\nKnowledge cutoff: 2021-09-01\nCurrent date: 2023-03-12', use_system_tag=False, system_property_dropout=0.5, system_add_length=False, per_digit_tokens=False, is_reward_model=False, deepspeed_config='configs/zero_config.json', model_name='facebook/galactica-125m', wandb_entity='open-assistant', local_rank=-1, deepspeed=False, resume_from_checkpoint=False, show_dataset_stats=False, world_size=1)
RNG seed: 2703368087
Tokenizer sanity check:
Type: PreTrainedTokenizerFast
special_tokens_map: {'eos_token': '</s>', 'pad_token': '<pad>', 'additional_special_tokens': ['<|prefix_end|>', '<|prompter|>', '<|prefix_begin|>', '<|system|>', '<|assistant|>']}
Using bos_token, but it is not set yet.
bos_token='None', bos_token_id=None
eos_token='</s>', eos_token_id=2
prompter_token_id=50001, assistant_token_id=50004
encoding result: {'input_ids': [50001, 71, 39, 2, 50003, 22018, 48, 542, 221, 5717, 48, 243, 39, 221, 14314, 48, 23555, 221, 2, 50004, 55, 39, 2, 50001, 71, 40, 2, 50003, 5717, 48, 243, 39, 221, 22018, 48, 542, 221, 14314, 48, 23555, 221, 2, 50004, 55, 40, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
0: 50001 -> "<|prompter|>"
1: 71 -> "Q"
2: 39 -> "1"
3: 2 -> "</s>"
4: 50003 -> "<|system|>"
5: 22018 -> "lang"
6: 48 -> ":"
7: 542 -> " en"
8: 221 -> " "
9: 5717 -> "length"
10: 48 -> ":"
11: 243 -> " "
12: 39 -> "1"
13: 221 -> " "
14: 14314 -> "context"
15: 48 -> ":"
16: 23555 -> " ctx"
17: 221 -> " "
18: 2 -> "</s>"
19: 50004 -> "<|assistant|>"
20: 55 -> "A"
21: 39 -> "1"
22: 2 -> "</s>"
23: 50001 -> "<|prompter|>"
24: 71 -> "Q"
25: 40 -> "2"
26: 2 -> "</s>"
27: 50003 -> "<|system|>"
28: 5717 -> "length"
29: 48 -> ":"
30: 243 -> " "
31: 39 -> "1"
32: 221 -> " "
33: 22018 -> "lang"
34: 48 -> ":"
35: 542 -> " en"
36: 221 -> " "
37: 14314 -> "context"
38: 48 -> ":"
39: 23555 -> " ctx"
40: 221 -> " "
41: 2 -> "</s>"
42: 50004 -> "<|assistant|>"
43: 55 -> "A"
44: 40 -> "2"
45: 2 -> "</s>"
message_indices: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3]
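For reference, the sanity-check output above can be approximated with the snippet below. This is a rough sketch, not the exact Open-Assistant code path: the '<pad>'/'</s>' special-token values and the whitespace inside the <|system|> segments are assumptions read off the printed special_tokens_map and token dump.

from transformers import AutoTokenizer

# Reproduce the special-token setup implied by the special_tokens_map above.
# The "<pad>"/"</s>" strings are assumptions; only eos_token_id=2 is shown in the log.
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-125m")
tokenizer.add_special_tokens(
    {
        "pad_token": "<pad>",
        "eos_token": "</s>",
        "additional_special_tokens": [
            "<|prefix_end|>",
            "<|prompter|>",
            "<|prefix_begin|>",
            "<|system|>",
            "<|assistant|>",
        ],
    }
)

# Two-turn dialogue in the <|prompter|> / <|system|> / <|assistant|> layout shown
# in the token dump; the exact whitespace between system fields is an assumption.
text = (
    "<|prompter|>Q1</s>"
    "<|system|>lang: en length: 1 context: ctx </s>"
    "<|assistant|>A1</s>"
    "<|prompter|>Q2</s>"
    "<|system|>length: 1 lang: en context: ctx </s>"
    "<|assistant|>A2</s>"
)
enc = tokenizer(text)
print(enc["input_ids"])
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))

Adding the five dialogue tokens on top of Galactica's base vocabulary also appears to be what triggers the "Resizing embeddings to 50016" line further down.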
Found cached dataset webgpt_comparisons (/home/miles/.cache/huggingface/datasets/openai___webgpt_comparisons/default/0.0.0/8b5d5879cdc98c4c0099af6053dffe8d504588d43d3b11f1b1ec223ab1e8db0a)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 511.06it/s]
Resizing embeddings to 50016
Number of trainable parameters: 125M
Continuing without patching
training_conf.resume_from_checkpoint: False
/home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  warnings.warn(
wandb: Tracking run with wandb version 0.15.3
wandb: W&B syncing is set to `offline` in this directory.
wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
  0%|          | 0/5343 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/miles/repos/Open-Assistant/model/model_training/trainer_sft.py", line 472, in <module>
    main()
  File "/home/miles/repos/Open-Assistant/model/model_training/trainer_sft.py", line 466, in main
    trainer.train(resume_from_checkpoint=training_conf.resume_from_checkpoint)
  File "/home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/transformers/trainer.py", line 1639, in train
    return inner_training_loop(
  File "/home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/transformers/trainer.py", line 1939, in _inner_training_loop
    self.scaler.unscale_(self.optimizer)
  File "/home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py", line 284, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
  File "/home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py", line 212, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")
ValueError: Attempting to unscale FP16 gradients.
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /home/miles/repos/Open-Assistant/model/model_training/trainer_sft.py:472 in │
│ <module> │
│ │
│ 469 │
│ 470 │
│ 471 if __name__ == "__main__": │
│ ❱ 472 │ main() │
│ 473 │
│ │
│ /home/miles/repos/Open-Assistant/model/model_training/trainer_sft.py:466 in │
│ main │
│ │
│ 463 │ │ preprocess_logits_for_metrics=preprocess_logits_for_metrics, │
│ 464 │ ) │
│ 465 │ print("training_conf.resume_from_checkpoint:",training_conf.resume │
│ ❱ 466 │ trainer.train(resume_from_checkpoint=training_conf.resume_from_che │
│ 467 │ trainer.save_model() │
│ 468 │ tokenizer.save_pretrained(output_dir) │
│ 469 │
│ │
│ /home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/transformers/trai │
│ ner.py:1639 in train │
│ │
│ 1636 │ │ inner_training_loop = find_executable_batch_size( │
│ 1637 │ │ │ self._inner_training_loop, self._train_batch_size, args.a │
│ 1638 │ │ ) │
│ ❱ 1639 │ │ return inner_training_loop( │
│ 1640 │ │ │ args=args, │
│ 1641 │ │ │ resume_from_checkpoint=resume_from_checkpoint, │
│ 1642 │ │ │ trial=trial, │
│ │
│ /home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/transformers/trai │
│ ner.py:1939 in _inner_training_loop │
│ │
│ 1936 │ │ │ │ │ │ │ │ gradients = xm._fetch_gradients(self. │
│ 1937 │ │ │ │ │ │ │ │ xm.all_reduce("sum", gradients, scale │
│ 1938 │ │ │ │ │ │ │ # AMP: gradients need unscaling │
│ ❱ 1939 │ │ │ │ │ │ │ self.scaler.unscale_(self.optimizer) │
│ 1940 │ │ │ │ │ │ │ │
│ 1941 │ │ │ │ │ │ if is_sagemaker_mp_enabled() and args.fp16: │
│ 1942 │ │ │ │ │ │ │ self.optimizer.clip_master_grads(args.max │
│ │
│ /home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/torch/cuda/amp/gr │
│ ad_scaler.py:284 in unscale_ │
│ │
│ 281 │ │ inv_scale = self._scale.double().reciprocal().float() │
│ 282 │ │ found_inf = torch.full((1,), 0.0, dtype=torch.float32, device= │
│ 283 │ │ │
│ ❱ 284 │ │ optimizer_state["found_inf_per_device"] = self._unscale_grads_ │
│ 285 │ │ optimizer_state["stage"] = OptState.UNSCALED │
│ 286 │ │
│ 287 │ def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwa │
│ │
│ /home/miles/anaconda3/envs/oa/lib/python3.10/site-packages/torch/cuda/amp/gr │
│ ad_scaler.py:212 in _unscale_grads_ │
│ │
│ 209 │ │ │ │ │ if param.grad is None: │
│ 210 │ │ │ │ │ │ continue │
│ 211 │ │ │ │ │ if (not allow_fp16) and param.grad.dtype == torch. │
│ ❱ 212 │ │ │ │ │ │ raise ValueError("Attempting to unscale FP16 g │
│ 213 │ │ │ │ │ if param.grad.is_sparse: │
│ 214 │ │ │ │ │ │ # is_coalesced() == False means the sparse gra │
│ 215 │ │ │ │ │ │ # coalesce() deduplicates indices and adds all │
╰──────────────────────────────────────────────────────────────────────────────╯
ValueError: Attempting to unscale FP16 gradients.
wandb: Waiting for W&B process to finish... (failed 1).
wandb: You can sync this run to the cloud by running:
wandb: wandb sync /home/miles/repos/Open-Assistant/model/model_training/wandb/offline-run-20230522_171644-3amgmt8z
wandb: Find logs at: ./wandb/offline-run-20230522_171644-3amgmt8z/logs
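The crash comes from the stock torch.cuda.amp.GradScaler check shown in the last frame: _unscale_grads_ raises as soon as any parameter gradient is already torch.float16, which is consistent with dtype='fp16' in the config above loading the model weights themselves in fp16 while the Trainer also runs its fp16 GradScaler path. A minimal sketch that reproduces the same ValueError outside the Trainer (assuming a CUDA device is available):

import torch

# fp16 parameters produce fp16 gradients, which GradScaler.unscale_ rejects
# because AMP expects fp32 master weights.
model = torch.nn.Linear(8, 8).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()

x = torch.randn(4, 8, device="cuda", dtype=torch.float16)
loss = model(x).sum()
scaler.scale(loss).backward()
scaler.unscale_(optimizer)  # ValueError: Attempting to unscale FP16 gradients.

A commonly suggested workaround (an assumption here, not something taken from this log) is to keep master weights in fp32, e.g. load the model with torch_dtype=torch.float32 and let the Trainer's fp16=True handle the mixed-precision casts, or to train with bf16 or the DeepSpeed path instead of the fp16 GradScaler.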