Merged
28 commits
b08d1f3
feat: log consumed tokens
mali-git May 24, 2024
ffa374a
fix: training step logging
mali-git May 24, 2024
529e146
refactor: renamed skip_num_steps to skip_num_micro_steps in Dataloader
le1nux May 27, 2024
7dc9e6a
refactor: passing now tokens_per_train_step from main to trainer
le1nux May 27, 2024
8616988
fix: failing tests due to missing tokens_per_train_step
le1nux May 27, 2024
a288932
chore: fixed typo
le1nux May 29, 2024
e087ec4
chore: fixed skip_num_micro_steps in test configs
le1nux May 29, 2024
dedd2fc
refactor: skip_num_batches implemented for Dataloader, added testing
le1nux May 29, 2024
f92c35d
feat: implemented get_local_num_batches_from_num_tokens as part of Nu…
le1nux May 29, 2024
bfcae03
test: test skips batches now based on number of tokens to skip
le1nux May 30, 2024
21620bf
refactor: renamed tokens_per_train_step to global_num_tokens_per_trai…
le1nux May 30, 2024
c58d5c0
refactor: trainer with num_train_steps_done
le1nux May 30, 2024
42a1b8a
refactor: num_train_steps_done now calculated internally of _train_batch
le1nux May 30, 2024
2821bab
refactor: applied num_train_steps_done everywhere
le1nux May 30, 2024
f74054f
feat: implemented num_tokens_from_num_steps_callable and passing it t…
le1nux May 30, 2024
8ade072
refactor: renamed global_training_log_interval_in_steps, global_check…
le1nux May 31, 2024
e117649
refactor: added num_steps_from_num_tokens computation to configs
le1nux May 31, 2024
61f89f3
fix: fixed failing test_skipped_and_distributed_dataloader_from_config
le1nux Jun 1, 2024
d6674cb
fix: fixed failing test_e2e_training_run_wout_ckpt
le1nux Jun 1, 2024
d1cce1b
fix: fixed failing test_e2e_coca_training_run_without_checkpoint
le1nux Jun 1, 2024
d8ef06d
fix: fixed checkpoint execution tests
le1nux Jun 1, 2024
7971598
refactor: batch_progress_subscriber now uses gradient_acc_steps to ca…
le1nux Jun 2, 2024
a93667a
refactor: logging and evaluation now called only when the step changed
le1nux Jun 2, 2024
d98bb1b
refactor(configs): update batch_progress_subscriber config
flxst Jun 3, 2024
2d19a87
Update src/modalities/trainer.py
le1nux Jun 6, 2024
600a36b
Update src/modalities/trainer.py
le1nux Jun 6, 2024
a3b2a0a
test: added tests for NumberConversion
le1nux Jun 6, 2024
099d58b
chore: Merge branch 'fix_logging_steps' of github.com:Modalities/moda…
le1nux Jun 6, 2024
29 changes: 20 additions & 9 deletions config_files/training/config_example_coca.yaml
@@ -5,11 +5,11 @@ settings:
     sample_key: input_ids
     target_key: target_ids
   training:
-    global_training_log_interval_in_steps: 2
-    global_checkpointing_interval_in_steps: 2
-    global_evaluation_interval_in_steps: 2
+    training_log_interval_in_steps: 2
+    checkpointing_interval_in_steps: 2
+    evaluation_interval_in_steps: 2
     global_num_training_samples: 12
-    global_num_seen_steps: 0
+    global_num_seen_tokens: 0
     do_apply_activation_checkpointing: true
     gradient_acc_steps: 1
     local_train_micro_batch_size: 3
@@ -144,10 +144,13 @@ checkpoint_saving:
       checkpoint_path: ${settings.paths.checkpointing_path}
       global_rank: ${settings.cuda_env.global_rank}
       experiment_id: ${settings.experiment_id}
-      mixed_precision_settings: FP_16
-      sharding_strategy: FULL_SHARD
-      block_names: [TransformerBlock, VisionTransformerBlock]
-
+      get_num_tokens_from_num_steps_callable:
+        component_key: number_conversion
+        variant_key: num_tokens_from_num_steps_callable
+        config:
+          num_ranks: ${settings.cuda_env.world_size}
+          local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+          context_size: ${settings.training.sequence_length}
 loss_fn:
   component_key: loss
   variant_key: clm_cross_entropy_loss
@@ -258,7 +261,15 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
+        gradient_acc_steps: ${settings.training.gradient_acc_steps}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
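Throughout these configs, the hard-coded global_num_seen_steps value is replaced by a number_conversion component that derives the step count from the number of tokens seen so far. A minimal sketch of what the num_steps_from_num_tokens variant plausibly computes — parameter names follow the config keys above; the actual NumberConversion implementation in this PR may differ in details such as rounding:

def num_steps_from_num_tokens(
    num_ranks: int,
    local_micro_batch_size: int,
    global_num_tokens: int,
    context_size: int,
    gradient_acc_steps: int = 1,
) -> int:
    # One train step consumes, across all ranks, gradient_acc_steps micro
    # batches of local_micro_batch_size sequences of context_size tokens each.
    tokens_per_step = num_ranks * local_micro_batch_size * context_size * gradient_acc_steps
    # Integer division: only fully completed train steps count as seen.
    return global_num_tokens // tokens_per_step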
17 changes: 12 additions & 5 deletions config_files/training/config_gpt2_small_overfitting_de.yaml
@@ -5,10 +5,10 @@ settings:
     target_key: target_ids
     prediction_key: logits
   training:
-    global_training_log_interval_in_steps: 1
-    global_checkpointing_interval_in_steps: 128
-    global_evaluation_interval_in_steps: 64
-    global_num_seen_steps: 0
+    training_log_interval_in_steps: 1
+    checkpointing_interval_in_steps: 128
+    evaluation_interval_in_steps: 64
+    global_num_seen_tokens: 0
     do_apply_activation_checkpointing: false
     gradient_acc_steps: 1
     local_train_micro_batch_size: 16
@@ -238,7 +238,14 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
(file header missing in this view)
@@ -5,10 +5,10 @@ settings:
     target_key: target_ids
     prediction_key: logits
   training:
-    global_training_log_interval_in_steps: 1
-    global_checkpointing_interval_in_steps: 128
-    global_evaluation_interval_in_steps: 64
-    global_num_seen_steps: 0
+    training_log_interval_in_steps: 1
+    checkpointing_interval_in_steps: 128
+    evaluation_interval_in_steps: 64
+    global_num_seen_tokens: 0
     do_apply_activation_checkpointing: false
     gradient_acc_steps: 1
     local_train_micro_batch_size: 16
@@ -238,7 +238,14 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
17 changes: 12 additions & 5 deletions config_files/training/config_gpt2_small_redpajama_DE_1048576.yaml
@@ -5,10 +5,10 @@ settings:
     target_key: target_ids
     prediction_key: logits
   training:
-    global_training_log_interval_in_steps: 32
-    global_checkpointing_interval_in_steps: 8192
-    global_evaluation_interval_in_steps: 1024
-    global_num_seen_steps: 0
+    training_log_interval_in_steps: 32
+    checkpointing_interval_in_steps: 8192
+    evaluation_interval_in_steps: 1024
+    global_num_seen_tokens: 0
     do_apply_activation_checkpointing: false
     gradient_acc_steps: 1
     local_train_micro_batch_size: 16
@@ -238,7 +238,14 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
32 changes: 23 additions & 9 deletions config_files/training/config_lorem_ipsum.yaml
@@ -5,12 +5,12 @@ settings:
     sample_key: input_ids
     target_key: target_ids
   training:
-    global_training_log_interval_in_steps: 8
-    global_checkpointing_interval_in_steps: 3
-    global_evaluation_interval_in_steps: 2
-    global_num_seen_steps: 0
-    do_apply_activation_checkpointing: true
-    gradient_acc_steps: 1
+    training_log_interval_in_steps: 2
+    checkpointing_interval_in_steps: 4
+    evaluation_interval_in_steps: 2
+    global_num_seen_tokens: 0
+    do_apply_activation_checkpointing: false
+    gradient_acc_steps: 2
     local_train_micro_batch_size: 1
     sequence_length: 256
   cuda_env:
@@ -149,7 +149,14 @@ checkpoint_saving:
     config:
       checkpoint_path: ${settings.paths.checkpointing_path}
       global_rank: ${settings.cuda_env.global_rank}
-      experiment_id: ${settings.experiment_id}
+      experiment_id: ${settings.experiment_id}
+      get_num_tokens_from_num_steps_callable:
+        component_key: number_conversion
+        variant_key: num_tokens_from_num_steps_callable
+        config:
+          num_ranks: ${settings.cuda_env.world_size}
+          local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+          context_size: ${settings.training.sequence_length}
 
 # resolving class types via different enums sucks...
 loss_fn:
@@ -262,15 +269,22 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
+        gradient_acc_steps: ${settings.training.gradient_acc_steps}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
     eval_dataloaders:
      instance_key: eval_dataloaders
      pass_type: BY_REFERENCE
 
-
 evaluation_subscriber:
   component_key: results_subscriber
   variant_key: wandb
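This config (like the CoCa one above) also hands the checkpoint executor a get_num_tokens_from_num_steps_callable, so saved checkpoints can be labeled in consumed tokens rather than raw step counts. A sketch of the inverse conversion under the same naming assumptions — note that gradient_acc_steps is not part of this particular config entry, so the callable presumably counts micro-batch passes per rank:

from functools import partial

def num_tokens_from_num_steps(num_steps: int, num_ranks: int, local_micro_batch_size: int, context_size: int) -> int:
    # Inverse of num_steps_from_num_tokens above, without gradient accumulation.
    return num_steps * num_ranks * local_micro_batch_size * context_size

# A component factory would bind the config values and leave num_steps open.
# num_ranks=2 is an illustrative world size; the other two values come from this config.
get_num_tokens_from_num_steps_callable = partial(
    num_tokens_from_num_steps, num_ranks=2, local_micro_batch_size=1, context_size=256
)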
17 changes: 12 additions & 5 deletions examples/getting_started/example_config.yaml
@@ -4,10 +4,10 @@ settings:
     sample_key: input_ids
     target_key: target_ids
   training:
-    global_training_log_interval_in_steps: 48
-    global_checkpointing_interval_in_steps: 48
-    global_evaluation_interval_in_steps: 48
-    global_num_seen_steps: 0
+    training_log_interval_in_steps: 48
+    checkpointing_interval_in_steps: 48
+    evaluation_interval_in_steps: 48
+    global_num_seen_tokens: 0
     do_apply_activation_checkpointing: false
     gradient_acc_steps: 1
     local_train_micro_batch_size: 8
@@ -227,7 +227,14 @@ batch_progress_subscriber:
   config:
     local_rank: ${settings.cuda_env.local_rank}
     world_size: ${settings.cuda_env.world_size}
-    global_num_seen_steps: ${settings.training.global_num_seen_steps}
+    global_num_seen_steps:
+      component_key: number_conversion
+      variant_key: num_steps_from_num_tokens
+      config:
+        num_ranks: ${settings.cuda_env.world_size}
+        local_micro_batch_size: ${settings.training.local_train_micro_batch_size}
+        global_num_tokens: ${settings.training.global_num_seen_tokens}
+        context_size: ${settings.training.sequence_length}
     train_dataloader:
       instance_key: train_dataloader
       pass_type: BY_REFERENCE
13 changes: 10 additions & 3 deletions src/modalities/__main__.py
@@ -196,12 +196,19 @@ def run(self, components: TrainingComponentsInstantiationModel):
         )
 
         # Trainer
+        global_num_tokens_per_train_step = (
+            components.settings.training.local_train_micro_batch_size
+            * components.settings.training.sequence_length
+            * components.settings.training.gradient_acc_steps
+            * components.settings.cuda_env.world_size
+        )
         trainer = Trainer(
             local_rank=components.settings.cuda_env.local_rank,
             batch_progress_publisher=batch_processed_publisher,
             evaluation_result_publisher=evaluation_result_publisher,
             gradient_acc_steps=components.settings.training.gradient_acc_steps,
             gradient_clipper=components.gradient_clipper,
+            global_num_tokens_per_train_step=global_num_tokens_per_train_step,
         )
 
         # Evaluator
@@ -231,9 +238,9 @@ def run(self, components: TrainingComponentsInstantiationModel):
             model=wrapped_model,
             optimizer=components.optimizer,
             scheduler=components.scheduler,
-            global_checkpointing_interval_in_steps=components.settings.training.global_checkpointing_interval_in_steps,
-            global_evaluation_interval_in_steps=components.settings.training.global_evaluation_interval_in_steps,
-            global_training_log_interval_in_steps=components.settings.training.global_training_log_interval_in_steps,
+            checkpointing_interval_in_steps=components.settings.training.checkpointing_interval_in_steps,
+            evaluation_interval_in_steps=components.settings.training.evaluation_interval_in_steps,
+            training_log_interval_in_steps=components.settings.training.training_log_interval_in_steps,
         )
         print("done")
 
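To make the new Trainer argument concrete: with the config_lorem_ipsum.yaml values above and an assumed world size of 2 (the actual world size depends on the launch, not on the diff), one optimizer step accounts for 1024 tokens:

local_train_micro_batch_size = 1  # from config_lorem_ipsum.yaml
sequence_length = 256             # from config_lorem_ipsum.yaml
gradient_acc_steps = 2            # from config_lorem_ipsum.yaml
world_size = 2                    # assumed for illustration
global_num_tokens_per_train_step = (
    local_train_micro_batch_size * sequence_length * gradient_acc_steps * world_size
)
print(global_num_tokens_per_train_step)  # 1024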
4 changes: 2 additions & 2 deletions src/modalities/batch.py
@@ -99,13 +99,13 @@ class EvaluationResultBatch(Batch):
     Also entire epoch results are stored in here."""
 
     dataloader_tag: str
-    train_step_id: int
+    num_train_steps_done: int
     losses: Dict[str, torch.Tensor] = field(default_factory=lambda: dict())
     metrics: Dict[str, torch.Tensor] = field(default_factory=lambda: dict())
     throughput_metrics: Dict[str, torch.Tensor] = field(default_factory=lambda: dict())
 
     def __str__(self) -> str:
-        eval_str = f"Evaluation result on dataset tag {self.dataloader_tag} after {self.train_step_id + 1} steps:"
+        eval_str = f"Evaluation result on dataset tag {self.dataloader_tag} after {self.num_train_steps_done} steps:"
         eval_str += "\n\nlosses: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.losses.items()])
         eval_str += "\n\nmetrics: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.metrics.items()])
         eval_str += "\n\nthroughput metrics: " + "\n\t".join(
6 changes: 3 additions & 3 deletions src/modalities/checkpointing/checkpoint_saving.py
@@ -29,21 +29,21 @@ def __init__(
 
     def save_checkpoint(
         self,
-        train_step_id: int,
+        num_train_steps_done: int,
         evaluation_result: Dict[str, EvaluationResultBatch],
         model: nn.Module,
         optimizer: Optimizer,
         early_stoppping_criterion_fulfilled: bool = False,
     ):
         checkpointing_instruction = self.checkpoint_saving_strategy.get_checkpoint_instruction(
-            train_step_id=train_step_id,
+            num_train_steps_done=num_train_steps_done,
             evaluation_result=evaluation_result,
             early_stoppping_criterion_fulfilled=early_stoppping_criterion_fulfilled,
         )
 
         self.checkpoint_saving_execution.run_checkpoint_instruction(
             checkpointing_instruction=checkpointing_instruction,
-            train_step_id=train_step_id,
+            num_train_steps_done=num_train_steps_done,
             model=model,
             optimizer=optimizer,
         )
12 changes: 6 additions & 6 deletions src/modalities/checkpointing/checkpoint_saving_execution.py
@@ -8,22 +8,22 @@
 
 class CheckpointSavingExecutionABC(ABC):
     @abstractmethod
-    def _save_checkpoint(self, model: nn.Module, optimizer: Optimizer, train_step_id: int):
+    def _save_checkpoint(self, model: nn.Module, optimizer: Optimizer, num_train_steps_done: int):
         raise NotImplementedError
 
     @abstractmethod
-    def _delete_checkpoint(self, train_step_id: int):
+    def _delete_checkpoint(self, num_train_steps_done: int):
         raise NotImplementedError
 
     def run_checkpoint_instruction(
         self,
         checkpointing_instruction: CheckpointingInstruction,
-        train_step_id: int,
+        num_train_steps_done: int,
         model: nn.Module,
         optimizer: Optimizer,
     ):
         if checkpointing_instruction.save_current:
-            self._save_checkpoint(model=model, optimizer=optimizer, train_step_id=train_step_id)
+            self._save_checkpoint(model=model, optimizer=optimizer, num_train_steps_done=num_train_steps_done)
 
-        for train_step_id in checkpointing_instruction.checkpoints_to_delete:
-            self._delete_checkpoint(train_step_id=train_step_id)
+        for num_train_steps_done in checkpointing_instruction.checkpoints_to_delete:
+            self._delete_checkpoint(num_train_steps_done=num_train_steps_done)
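For orientation, a toy subclass shows how the renamed template method dispatches; the in-memory executor below is invented for illustration (modalities' real executors persist FSDP checkpoints to disk) and assumes the classes from the diff above are in scope:

class InMemoryCheckpointSaving(CheckpointSavingExecutionABC):
    """Toy executor that only records which step counts were saved or deleted."""

    def __init__(self):
        self.saved, self.deleted = [], []

    def _save_checkpoint(self, model, optimizer, num_train_steps_done: int):
        self.saved.append(num_train_steps_done)

    def _delete_checkpoint(self, num_train_steps_done: int):
        self.deleted.append(num_train_steps_done)

executor = InMemoryCheckpointSaving()
instruction = CheckpointingInstruction(save_current=True, checkpoints_to_delete=[4])
executor.run_checkpoint_instruction(
    checkpointing_instruction=instruction, num_train_steps_done=8, model=None, optimizer=None
)
assert executor.saved == [8] and executor.deleted == [4]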
12 changes: 6 additions & 6 deletions src/modalities/checkpointing/checkpoint_saving_strategies.py
@@ -13,7 +13,7 @@ class CheckpointSavingStrategyIF(ABC):
     @abstractmethod
     def get_checkpoint_instruction(
         self,
-        train_step_id: int,
+        num_train_steps_done: int,
         evaluation_result: Dict[str, EvaluationResultBatch] | None = None,
         early_stoppping_criterion_fulfilled: bool = False,
     ) -> CheckpointingInstruction:
@@ -32,23 +32,23 @@ def __init__(self, k: int = -1):
 
     def get_checkpoint_instruction(
         self,
-        train_step_id: int,
+        num_train_steps_done: int,
         evaluation_result: Dict[str, EvaluationResultBatch] | None = None,
         early_stoppping_criterion_fulfilled: bool = False,
     ) -> CheckpointingInstruction:
         checkpoints_to_delete = []
         save_current = True
 
         if self.k > 0:
-            self.saved_step_checkpoints = [train_step_id] + self.saved_step_checkpoints
+            self.saved_step_checkpoints = [num_train_steps_done] + self.saved_step_checkpoints
             if len(self.saved_step_checkpoints) > self.k:
                 # Delete oldest checkpoint
                 checkpoints_to_delete = [self.saved_step_checkpoints[-1]]
                 self.saved_step_checkpoints = self.saved_step_checkpoints[:-1]
         elif self.k == 0:
             save_current = False
         elif self.k == -1:
-            self.saved_step_checkpoints = [train_step_id] + self.saved_step_checkpoints
+            self.saved_step_checkpoints = [num_train_steps_done] + self.saved_step_checkpoints
 
         return CheckpointingInstruction(save_current=save_current, checkpoints_to_delete=checkpoints_to_delete)
 
@@ -59,9 +59,9 @@ def __init__(self, k: int):
 
     def get_checkpoint_instruction(
         self,
-        train_step_id: int,
+        num_train_steps_done: int,
         evaluation_result: Dict[str, EvaluationResultBatch] | None = None,
         early_stoppping_criterion_fulfilled: bool = False,
     ) -> CheckpointingInstruction:
-        save_current = (train_step_id + 1) % self.k == 0
+        save_current = num_train_steps_done % self.k == 0
         return CheckpointingInstruction(save_current=save_current, checkpoints_to_delete=[])
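The last hunk is the behavior-preserving core of the rename: train_step_id was a 0-based index (hence the + 1), while num_train_steps_done is a 1-based count used directly. A quick parity check for the save-every-k schedule, with k=2 as an example:

k = 2
for num_train_steps_done in range(1, 7):       # steps completed so far
    train_step_id = num_train_steps_done - 1   # old 0-based convention
    assert ((train_step_id + 1) % k == 0) == (num_train_steps_done % k == 0)
# Both conventions trigger a checkpoint after steps 2, 4, and 6.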