Skip to content

Commit

Permalink
Fix streaming mode
Browse files (browse the repository at this point in the history)
  • Loading branch information
JonasGeiping committed Jun 7, 2023
1 parent 4383663 commit 31cbe4a
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 4 deletions.
2 changes: 1 addition & 1 deletion cramming/backend/torch_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def initialize_torch(model, dataset, tokenizer, cfg_train, cfg_impl, setup=_defa
model_engine = TorchEngineFull(model, cfg_train, cfg_impl, setup=setup, seq_length=tokenizer.model_max_length)
else:
model_engine = TorchEngineMinimal(model, cfg_train, cfg_impl, setup=setup, seq_length=tokenizer.model_max_length)
model_engine.train()
model_engine.train() # This is the default engine state. Pretraining scripts may change this.
return model_engine, model_engine.optimizer, model_engine.scheduler, dataloader


Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Draw a preprocessed dataset directly from my HF profile.
# This dataset is already tokenized, so you have to load the correct tokenizer.
name: roots-mini
name: pile-readymade
sources:
hub:
provider: hub
hf_location: JonasGeiping/roots-mini_WordPiecex32768_2a22a1d08cbdc9685c3c795938ebebfb
hf_location: JonasGeiping/the_pile_WordPiecex32768_97b8e776baafb99c3892e6572a9f51b3
streaming: True

vocab_size: 32768 # cannot be changed!
seq_length: 128 # cannot be changed!
2 changes: 1 addition & 1 deletion cramming/data/pretraining_preparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,5 +470,5 @@ def _load_from_hub(cfg_data, data_path):
repo_type="dataset",
local_dir=os.path.join(data_path),
)
tokenizer = load_tokenizer(os.path.join(data_path, "tokenizer"), cache_dir=data_path)
tokenizer = load_tokenizer(os.path.join(data_path, "tokenizer"), seq_length=cfg_data.seq_length, cache_dir=data_path)
return tokenized_dataset, tokenizer
1 change: 1 addition & 0 deletions load_local_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def main_load_process(cfg, setup):
# Save to hub
if cfg.impl.push_to_huggingface_hub:
model_engine.push_to_hub(tokenizer, cfg, dryrun=cfg.dryrun)
return {}


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.1")
Expand Down
1 change: 1 addition & 0 deletions upload_processed_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def upload(cfg, setup):
repo_type="dataset",
)
log.info("Upload completed succesfully.")
return {}


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.1")
Expand Down

0 comments on commit 31cbe4a

Please sign in to comment.