Commit 9fac771 (1 parent: 55ac013)
Showing 4 changed files with 99 additions and 9 deletions.
.gitignore

@@ -5,6 +5,8 @@ tables/*.csv
tables/*.csv#
tables/*.ods

torch_compile_debug
checkpoints
wandb-metadata.json

dedup
(Diffs for the remaining two changed files are not shown.)
New file (86 additions):

@@ -0,0 +1,86 @@
| """Script to upload a processed dataset to the huggingface hub. You probably don't need this :)""" | ||
|
|
||
|
|
||
| import hydra | ||
| import logging | ||
| from omegaconf import OmegaConf | ||
| import tempfile | ||
| import os | ||
|
|
||
| from datasets import load_dataset | ||
|
|
||
| import cramming | ||
|
|
||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| def upload(cfg, setup): | ||
| dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl) | ||
| checksum = cramming.data.utils.checksum_config(cfg.data) | ||
| processed_dataset_name = f"{cfg.data.name}_{checksum}" | ||
|
|
||
| use_own_chunking = True | ||
| chunk_size = 8192 * 32 | ||
| num_files = len(dataset) // chunk_size + 1 | ||
| target_types = ["input_ids"] | ||
|
|
||
| files = [] | ||
| # Split dataset in parquet files | ||
| with tempfile.TemporaryDirectory() as tmpdirname: | ||
| if use_own_chunking: | ||
| # Loop through the dataset and write each chunk to a Parquet file | ||
| # This is not really necessary, but nice to save only target_types and to match chunk sizes to target batch sizes | ||
| for idx in range(num_files): | ||
| chunk = dataset.select(range(idx * chunk_size, min(len(dataset), (idx + 1) * chunk_size))) | ||
| filename = f"{tmpdirname}/train_{idx}.parquet" | ||
| chunk.to_pandas()[target_types].to_parquet(filename, index=False) | ||
| files.append(filename) | ||
| log.info(f"Chunk {idx} written to file {filename}.") | ||
|
|
||
| # Re-assemble parqueted dataset | ||
| dataset = load_dataset("parquet", data_files=files) | ||
|
|
||
| # Define the dataset info | ||
| description = f"""This is a preprocessed dataset for the cramming-project. | ||
| Use only with the tokenizer prescribed here. | ||
| This version is {processed_dataset_name}, which corresponds to the following setup: | ||
| {OmegaConf.to_yaml(cfg, resolve=True)} | ||
| Limitations and bias: | ||
| This training data was further filtered and sorted beyond the normal preprocessing. | ||
| These modifications were not tested for unintended consequences. | ||
| """ | ||
| dataset["train"].info.description = description | ||
| # dataset_tags = ["cramming", "English", "preprocessed"] | ||
|
|
||
| # Launch upload | ||
| log.info("Preparing for dataset upload ...") | ||
| dataset.push_to_hub(processed_dataset_name, private=True) | ||
|
|
||
| # Upload tokenizer to same adress - this is annoying because by default tokenizers are pushed to model directories | ||
| # tokenizer.push_to_hub(processed_dataset_name) -> this will push to a new directory in HF models | ||
| from huggingface_hub import HfApi | ||
|
|
||
| api = HfApi() | ||
| log.info("Preparing for tokenizer upload ...") | ||
| tokenizer_loc = os.path.join(os.path.join(cfg.impl.path, processed_dataset_name), "tokenizer") | ||
| for file in os.listdir(tokenizer_loc): | ||
| api.upload_file( | ||
| path_or_fileobj=os.path.join(tokenizer_loc, file), | ||
| path_in_repo=os.path.join("tokenizer", file), | ||
| repo_id=f"{api.whoami()['name']}/{processed_dataset_name}", | ||
| repo_type="dataset", | ||
| ) | ||
| log.info("Upload completed succesfully.") | ||
|
|
||
|
|
||
| @hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.1") | ||
| def launch(cfg): | ||
| cramming.utils.main_launcher(cfg, upload, job_name="upload") | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| launch() |
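After this script runs, a single private dataset repository holds both artifacts: the parquet shards at the top level and the tokenizer files under tokenizer/. A minimal consumer-side sketch follows; it is not part of the commit, the repo id is a hypothetical placeholder, and read access to the private repo is assumed (e.g. via huggingface-cli login).

from datasets import load_dataset
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer

repo_id = "your-username/bookcorpus-wikipedia_abcd1234"  # hypothetical repo id

# The parquet shards written by push_to_hub load back as an ordinary dataset.
dataset = load_dataset(repo_id, split="train")

# AutoTokenizer cannot read a subfolder of a dataset repo directly, so
# download a local snapshot of the tokenizer/ subfolder and load from disk.
local_dir = snapshot_download(repo_id=repo_id, repo_type="dataset", allow_patterns=["tokenizer/*"])
tokenizer = AutoTokenizer.from_pretrained(f"{local_dir}/tokenizer")

Co-locating the tokenizer with the data makes this extra download step necessary, but it ensures a given dataset checksum is never paired with the wrong tokenizer, which is exactly what the script's description warns about.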