Skip to content

Commit

Permalink
cleanup and mark new dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
JonasGeiping committed Jun 6, 2023
1 parent 55ac013 commit 9fac771
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 9 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ tables/*.csv
tables/*.csv#
tables/*.ods

torch_compile_debug
checkpoints
wandb-metadata.json

dedup
Expand Down
4 changes: 0 additions & 4 deletions cramming/backend/torch_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,6 @@ def __init__(self, model, cfg_train, cfg_impl, setup=_default_setup, seq_length=
self.model = self._init_distributed(model)
else:
self.model = model
try:
self.forward_attention_masks = model.cfg.attention.causal_attention
except (AttributeError, ValueError):
self.forward_attention_masks = False

self.optimizer, self.scheduler = _load_optimizer(model, cfg_train, cfg_impl)
self.initial_time = time.time()
Expand Down
16 changes: 11 additions & 5 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,20 @@ setup_requires =
setuptools

install_requires =
torch >= 1.13
torch >= 2.0.0
hydra-core >= 1.1
datasets >= 2.8.0
tokenizers >= 0.13.2
transformers >= 4.25.1
datasets
tokenizers
transformers
evaluate
scipy
scikit-learn # for metrics
pynvml
psutil
einops
zstandard
safetensors
# apache-beam # only used for wikipedia ...
zstandard # only used for the Pile

scripts =
pretrain.py
Expand Down
86 changes: 86 additions & 0 deletions upload_processed_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Script to upload a processed dataset to the huggingface hub. You probably don't need this :)"""


import hydra
import logging
from omegaconf import OmegaConf
import tempfile
import os

from datasets import load_dataset

import cramming


log = logging.getLogger(__name__)


def upload(cfg, setup):
    """Re-chunk a preprocessed pretraining corpus into parquet files and push it
    (plus its tokenizer) to the Hugging Face hub as a private dataset repo.

    Args:
        cfg: Hydra/OmegaConf config; reads ``cfg.data`` (corpus definition) and
            ``cfg.impl`` (implementation paths, cache locations).
        setup: Launcher-provided environment info (unused here, required by the
            ``main_launcher`` callback signature).
    """
    dataset, tokenizer = cramming.load_pretraining_corpus(cfg.data, cfg.impl)
    checksum = cramming.data.utils.checksum_config(cfg.data)
    processed_dataset_name = f"{cfg.data.name}_{checksum}"

    use_own_chunking = True
    chunk_size = 8192 * 32
    # Ceil division: avoids writing an empty trailing chunk when len(dataset)
    # is an exact multiple of chunk_size (the old `// chunk_size + 1` did).
    num_files = (len(dataset) + chunk_size - 1) // chunk_size
    target_types = ["input_ids"]

    files = []
    # Split dataset into parquet files
    with tempfile.TemporaryDirectory() as tmpdirname:
        if use_own_chunking:
            # Loop through the dataset and write each chunk to a Parquet file
            # This is not really necessary, but nice to save only target_types and to match chunk sizes to target batch sizes
            for idx in range(num_files):
                chunk = dataset.select(range(idx * chunk_size, min(len(dataset), (idx + 1) * chunk_size)))
                filename = f"{tmpdirname}/train_{idx}.parquet"
                chunk.to_pandas()[target_types].to_parquet(filename, index=False)
                files.append(filename)
                log.info(f"Chunk {idx} written to file {filename}.")

            # Re-assemble parqueted dataset
            dataset = load_dataset("parquet", data_files=files)

        # Define the dataset info
        description = f"""This is a preprocessed dataset for the cramming-project.
        Use only with the tokenizer prescribed here.
        This version is {processed_dataset_name}, which corresponds to the following setup:
        {OmegaConf.to_yaml(cfg, resolve=True)}
        Limitations and bias:
        This training data was further filtered and sorted beyond the normal preprocessing.
        These modifications were not tested for unintended consequences.
        """
        dataset["train"].info.description = description
        # dataset_tags = ["cramming", "English", "preprocessed"]

        # Launch upload
        log.info("Preparing for dataset upload ...")
        dataset.push_to_hub(processed_dataset_name, private=True)

        # Upload tokenizer to same address - this is annoying because by default tokenizers are pushed to model directories
        # tokenizer.push_to_hub(processed_dataset_name) -> this will push to a new directory in HF models
        from huggingface_hub import HfApi

        api = HfApi()
        log.info("Preparing for tokenizer upload ...")
        # Resolve the account name once instead of issuing one whoami() network
        # call per uploaded file inside the loop.
        repo_id = f"{api.whoami()['name']}/{processed_dataset_name}"
        tokenizer_loc = os.path.join(cfg.impl.path, processed_dataset_name, "tokenizer")
        for file in os.listdir(tokenizer_loc):
            api.upload_file(
                path_or_fileobj=os.path.join(tokenizer_loc, file),
                # Hub repo paths are always '/'-separated; os.path.join would
                # produce backslashes on Windows.
                path_in_repo=f"tokenizer/{file}",
                repo_id=repo_id,
                repo_type="dataset",
            )
        log.info("Upload completed successfully.")


@hydra.main(config_path="cramming/config", config_name="cfg_pretrain", version_base="1.1")
def launch(cfg):
    """Hydra entry point: hand the composed config and the `upload` job to the project launcher."""
    cramming.utils.main_launcher(cfg, upload, job_name="upload")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    launch()

0 comments on commit 9fac771

Please sign in to comment.