Imports

In [1]:
import os
import sys
sys.path.append("src/scripts") 

import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3
from datasets import load_dataset
from random import randrange, randint
from transformers import AutoTokenizer
from huggingface_hub import HfFolder

from user_management import instantiate_sagemaker_session
from pack_dataset import pack_dataset

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/leonvouk/.config/sagemaker/config.yaml


Manage users / Log in

In [None]:
# Authenticate against the Hugging Face Hub without embedding a secret in the notebook.
# The token is read from the HF_TOKEN environment variable; when that is not set the
# user is prompted for it (getpass does not echo the input).
# NOTE: an access token used to be hard-coded in this cell. It must be treated as
# leaked and revoked in the Hugging Face account settings.
from getpass import getpass

from huggingface_hub import login

hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
# Project helper imported from user_management (found via the "src/scripts" sys.path entry).
# `role` is later passed to the estimator as its IAM role, and `sess` is used to
# look up the default S3 bucket for the dataset and model paths.
role, sess = instantiate_sagemaker_session()

Instantiate model / tokenizer

In [None]:
# LOCAL selects the instance type further down: 'local_gpu' when True, 'ml.g5.4xlarge' otherwise.
LOCAL = True

# Hub id of the base model; also reused for the hyperparameters and the job name.
model_id = "mistralai/Mistral-7B-v0.1"

# use_auth_token=True makes the download use the locally stored Hugging Face token.
# NOTE(review): recent transformers releases prefer `token=`; confirm before switching.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=True,
)

Create test dataset

In [None]:
# Load the Dolly train split and keep only the first 1000 records for this test run.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(1000))
print(f"dataset size: {len(dataset)}")

def format_dolly(sample):
    """Render one Dolly record as a single prompt string.

    Args:
        sample: Mapping with "instruction" and "response" entries and an
            optional "context" entry.

    Returns:
        The "### Instruction", "### Context" and "### Answer" sections joined
        by blank lines. The context section is left out when the context is
        missing, None or an empty string.
    """
    instruction = f"### Instruction\n{sample['instruction']}"
    # A missing or None context is treated like an empty one instead of raising.
    context_text = sample.get("context")
    context = f"### Context\n{context_text}" if context_text else None
    response = f"### Answer\n{sample['response']}"
    # Join only the sections that are present.
    sections = [instruction, context, response]
    return "\n\n".join(section for section in sections if section is not None)

def template_dataset(sample):
    """Add the training text to a record.

    The prompt built by `format_dolly`, followed by the tokenizer's
    end-of-sequence token, is stored under the "text" key. The same record
    object is returned.
    """
    prompt = format_dolly(sample)
    sample["text"] = "{}{}".format(prompt, tokenizer.eos_token)
    return sample

# Apply the prompt template first: this creates the "text" column that the
# tokenization step below reads. Without it the tokenizer map fails because the
# raw Dolly records have no "text" field. The original columns are dropped.
dataset = dataset.map(template_dataset, remove_columns=list(dataset.features))

# Tokenize the templated text in batches and keep only the tokenizer outputs.
dataset = dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(dataset.features)
)

# pack_dataset is a project helper; chunk_length=2048 is presumably the packed
# sequence length in tokens -- confirm against src/scripts/pack_dataset.py.
lm_dataset = pack_dataset(dataset, chunk_length=2048)
print(f"Total number of samples: {len(lm_dataset)}")

# Save the packed dataset under the session's default bucket. The target is an
# s3:// URI, so this presumably needs an fsspec S3 backend (s3fs) in the kernel.
training_input_path = f's3://{sess.default_bucket()}/fine_tuning/kri_kri/dolly/train'
lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

Define training with the huggingface estimator

In [None]:
# Training hyperparameters.
# These are not passed to the estimator because a torch launch script is used
# (the estimator's `hyperparameters=` argument is commented out). Only "model_id"
# is read again later, to build the training job name.
hyperparameters = {
    "model_id": model_id,                           # pre-trained model
    "dataset_path": "/opt/ml/input/data/training",  # path where sagemaker will save training dataset
    "num_train_epochs": 3,                          # number of training epochs
    "per_device_train_batch_size": 6,               # batch size for training
    "gradient_accumulation_steps": 2,               # number of update steps to accumulate
    "gradient_checkpointing": True,                 # save memory but slower backward pass
    "bf16": True,                                   # use bfloat16 precision
    "tf32": True,                                   # use tf32 precision
    "learning_rate": 1e-5,                          # learning rate
    "max_grad_norm": 0.3,                           # maximum norm (for gradient clipping)
    "warmup_ratio": 0.03,                           # warmup ratio
    "lr_scheduler_type": "constant_with_warmup",    # learning rate scheduler
    "save_strategy": "epoch",                       # save strategy for checkpoints
    "logging_steps": 10,                            # log every x steps
    "deepspeed": "./src/configs/ds_config.json",    # deepspeed config file
    "merge_adapters": True,                         # whether to merge LoRA into the model (needs more memory)
    "use_flash_attn": True,                         # whether to use Flash Attention
    "output_dir": "/tmp/run",                       # output directory, where to save assets during training;
                                                    # could be used for checkpointing. The final trained
                                                    # model will always be saved to s3 at the end of training
}

# Forward the locally stored Hugging Face token, if there is one, so that gated
# models (e.g. llama 2) can be accessed.
stored_token = HfFolder.get_token()
if stored_token is not None:
    hyperparameters["hf_token"] = stored_token

In [None]:
# Training job name: the model id with "/" and "." replaced by "-".
sanitized_model_id = hyperparameters["model_id"].translate(str.maketrans("/.", "--"))
job_name = f'meltemi-deepspeed-finetuned-{sanitized_model_id}'

# Use the local GPU when LOCAL is set, otherwise a managed instance.
instance_type = 'local_gpu' if LOCAL else 'ml.g5.4xlarge'

# S3 location that will hold the trained model asset.
# Note: replace this with your real S3 path when configuring target_model_s3_path.
target_model_s3_path=f's3://{sess.default_bucket()}/krikri-deepspeed-finetuned-test/model/'

# S3 location of the source model before training (currently disabled).
# Note: keep the wildcard character '*' at the end of the path, otherwise an error will happen.
## source_model_s3_path = f's3://{sess.default_bucket()}/llama/pretrained/7B/model/*'

# Environment variables set inside the training container.
# NOTE(review): the EFA/NCCL settings are applied for every instance type,
# including 'local_gpu' with a single instance -- confirm they are intended there.
training_environment = {
    "HUGGINGFACE_HUB_CACHE": "/tmp/.cache",  # cache models in /tmp
    "CUDA_LAUNCH_BLOCKING": "1",
    "NODE_NUMBER": "1",
    "FI_PROVIDER": "efa",
    "NCCL_PROTO": "simple",
    "FI_EFA_USE_DEVICE_RDMA": "1",
    "NCCL_DEBUG": "INFO",
}

# Create the estimator.
# NOTE(review): the imports at the top of the notebook use "src/scripts" while
# source_dir is "../scripts" -- confirm both point at the same directory from the
# notebook's working directory.
huggingface_estimator = HuggingFace(
    entry_point="start_deepspeed.py",   # train script
    source_dir="../scripts",            # directory which includes all the files needed for training
    instance_type=instance_type,        # instance type used for the training job
    instance_count=1,                   # number of instances used for training
    max_run=2 * 24 * 60 * 60,           # maximum runtime in seconds (days * hours * minutes * seconds)
    base_job_name=job_name,             # name of the training job
    role=role,                          # IAM role used in the training job to access AWS resources, e.g. S3
    volume_size=300,                    # size of the EBS volume in GB
    transformers_version="4.34",        # transformers version used in the training job
    pytorch_version="2.1",              # pytorch version used in the training job
    py_version="py310",                 # python version used in the training job
    # hyperparameters=hyperparameters,  # the hyperparameters passed to the training job
    environment=training_environment,
    disable_output_compression=True,    # do not compress output, to save training time and cost
)

In [None]:
# Input channels for the job: the "training" channel points at the uploaded S3 dataset.
data = dict(training=training_input_path)

# Start the training job and block until it has finished.
huggingface_estimator.fit(inputs=data, wait=True)

In [None]:
# Turn the S3 URI of the trained model into a link to the S3 web console and print it.
model_s3_uri = huggingface_estimator.model_data["S3DataSource"]["S3Uri"]
console_url = model_s3_uri.replace("s3://", "https://s3.console.aws.amazon.com/s3/buckets/")
print(console_url)