In [1]:
pip install transformers datasets evaluate accelerate

Collecting transformers
  Downloading transformers-4.39.2-py3-none-any.whl.metadata (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl.metadata (18 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_

In [2]:
import os
import json
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    MistralForCausalLM,
)
from datasets import load_dataset, load_metric

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import sagemaker
import boto3
import tarfile
import random

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [3]:
sess = sagemaker.Session()
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::940119374655:role/service-role/AmazonSageMakerServiceCatalogProductsUseRole
sagemaker bucket: sagemaker-us-east-1-940119374655
sagemaker session region: us-east-1


In [4]:
student_id = "mistralai/Mistral-7B-v0.1"
teacher_id = "mistralai/Mixtral-8x7B-v0.1"
dataset_id = "imdb"
s3_prefix_dataset = "knowledge_distill_mistral_classification"

In [5]:
train_dataset = load_dataset(dataset_id, split="train[:1%]")
test_dataset = load_dataset(dataset_id, split="test[:1%]")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 21.0M/21.0M [00:00<00:00, 158MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 241MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:00<00:00, 288MB/s]


Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(teacher_id)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
train_dataset = train_dataset.map(preprocess_function)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
test_dataset = test_dataset.map(preprocess_function)

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [10]:
dataset_path = f's3://{sess.default_bucket()}/{s3_prefix_dataset}'
train_dataset_path = dataset_path + '/dataset/train'
test_dataset_path = dataset_path + '/dataset/test'

In [11]:
train_dataset.save_to_disk(train_dataset_path)
test_dataset.save_to_disk(test_dataset_path)

severe performance issues, see also https://github.com/dask/dask/issues/10276

To fix, you should specify a lower version bound on s3fs, or
update the current installation.



Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/250 [00:00<?, ? examples/s]

In [12]:
hyperparameters={
        'teacher_model_id': teacher_id,
        'student_model_id': student_id,
        'num_epochs': 1,
        'lr': 6e-5,
        'fp16': True,
        'temperature': 4.0,
        'alpha': 0.5,
        'save_strategy': 'epoch',
        'evaluation_strategy': 'epoch',
}

In [13]:
script = '15_train_classification.py'
train_name = script.replace('_', '-')


import time
job_name = f'{train_name[:-3]}-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'
print(job_name)

15-train-classification-2024-03-31-15-48-58


In [14]:
smp_config = {
    "torch_distributed":{
        "enabled": True
    },
    "smdistributed": {
#         "dataparallel":{
#             "enabled": True
#         },
        "modelparallel": {
            "enabled": True,
            "parameters": {
                "hybrid_shard_degree": 8,
#                 "tensor_parallel_degree": 8,
#                 "tensor_parallel_seed": 0 
            }
        }
    }
}

In [15]:
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch

pytorch_estimator = PyTorch(
    entry_point          = script,
    source_dir           = 'scripts',
    instance_type        = 'ml.p4d.24xlarge',
    instance_count       = 2,
    base_job_name        = job_name,
    role                 = role,
    transformers_version = '4.36',
    framework_version    = '2.1.2',
    py_version           = 'py310',
    hyperparameters      = hyperparameters,
    distribution         = smp_config
)

In [16]:
# pytorch_estimator.fit({
#         'train_data': train_dataset_path,
#         'test_data': test_dataset_path
# })

In [17]:
from sagemaker.inputs import TrainingInput

pytorch_estimator.fit(inputs={
    'train': TrainingInput(
        s3_data=train_dataset_path,
        input_mode='FastFile'),
    'test': TrainingInput(
        s3_data=test_dataset_path,
        input_mode='FastFile'),
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: 15-train-classification-2024-03-31-15-4-2024-03-31-15-48-58-724


2024-03-31 15:48:59 Starting - Starting the training job
2024-03-31 15:48:59 Pending - Training job waiting for capacity...............
2024-03-31 15:51:11 Pending - Preparing the instances for training...........................
2024-03-31 15:55:47 Downloading - Downloading input data...
2024-03-31 15:56:24 Downloading - Downloading the training image.....................
2024-03-31 15:59:35 Training - Training image download completed. Training in progress.........[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-03-31 16:01:15,120 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-03-31 16:01:15,215 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-03-31 16:01:15,224 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-03-31 16:01:15,226 sa

UnexpectedStatusException: Error for Training job 15-train-classification-2024-03-31-15-4-2024-03-31-15-48-58-724: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
ExitCode 1
ErrorMessage "torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 39.39 GiB of which 99.38 MiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Process 53 has 0 bytes memory in use. Process 52 has 0 bytes memory in use. Process 59 has 0 bytes memory in use. Process 57 has 0 bytes memory in use. Process 58 has 0 bytes memory in use. Process 54 has 0 bytes memory in use. Process 55 has 0 bytes memory in use. Of the allocated memory 3.38 GiB is allocated by PyTorch, and 5.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
 Loading checkpoint shards:   0%|          | 0/19 [00:02<?, ?it/s]
 Traceback (most recent call last)
 File "/opt/ml/code/15_train_classification.py", line 250, in <module>
 main()
 File "/opt/ml/cod