In [None]:
pip install -U transformers datasets

In [None]:
import os
from getpass import getpass

import boto3
import sagemaker
from datasets import load_dataset
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch
from transformers import AutoTokenizer

In [None]:
# Bootstrap session, used below to look up the default bucket.
sess = sagemaker.Session()

# Leave as None to fall back to the bootstrap session's default bucket;
# assign a bucket name here to override it.
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. When get_execution_role() raises ValueError,
# the role named 'sagemaker_execution_role' is looked up through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Re-create the session bound to the bucket selected above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Model checkpoints used for distillation, the source dataset, and the S3 key
# prefix under which the processed dataset is stored.
student_id = "mistralai/Mistral-7B-v0.1"
teacher_id = "mistralai/Mixtral-8x7B-v0.1"
dataset_id = "princeton-nlp/datasets-for-simcse"
s3_prefix_dataset = "knowledge_distill_mistral_classification"

# SECURITY: never hard-code an access token in a notebook. The token is read from
# the HF_TOKEN environment variable; if that is unset or empty, the user is
# prompted for it with hidden input so it never lands in the saved notebook.
# NOTE(review): a token was previously committed in this cell -- treat it as
# compromised and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

In [None]:
# Fixed seed so the train/test split is the same on every Restart & Run All.
split_seed = 42

raw_dataset = load_dataset(dataset_id)

# Work on at most the first 40000 training rows; capping at the actual row count
# avoids an out-of-range selection if the corpus is smaller than that.
num_examples = min(40000, len(raw_dataset["train"]))
dataset_subset = raw_dataset["train"].select(range(num_examples))

# 80/20 split; `dataset` ends up with "train" and "test" splits.
dataset = dataset_subset.train_test_split(test_size=0.2, seed=split_seed)

In [None]:
# Tokenizer of the student checkpoint; the Hugging Face token is passed along
# with the request.
tokenizer = AutoTokenizer.from_pretrained(
    student_id,
    token=hf_token,
)

# Use the tokenizer's unknown token as its padding token.
tokenizer.pad_token = tokenizer.unk_token

In [None]:
def preprocess_function(examples):
    """Tokenize the "text" column of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping that provides a "text" entry.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    return tokenizer(texts)

In [None]:
# Tokenize in batches and drop every original column (taken from the train split)
# so that only the tokenizer's outputs remain.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

In [None]:
# Number of tokens per training block.
block_size = 128

def group_texts(examples):
    """Concatenate every column of a tokenized batch and re-cut it into blocks.

    Args:
        examples: Mapping from column name to a list of per-example lists. It must
            contain an "input_ids" column. The block boundaries are derived from
            the first column's total length and applied to every column.

    Returns:
        A dict with the same columns, each a list of chunks of ``block_size``
        items, plus a "labels" column that is a shallow copy of the "input_ids"
        chunk list. If the batch holds at least ``block_size`` items, the tail
        that does not fill a whole block is dropped; a batch shorter than
        ``block_size`` is returned as a single, shorter chunk.
    """
    # Local import keeps this cell self-contained (no edit to the import cell needed).
    from itertools import chain

    # chain.from_iterable flattens in linear time, whereas sum(lists, []) copies
    # the growing accumulator on every addition (quadratic in the batch size).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    if total_length >= block_size:
        # Round down to a whole number of blocks, discarding the remainder.
        total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Labels mirror the inputs: a new outer list over the same chunk lists.
    result["labels"] = result["input_ids"].copy()

    return result

In [None]:
# Regroup the tokenized batches into blocks, using 4 worker processes.
final_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    num_proc=4,
)

In [None]:
# S3 locations for the processed splits, rooted at the session's default bucket.
bucket_name = sess.default_bucket()
dataset_path = f"s3://{bucket_name}/{s3_prefix_dataset}"
train_dataset_path = f"{dataset_path}/dataset/train"
test_dataset_path = f"{dataset_path}/dataset/test"

In [None]:
# Convenience handles for the two processed splits.
train_dataset, test_dataset = final_dataset["train"], final_dataset["test"]

In [None]:
# Write each split to its S3 destination (train first, then test).
for split_data, destination in (
    (train_dataset, train_dataset_path),
    (test_dataset, test_dataset_path),
):
    split_data.save_to_disk(destination)

In [None]:
# File name of the training entry point handed to the estimator.
script = "train.py"

In [None]:
# Hyperparameters handed to the training job.
# NOTE(review): the access token is forwarded as a plain hyperparameter --
# confirm whether hyperparameters are visible in job metadata/logs before
# sharing job details.
hyperparameters = {
    "dist_backend": "smddp",
    "student_model_id": student_id,
    "hf_token": hf_token,
    "batch": 4,
    "teacher_model_id": teacher_id,
    "alpha": 0.5,
    "temperature": 8.0,
    "lr": 5e-6,
    "num_epochs": 2,
}

In [None]:
# Parameters forwarded to the model-parallel configuration.
# NOTE(review): confirm the meaning of hybrid_shard_degree=0 against the library docs.
smp_parameters = {"hybrid_shard_degree": 0}

# Distribution settings for the estimator: torch_distributed launch enabled,
# plus smdistributed model parallelism with the parameters above.
distribution = {
    "torch_distributed": {"enabled": True},
    "smdistributed": {
        "modelparallel": {
            "enabled": True,
            "parameters": smp_parameters,
        },
    },
}

In [None]:
# Job-name prefix: the script name minus its last three characters
# ("train" for "train.py").
job_name_prefix = script[:-3]

# All estimator settings gathered in one place, then expanded into the constructor.
estimator_settings = dict(
    entry_point=script,
    source_dir="scripts",
    instance_type="ml.p4d.24xlarge",
    instance_count=3,
    base_job_name=job_name_prefix,
    role=role,
    framework_version="2.2.0",
    py_version="py310",
    hyperparameters=hyperparameters,
    distribution=distribution,
    max_run=400000,
    disable_output_compression=True,
)
pytorch_estimator = PyTorch(**estimator_settings)

In [None]:
# Map channel names to the S3 locations of the saved splits, then start training.
training_channels = {
    "train": TrainingInput(s3_data=train_dataset_path),
    "test": TrainingInput(s3_data=test_dataset_path),
}
pytorch_estimator.fit(inputs=training_channels)