In [None]:
import sagemaker
import boto3
from transformers import AutoTokenizer
from datasets import load_dataset
from sagemaker.huggingface import HuggingFace
import time

In [ ]:
# Notebook-wide configuration; later cells read these names.
# Hugging Face model identifier: used to load the tokenizer and forwarded to
# the training job as the 'model_id' hyperparameter.
model_id = "tiiuae/falcon-7b"
# Dataset identifier and split handed to `load_dataset`.
dataset_name = "Amod/mental_health_counseling_conversations"
split_type = "train"
# Key prefix under the session's default bucket where the dataset is written.
s3_prefix_dataset = "dataset"
# Training script name: passed to the estimator and used (minus its last
# three characters) as the stem of the job name.
entry_point = 'train.py'

In [ ]:
# Create a first session only to discover the default bucket name.
sess = sagemaker.Session()
# Left as None here, so the default bucket from the session above is used.
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN. If `get_execution_role` raises ValueError
# (presumably when running outside a SageMaker-managed environment - confirm),
# fall back to looking up the role named 'sagemaker_execution_role' via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Rebuild the session, this time pinned explicitly to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved settings for a quick visual check.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [ ]:
def generate_prompt(data_point):
    """Format one dataset record as a two-line Human/AI prompt.

    Args:
        data_point: Mapping with "Context" and "Response" entries.

    Returns:
        The string "<Human>: {Context}\\n<AI>: {Response}" with leading and
        trailing whitespace stripped.

    Raises:
        KeyError: If either "Context" or "Response" is missing.
    """
    # Build the prompt line by line. An indented triple-quoted literal would
    # carry its source indentation into the text in front of "<AI>:", because
    # strip() only trims the two ends of the string.
    prompt = "\n".join((
        f'<Human>: {data_point["Context"]}',
        f'<AI>: {data_point["Response"]}',
    ))
    return prompt.strip()

In [ ]:
def generate_and_tokenize_prompt(data_point):
    """Render a record as prompt text and tokenize it.

    The prompt comes from `generate_prompt`; tokenization uses the
    module-level `tokenizer`, called with padding and truncation enabled.

    Args:
        data_point: A single dataset record, passed through to
            `generate_prompt`.

    Returns:
        Whatever the tokenizer call returns for the prompt text.
    """
    return tokenizer(
        generate_prompt(data_point),
        padding=True,
        truncation=True,
    )

In [ ]:
# Load the tokenizer that belongs to `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer defines no pad token
# of its own - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [ ]:
# Fetch the requested split, then shuffle and tokenize it one step at a time.
dataset = load_dataset(dataset_name, split=split_type)
dataset = dataset.shuffle()
# Each record gains the tokenizer output produced by
# generate_and_tokenize_prompt.
dataset = dataset.map(generate_and_tokenize_prompt)

In [ ]:
# Switch the dataset's output format to "torch".
dataset.set_format(type="torch")
# Bare expression: as the last line of the cell it displays the format.
dataset.format

In [ ]:
# Destination URI: s3://<session default bucket>/<s3_prefix_dataset>.
dataset_path = f's3://{sess.default_bucket()}/{s3_prefix_dataset}'
# Write the processed dataset to that URI.
# NOTE(review): saving straight to an s3:// URI presumably relies on
# fsspec/s3fs support in the installed `datasets` version - confirm.
dataset.save_to_disk(dataset_path)

In [ ]:
# Hyperparameters forwarded to the estimator. How each key is interpreted is
# up to the training script (not shown here).
hyperparameters = {
    # Model to train, taken from the notebook configuration.
    "model_id": model_id,
    # Schedule and precision settings.
    "epochs": 1,
    "lr": 2e-4,
    "fp16": True,
    # LoRA-prefixed settings.
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    # Output-related flags and location.
    "save_embeddings": True,
    "merge_weights": True,
    "training_output_dir": "/tmp1",
}

In [ ]:
# Job name = entry point without its last three characters (".py"), a dash,
# and the current local time. strftime uses local time when no time tuple is
# supplied.
job_name = "-".join((entry_point[:-3], time.strftime("%Y-%m-%d-%H-%M-%S")))
print(job_name)

In [ ]:
# Configure the Hugging Face estimator for the training job.
huggingface_estimator = HuggingFace(
    # Training code.
    entry_point=entry_point,
    source_dir="scripts",
    hyperparameters=hyperparameters,
    # Job identity and permissions.
    base_job_name=job_name,
    role=role,
    # Compute.
    instance_type="ml.p3dn.24xlarge",
    instance_count=1,
    # Data input and output handling.
    input_mode="FastFile",
    disable_output_compression=False,
    # Framework versions.
    transformers_version="4.28",
    pytorch_version="2.0",
    py_version="py310",
)

In [ ]:
# Show the S3 URI that will be passed to the training job.
print(dataset_path)

In [ ]:
# Start training, passing the saved dataset's S3 URI under the "s3_data" key.
huggingface_estimator.fit({"s3_data": dataset_path})