In [None]:
import pandas as pd
import torch
import transformers
import sagemaker
from sagemaker.huggingface import HuggingFace
from sagemaker import get_execution_role
from datetime import datetime
from sagemaker.debugger import TensorBoardOutputConfig
import os



In [None]:
# Get the SageMaker execution role and open a SageMaker session.
role = get_execution_role()
session = sagemaker.Session()

# Timestamp (naive local time, one-second resolution) used for unique job naming.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
s3_output_path = "s3://your-project-bucket/logs"

# Configure TensorBoard logging: the S3 destination plus the container-local
# output directory.
# The S3 URI is joined with an explicit "/" instead of os.path.join:
# os.path.join is a filesystem helper that uses the host OS separator (a
# backslash on Windows), which would produce an invalid S3 URI.
tensorboard_output_config = TensorBoardOutputConfig(
    s3_output_path=f"{s3_output_path.rstrip('/')}/tensorboard",
    container_local_output_path="/opt/ml/output/tensorboard",
)

In [None]:

# Hyperparameters passed to the estimator for the training run.
training_hyperparameters = {
    "epochs": 5,
    "train_batch_size": 8,
    "val_batch_size": 2,
    "lr": 1e-5,  # fine-tuning learning rate
    # NOTE(review): hyperparameters presumably reach the entry point as
    # command-line strings; confirm the script parses dev_run as a real boolean.
    "dev_run": False,
    # Training time cap; presumably DD:HH:MM:SS (1h10m); confirm against the script.
    "max_time": "00:01:10:00",
    "timestamp": timestamp,
}

# Hugging Face estimator for a single-instance GPU training job; the entry
# point and its requirements file are taken from the current directory.
estimator = HuggingFace(
    # code to run
    entry_point="training_script.py",
    source_dir="./",
    dependencies=["requirements.txt"],
    # permissions and hardware
    role=role,
    instance_type="ml.g6.xlarge",  # GPU instance for faster training
    instance_count=1,
    # framework versions
    transformers_version="4.46",
    pytorch_version="2.3",
    py_version="py311",
    # training configuration and outputs
    hyperparameters=training_hyperparameters,
    output_path="s3://your-project-bucket/models/",
    enable_sagemaker_metrics=True,
    tensorboard_output_config=tensorboard_output_config,
)

In [None]:
# Launch the training job under a name made unique by the timestamp created
# in the configuration cell. Re-run that cell before re-running this one, or
# the job name repeats.
training_job_name = f"distilbert-ticketsclassification-training-{timestamp}"
estimator.fit(job_name=training_job_name)