# 1. 사전 준비 작업
- https://www.philschmid.de/fsdp-qlora-llama3

## 필요 패키지 설치 및 local docker repo 위치 변경

In [1]:
# Toggle read by the dependency-install cell further down: when True, that cell
# pip-installs the required packages and then restarts the kernel.
install_needed = True
# install_needed = False

In [2]:
%%bash
#!/bin/bash
# Point Docker's data-root at /home/ec2-user/SageMaker/.container/docker and set
# default-shm-size in /etc/docker/daemon.json. Nothing is changed when
# daemon.json already contains a "data-root" key.

DAEMON_PATH="/etc/docker"
MEMORY_SIZE=10G

# jq prints "true" when daemon.json already has a top-level "data-root" key.
FLAG=$(cat $DAEMON_PATH/daemon.json | jq 'has("data-root")')
# echo $FLAG

if [ "$FLAG" == true ]; then
    echo "Already revised"
else
    # Docker is stopped while its config is rewritten and its data is copied.
    sudo service docker stop
    echo "Add data-root and default-shm-size=$MEMORY_SIZE"
    # Keep a backup; the new daemon.json is generated from this backup copy.
    sudo cp $DAEMON_PATH/daemon.json $DAEMON_PATH/daemon.json.bak
    # NOTE(review): tee truncates daemon.json even if jq fails earlier in the
    # pipeline; daemon.json.bak is then the only good copy - consider a temp file.
    sudo cat $DAEMON_PATH/daemon.json.bak | jq '. += {"data-root":"/home/ec2-user/SageMaker/.container/docker","default-shm-size":"'$MEMORY_SIZE'"}' | sudo tee $DAEMON_PATH/daemon.json > /dev/null
    # Copy the existing /var/lib/docker tree under the new parent directory.
    sudo rsync -aP /var/lib/docker /home/ec2-user/SageMaker/.container
    sudo service docker start
    echo "Docker Restart"
fi

# Optional docker-compose install, left disabled:
# sudo curl -L "https://github.com/docker/compose/releases/download/v2.7.0/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
# sudo chmod +x /usr/local/bin/docker-compose

Already revised


In [3]:
import sys
import IPython

# When install_needed is set, install the notebook's dependencies with the
# kernel's own interpreter (sys.executable) and then restart the kernel.
if install_needed:
    print("installing deps and restarting kernel")
    !{sys.executable} -m pip install --upgrade pip --quiet
    # Only mlflow and sagemaker-mlflow are version-pinned here; the other
    # packages are installed with -U and no version constraint.
    !{sys.executable} -m pip install -U sagemaker transformers datasets peft trl bitsandbytes wandb mlflow==2.13.2 sagemaker-mlflow==0.1.0 --quiet
    # Ask the running kernel to shut down with restart=True.
    IPython.Application.instance().kernel.do_shutdown(True)

If you are going to use SageMaker in a local environment, you need access to an IAM role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).



## 시작

In [4]:
import sagemaker
from pathlib import Path
from time import strftime

# SageMaker session object; later cells use it for the S3 uploads.
sagemaker_session = sagemaker.Session()

# Bucket returned by the session and the key prefix that the uploads in this
# notebook are placed under.
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/llama-3-1-kor-bllossom-8b'

# Execution role as reported by the SageMaker SDK.
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
# Display the version of the imported sagemaker package.
sagemaker.__version__

'2.227.0'

In [6]:
import os

# Point the Hugging Face cache-related environment variables at one directory
# on the notebook volume. TRANSFORMERS_HOME and HF_HOME are deliberately left
# unset (they were commented out in the previous revision of this cell).
# NOTE(review): HF_CACHE_HOME may not be an environment variable that the
# Hugging Face libraries actually read - confirm against their documentation.
hf_cache_dir = '/home/ec2-user/SageMaker/.cache'
for cache_env_var in ('HF_DATASETS_CACHE', 'HF_CACHE_HOME', 'HUGGINGFACE_HUB_CACHE'):
    os.environ[cache_env_var] = hf_cache_dir

## Model, tokenizer 저장 및 S3 업로드

In [7]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer
)
import torch
from datasets import load_dataset
import huggingface_hub
from trl import setup_chat_format

In [8]:
# Hugging Face Hub repo id of the model that is downloaded, saved and uploaded below.
test_model_id = "MLP-KTLim/llama-3-Korean-Bllossom-8B"

In [9]:
# custom_draft_model_id="meta-llama/Meta-Llama-3-8B"

# Download the "main" revision of the model repo into a folder inside the
# current working directory; the call's return value is shown as the cell output.
hf_local_download_dir = Path.cwd().joinpath("llama-3-Korean-Bllossom-8B")
hf_local_download_dir.mkdir(exist_ok=True)

snapshot_options = {
    "repo_id": test_model_id,
    "revision": "main",
    "local_dir": hf_local_download_dir,
}
huggingface_hub.snapshot_download(**snapshot_options)

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

'/home/ec2-user/SageMaker/2024/llama-3-on-sagemaker/llama-3-Korean-Bllossom-8B'

In [10]:
# Load the tokenizer by Hub repo id (test_model_id), not from the local
# snapshot directory created in the previous cell.
tokenizer = AutoTokenizer.from_pretrained(test_model_id)

In [11]:
# Load the causal-LM weights by Hub repo id with device_map="auto".
# The bfloat16 override is left disabled, so no torch_dtype is passed.
model = AutoModelForCausalLM.from_pretrained(
    test_model_id,
    # torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# Rebind model and tokenizer to the pair returned by TRL's setup_chat_format.
# NOTE(review): presumably this installs TRL's default chat template and its
# special tokens - confirm against the installed TRL version.
model, tokenizer = setup_chat_format(model, tokenizer)

## 데이터 전처리 및 S3 업로드

In [13]:
# Write the tokenizer and the model into the same local folder. test_model_id
# contains a "/", so the files land in a nested directory under ./model_weight.
tokenizer.save_pretrained(f'./model_weight/{test_model_id}')
model.save_pretrained(f'./model_weight/{test_model_id}')

In [14]:
# Upload the locally saved model/tokenizer folder to S3 and keep the resulting
# S3 URI in model_weight_path.
local_model_weight_dir = f'./model_weight/{test_model_id}'
model_weight_key_prefix = f"{prefix}/model_weight/{test_model_id}"
model_weight_path = sagemaker_session.upload_data(
    path=local_model_weight_dir,
    bucket=bucket,
    key_prefix=model_weight_key_prefix,
)
print('input spec (in this case, just an S3 path): {}'.format(model_weight_path))

input spec (in this case, just an S3 path): s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/model_weight/MLP-KTLim/llama-3-Korean-Bllossom-8B


In [15]:
# Stock / securities report dataset (Gemini results), referenced by its Hub id.
hkcode_dataset = "uiyong/gemini_result_kospi_0517_jsonl"

In [16]:
# Load only the "train" split of the dataset.
dataset = load_dataset(hkcode_dataset, split="train")

In [17]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['text'],
    num_rows: 230
})

In [18]:
dataset_name = 'gemini_result_kospi_0517'

# Both JSON files below are written from the same `dataset` object, so the
# "test" file has the same content as the "train" file.

# Train split: export as JSON records, then upload that file to S3.
local_training_input_path = f'{Path.cwd()}/dataset/train'
train_json_file = f"{local_training_input_path}/train_dataset.json"
dataset.to_json(train_json_file, orient="records", force_ascii=False)
training_input_path = sagemaker_session.upload_data(
    path=train_json_file,
    bucket=bucket,
    key_prefix=f"{prefix}/{dataset_name}/train",
)

# Test split: same export/upload steps with the test paths.
local_test_input_path = f'{Path.cwd()}/dataset/test'
test_json_file = f"{local_test_input_path}/test_dataset.json"
dataset.to_json(test_json_file, orient="records", force_ascii=False)
test_input_path = sagemaker_session.upload_data(
    path=test_json_file,
    bucket=bucket,
    key_prefix=f"{prefix}/{dataset_name}/test",
)

# Report where the two files ended up.
print("uploaded data to:")
print(f"training dataset to: {training_input_path}")
print(f"test dataset to: {test_input_path}")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

uploaded data to:
training dataset to: s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/train/train_dataset.json
test dataset to: s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/test/test_dataset.json


#### mlflow policy
```json
{
    "Version": "2012-10-17",    
    "Statement": [        
        {            
            "Effect": "Allow",            
            "Action": [
                "sagemaker-mlflow:*",
                "sagemaker:CreateMlflowTrackingServer",
                "sagemaker:UpdateMlflowTrackingServer",
                "sagemaker:DeleteMlflowTrackingServer",
                "sagemaker:StartMlflowTrackingServer",
                "sagemaker:StopMlflowTrackingServer",
                "sagemaker:CreatePresignedMlflowTrackingServerUrl"
            ],            
            "Resource": "*"        
        }        
    ]
}
```

#### SSM access policy
arn:aws:iam::aws:policy/AmazonSSMFullAccess

## SageMaker mlflow 생성

In [40]:
# Suffix appended to the IAM policy/role names, the tracking server name and
# the MLflow experiment name created in the cells below.
flag = "240801"

In [41]:
import sagemaker, boto3
import json

In [43]:
# Session-derived settings used by the MLflow setup cells.
sess = sagemaker.Session()
bucket_name = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name

# One boto3 client per AWS service, created in the order IAM, STS, SageMaker.
iam_client, sts_client, sm_client = (
    boto3.client(service_name) for service_name in ("iam", "sts", "sagemaker")
)

# Account id of the identity making the calls.
caller_identity = sts_client.get_caller_identity()
account_id = caller_identity["Account"]

In [44]:
from sagemaker import get_execution_role

# The role name is whatever follows the last "/" in the execution-role ARN.
execution_role_arn_segments = get_execution_role().split('/')
sagemaker_role_name = execution_role_arn_segments[-1]
print(f"SageMaker Execution Role Name: {sagemaker_role_name}")

SageMaker Execution Role Name: AmazonSageMaker-ExecutionRole-20230604T222555


In [45]:
# Policy document granting the sagemaker-mlflow actions plus the SageMaker
# MLflow tracking-server management actions on all resources.
sm_mlflow_execution_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "sagemaker-mlflow:*",
                "sagemaker:CreateMlflowTrackingServer",
                "sagemaker:UpdateMlflowTrackingServer",
                "sagemaker:DeleteMlflowTrackingServer",
                "sagemaker:StartMlflowTrackingServer",
                "sagemaker:StopMlflowTrackingServer",
                "sagemaker:CreatePresignedMlflowTrackingServerUrl"
            ],
            "Resource": "*"
        }
    ]
}

# Resolve the execution role once; the role name is the last ARN path segment.
sagemaker_exe_role_arn = get_execution_role()
sagemaker_role_name = sagemaker_exe_role_arn.rsplit('/', 1)[-1]

sm_mlflow_execution_policy_name = f"sm-mlflow-execution-policy-{flag}"

# create_policy fails with EntityAlreadyExists when this cell is re-run with the
# same flag. In that case look the existing policy up instead, so the cell can
# be executed repeatedly. The existing policy document is NOT updated.
try:
    sm_mlflow_execution_policy_info = iam_client.create_policy(
        PolicyName=sm_mlflow_execution_policy_name,
        PolicyDocument=json.dumps(sm_mlflow_execution_policy)
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    # Derive partition and account id from the execution-role ARN
    # (arn:<partition>:iam::<account>:role/...); the policy was created without
    # a Path, so its ARN is arn:<partition>:iam::<account>:policy/<name>.
    _role_arn_fields = sagemaker_exe_role_arn.split(":")
    _existing_policy_arn = (
        f"arn:{_role_arn_fields[1]}:iam::{_role_arn_fields[4]}:policy/{sm_mlflow_execution_policy_name}"
    )
    print(f"IAM policy {sm_mlflow_execution_policy_name} already exists; reusing it")
    sm_mlflow_execution_policy_info = iam_client.get_policy(PolicyArn=_existing_policy_arn)

sm_mlflow_execution_policy_arn = sm_mlflow_execution_policy_info["Policy"]["Arn"]

# Attach the policy to the SageMaker execution role
iam_client.attach_role_policy(
    RoleName=sagemaker_role_name, PolicyArn=sm_mlflow_execution_policy_arn
)

{'ResponseMetadata': {'RequestId': 'e44c5b66-3eed-42db-81d2-fc3998a00e0d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 01 Aug 2024 12:45:22 GMT',
   'x-amzn-requestid': 'e44c5b66-3eed-42db-81d2-fc3998a00e0d',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

In [46]:
# Names for the MLflow tracking server and for the IAM role it will run with,
# both suffixed with `flag`. The server name is displayed as the cell output.
tracking_server_name = f"mlflow-tracking-{flag}"
mlflow_tracking_server_role_name = f"mlflow-tracking-server-{flag}"
tracking_server_name

In [47]:
# Trust policy allowing the SageMaker service principal to assume the role.
mlflow_trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {"Service": ["sagemaker.amazonaws.com"]},
            "Action": "sts:AssumeRole",
        }
    ],
}

# Create role for MLflow. create_role fails with EntityAlreadyExists when the
# cell is re-run with the same flag; fetch the existing role in that case so
# the cell can be executed repeatedly (the existing trust policy is not changed).
try:
    mlflow_role = iam_client.create_role(
        RoleName=mlflow_tracking_server_role_name,
        AssumeRolePolicyDocument=json.dumps(mlflow_trust_policy)
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    print(f"IAM role {mlflow_tracking_server_role_name} already exists; reusing it")
    mlflow_role = iam_client.get_role(RoleName=mlflow_tracking_server_role_name)
mlflow_role_arn = mlflow_role["Role"]["Arn"]

# Create policy for S3 and SageMaker Model Registry
sm_s3_model_registry_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:Get*",
                "s3:Put*",
                "s3:List*",
                "sagemaker:AddTags",
                "sagemaker:CreateModelPackageGroup",
                "sagemaker:CreateModelPackage",
                "sagemaker:UpdateModelPackage",
                "sagemaker:DescribeModelPackageGroup",
            ],
            "Resource": "*",
        }
    ],
}

mlflow_s3_sm_model_registry_policy_name = f"mlflow-s3-sm-model-registry-{flag}"

# Same re-run handling as for the role: reuse an already existing policy.
# The existing policy document is NOT updated.
try:
    mlflow_s3_sm_model_registry_iam_policy = iam_client.create_policy(
        PolicyName=mlflow_s3_sm_model_registry_policy_name,
        PolicyDocument=json.dumps(sm_s3_model_registry_policy)
    )
except iam_client.exceptions.EntityAlreadyExistsException:
    # Partition and account id come from the role ARN
    # (arn:<partition>:iam::<account>:role/...); the policy was created without
    # a Path, so its ARN is arn:<partition>:iam::<account>:policy/<name>.
    _mlflow_role_arn_fields = mlflow_role_arn.split(":")
    _existing_registry_policy_arn = (
        f"arn:{_mlflow_role_arn_fields[1]}:iam::{_mlflow_role_arn_fields[4]}"
        f":policy/{mlflow_s3_sm_model_registry_policy_name}"
    )
    print(f"IAM policy {mlflow_s3_sm_model_registry_policy_name} already exists; reusing it")
    mlflow_s3_sm_model_registry_iam_policy = iam_client.get_policy(
        PolicyArn=_existing_registry_policy_arn
    )
mlflow_s3_sm_model_registry_iam_policy_arn = mlflow_s3_sm_model_registry_iam_policy["Policy"]["Arn"]

# Attach the policy to the MLflow role
iam_client.attach_role_policy(
    RoleName=mlflow_tracking_server_role_name,
    PolicyArn=mlflow_s3_sm_model_registry_iam_policy_arn
)

{'ResponseMetadata': {'RequestId': '01a8ba8c-2bbc-436d-a6ea-ff640fe3ab0e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 01 Aug 2024 12:45:24 GMT',
   'x-amzn-requestid': '01a8ba8c-2bbc-436d-a6ea-ff640fe3ab0e',
   'content-type': 'text/xml',
   'content-length': '212'},
  'RetryAttempts': 0}}

In [49]:
# Request creation of the MLflow tracking server. Artifacts go under a prefix
# named after the server in the default bucket, and the server runs with the
# MLflow role created above. MlflowVersion matches the mlflow==2.13.2 pin used
# in the dependency-install cell.
sm_client.create_mlflow_tracking_server(
    TrackingServerName=tracking_server_name,
    ArtifactStoreUri=f"s3://{bucket_name}/{tracking_server_name}",
    TrackingServerSize="Small",
    MlflowVersion="2.13.2",
    RoleArn=mlflow_role_arn,
    AutomaticModelRegistration=False,
)

{'TrackingServerArn': 'arn:aws:sagemaker:us-west-2:322537213286:mlflow-tracking-server/mlflow-tracking-240801',
 'ResponseMetadata': {'RequestId': 'd4ceef0b-6f17-47b9-947b-9d1b35f70cad',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'd4ceef0b-6f17-47b9-947b-9d1b35f70cad',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '110',
   'date': 'Thu, 01 Aug 2024 12:56:20 GMT'},
  'RetryAttempts': 0}}

In [50]:
# Assemble the tracking server ARN from region, account id and server name
# rather than reading it from the create call's response; shown as cell output.
tracking_server_arn = f"arn:aws:sagemaker:{region}:{account_id}:mlflow-tracking-server/{tracking_server_name}"
tracking_server_arn

'arn:aws:sagemaker:us-west-2:322537213286:mlflow-tracking-server/mlflow-tracking-240801'

In [51]:
import time

# Poll the tracking server until it reports IsActive == 'Active'.
# The original loop had no exit other than success, so a failed or stuck
# creation polled forever. It now raises when the reported status ends in
# "Failed" and when the overall deadline is exceeded.
POLL_INTERVAL_SECONDS = 20
MAX_WAIT_SECONDS = 60 * 60  # upper bound for the whole wait

_wait_deadline = time.monotonic() + MAX_WAIT_SECONDS
while True:
    ml_tracking_server = sm_client.describe_mlflow_tracking_server(TrackingServerName=tracking_server_name)
    if ml_tracking_server['IsActive'] == 'Active':
        print("mlflow training server is Active")
        break

    # .get() so that a response without this field is simply treated as "keep waiting".
    _server_status = ml_tracking_server.get('TrackingServerStatus')
    if isinstance(_server_status, str) and _server_status.endswith("Failed"):
        raise RuntimeError(
            f"MLflow tracking server {tracking_server_name} entered status {_server_status}"
        )
    if time.monotonic() >= _wait_deadline:
        raise TimeoutError(
            f"MLflow tracking server {tracking_server_name} was not active after "
            f"{MAX_WAIT_SECONDS} seconds (last status: {_server_status})"
        )

    print("mlflow training server is Creating")
    time.sleep(POLL_INTERVAL_SECONDS)

mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training server is Creating
mlflow training serv

In [52]:
import mlflow

# Use the tracking server ARN as the MLflow tracking URI, then echo it.
mlflow.set_tracking_uri(tracking_server_arn)
print (f'tracking_server_arn: {tracking_server_arn}')

tracking_server_arn: arn:aws:sagemaker:us-west-2:322537213286:mlflow-tracking-server/mlflow-tracking-240801


In [53]:
# Experiment name = last "/"-separated segment of `prefix` plus "-<flag>";
# shown as the cell output.
prefix_leaf = prefix.rsplit("/", 1)[-1]
experiment_name = prefix_leaf + f"-{flag}"
experiment_name

'llama-3-1-kor-bllossom-8b-240801'

In [54]:
# Create the experiment on the tracking server; the returned value is shown as
# the cell output.
# NOTE(review): presumably this raises if the experiment already exists, which
# would break a re-run - confirm against the MLflow docs.
mlflow.create_experiment(name=experiment_name)


'1'

In [59]:
# Name under which the model is registered in the MLflow model registry.
registered_model = 'llama-3-1-kor-bllossom-8b'

from mlflow import MlflowClient
# Client created with default arguments; the registered-model entry it creates
# is shown as the cell output.
client = MlflowClient()
client.create_registered_model(registered_model)

<RegisteredModel: aliases={}, creation_timestamp=1722519207822, description='', last_updated_timestamp=1722519207822, latest_versions=[], name='llama-3-1-kor-bllossom-8b', tags={}>

In [7]:
# tracking_server_name="mlflow-tracking-240801"

In [9]:
import boto3
sm_client = boto3.client("sagemaker")
# Request a presigned URL for the tracking server UI (ExpiresInSeconds=300,
# SessionExpirationDurationInSeconds=43200).
# NOTE(review): the returned AuthorizedUrl embeds an authToken, as seen in this
# cell's captured output - clear the output before sharing the notebook.
sm_client.create_presigned_mlflow_tracking_server_url(TrackingServerName=tracking_server_name, 
                                                      ExpiresInSeconds=300,
                                                      SessionExpirationDurationInSeconds=43200)

{'AuthorizedUrl': 'https://t-d8yxtuyd6uuh.us-west-2.experiments.sagemaker.aws/auth?authToken=eyJhbGciOiJIUzI1NiJ9.eyJhdXRoVG9rZW5JZCI6IjcxM2I1MjlhLTgzMjYtNGZlNS04ZmYxLWM5YjlmODgxOThjYSIsImZhc0NyZWRlbnRpYWxzIjoiQWdWNHRNWmFzb3g1ZGdaS0FQNVh2RlRQdmdiU1ExeDd4Wk4zZnRzTnFEVkFNSU1BWHdBQkFCVmhkM010WTNKNWNIUnZMWEIxWW14cFl5MXJaWGtBUkVGeGJsbGhVSEZGZG5GTGRHZ3piRGxNWjFCUE9VZDZhM2xSV25FNVMzTm5MemgyS3pCQ2VVbE5kVGhqZEVkMU5VOU5aa05zUVROdU0xWm9iblUyVVVoQlFUMDlBQUVBQjJGM2N5MXJiWE1BUzJGeWJqcGhkM002YTIxek9uVnpMWGRsYzNRdE1qbzFPVEF4T0RNM016azFNRFE2YTJWNUx6ZzNabUUxTVdReUxURTRNRGt0TkdVMFl5MWhObVV6TFRRNFpXWTNNelk1WW1NM1lnQzRBUUlCQUhneXZiSTRoY0UxaXBwaCtZcDhxRkZCOEo0ZzZ3RjFucFd0dks2NlZ0VHN1d0hRRDVBOGttOTZVZ3NVbnUrbXgwVXlBQUFBZmpCOEJna3Foa2lHOXcwQkJ3YWdiekJ0QWdFQU1HZ0dDU3FHU0liM0RRRUhBVEFlQmdsZ2hrZ0JaUU1FQVM0d0VRUU1JekozU1BUV0V0MHRvdm1lQWdFUWdEdUw5b0VXeUNsVlZHakorNU9HcHRhQjV6Rm5qbmc3RUtEUTF6R01FdVQwTnhkb2k1NWlmenpUdFhlK3M4ZHpPckh0SjhoaFNiUHRTRHhxN1FJQUFCQUFjdWtzQ1g0NWxRRk1xU3dsTlBmOW10M3hMeGNyWGEzN0tOZWlWbmh3SXV5al

In [61]:
# Persist the values produced by this notebook with IPython's %store magic.
%store test_model_id
%store bucket
%store prefix
%store model_weight_path
%store training_input_path
%store test_input_path
%store local_training_input_path
%store local_test_input_path
%store tracking_server_arn
%store experiment_name
%store registered_model
%store tracking_server_name

# Echo the stored values. tracking_server_name is stored above but is not
# printed here.
print(f"test_model_id : {test_model_id}")
print(f"bucket : {bucket}")
print(f"prefix : {prefix}")
print(f"model_weight_path : {model_weight_path}")
print(f"training_input_path : {training_input_path}")
print(f"test_input_path : {test_input_path}")
print(f"local_training_input_path : {local_training_input_path}")
print(f"local_test_input_path : {local_test_input_path}")
print(f"tracking_server_arn : {tracking_server_arn}")
print(f"experiment_name : {experiment_name}")
print(f"registered_model : {registered_model}")

Stored 'test_model_id' (str)
Stored 'bucket' (str)
Stored 'prefix' (str)
Stored 'model_weight_path' (str)
Stored 'training_input_path' (str)
Stored 'test_input_path' (str)
Stored 'local_training_input_path' (str)
Stored 'local_test_input_path' (str)
Stored 'tracking_server_arn' (str)
Stored 'experiment_name' (str)
Stored 'registered_model' (str)
test_model_id : MLP-KTLim/llama-3-Korean-Bllossom-8B
bucket : sagemaker-us-west-2-322537213286
prefix : sagemaker/llama-3-1-kor-bllossom-8b
model_weight_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/model_weight/MLP-KTLim/llama-3-Korean-Bllossom-8B
training_input_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/train/train_dataset.json
test_input_path : s3://sagemaker-us-west-2-322537213286/sagemaker/llama-3-1-kor-bllossom-8b/gemini_result_kospi_0517/test/test_dataset.json
local_training_input_path : /home/ec2-user/SageMaker/2024/llama-3-on-sagemaker/dat

In [5]:
# Restore every variable previously saved with %store into this kernel.
%store -r