# Set up the AWS_PROFILE

In [None]:
# Set the AWS_PROFILE environment variable for this kernel session.
# NOTE(review): the boto3 / sagemaker calls later in the notebook are expected to
# take their credentials from this profile -- replace "dev" with a profile that
# exists in your local AWS configuration.
%env AWS_PROFILE=dev

# Installing required libraries

In [None]:
!pip install --upgrade sagemaker datasets

# Downloading and preparing training data

In [None]:
from datasets import load_dataset

# Fixed seed so the train/test split (and therefore the examples evaluated at the
# end of the notebook) is the same on every Restart & Run All.
SPLIT_SEED = 42

# Records carry "instruction", "context", "response" and a "category" tag.
dolly_dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# Keep only the information-extraction examples. To fine-tune for a different
# task, change the category compared against below (e.g. "closed_qa" or
# "summarization").
# NOTE: the variable keeps its historical name `summarization_dataset` even
# though it holds information-extraction rows.
summarization_dataset = dolly_dataset.filter(lambda example: example["category"] == "information_extraction")
# The category column is constant after filtering and is not used by the prompt template.
summarization_dataset = summarization_dataset.remove_columns("category")

# Hold out 10% of the examples; the "test" part is used to evaluate at the end.
train_and_test_dataset = summarization_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Dumping the training data to a local file to be used for training.
train_and_test_dataset["train"].to_json("train.jsonl")

In [None]:
# Display the first training record to check which fields each example contains.
train_and_test_dataset["train"][0]

# This template is tailored for the task of extracting specific information

In [None]:
import json

# Prompt layout: a fixed preamble, then the instruction and its context filled in
# from each record's "instruction" and "context" fields.
prompt_template = (
    "Below is an instruction that describes a task, paired with an input that provides further context. "
    "Extract the requested information from the context and provide a concise response.\n\n"
    "### Instruction:\n{instruction}\n\n"
    "### Input:\n{context}\n\n"
)

# The completion is the record's "response" field, verbatim.
template = {"prompt": prompt_template, "completion": "{response}"}

# Persist the template next to train.jsonl so both can be uploaded together.
with open("template.json", "w") as template_file:
    template_file.write(json.dumps(template))

# Upload the training data and the template to the default S3 bucket

In [None]:
from sagemaker.s3 import S3Uploader
import sagemaker
import random

# Build the SageMaker session once and reuse it for both lookups instead of
# constructing a second Session just to read the bucket prefix.
sagemaker_session = sagemaker.Session()
output_bucket = sagemaker_session.default_bucket()
default_bucket_prefix = sagemaker_session.default_bucket_prefix

# If a default bucket prefix is specified, it becomes the leading path component
# of the S3 location; otherwise the dataset folder sits directly under the bucket.
default_bucket_prefix_path = f"/{default_bucket_prefix}" if default_bucket_prefix else ""

local_data_file = "train.jsonl"
train_data_location = f"s3://{output_bucket}{default_bucket_prefix_path}/dolly_information_exatraction_dataset"

# The training data and the prompt template are uploaded under the same S3
# prefix, which is later passed to the estimator as the "training" channel.
S3Uploader.upload(local_data_file, train_data_location)
S3Uploader.upload("template.json", train_data_location)
print(f"Training data: {train_data_location}")

# Create an IAM execution role for SageMaker

In [None]:
import boto3
import json
import time

# Initialize the IAM client
iam_client = boto3.client('iam')

# Names of the IAM resources this cell creates (or reuses if they already exist)
role_name = 'SageMakerExecutionRole'
s3_policy_name = 'SageMakerS3AccessPolicy'

# Trust policy: allows the SageMaker service to assume the role
trust_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "sagemaker.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

# Custom S3 policy (optional, for specific bucket access).
# NOTE(review): Resource "*" grants these actions on every bucket; consider
# narrowing it to the buckets the training job actually needs.
s3_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:GetObject",
                "s3:PutObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "*",
            ]
        }
    ]
}

try:
    # Create the IAM role. If it already exists, the outer handler below reuses
    # it untouched (no policies are attached to a pre-existing role).
    response = iam_client.create_role(
        RoleName=role_name,
        AssumeRolePolicyDocument=json.dumps(trust_policy),
        Description='Role for SageMaker to create training jobs'
    )
    role_arn = response['Role']['Arn']
    print(f"Created role with ARN: {role_arn}")

    # A role ARN has the form arn:<partition>:iam::<account-id>:role/<name>.
    # Reusing its partition and account id avoids an extra STS call and a
    # hard-coded "aws" partition.
    arn_fields = role_arn.split(':')
    partition, account_id = arn_fields[1], arn_fields[4]

    # Attach AmazonSageMakerFullAccess managed policy
    iam_client.attach_role_policy(
        RoleName=role_name,
        PolicyArn=f'arn:{partition}:iam::aws:policy/AmazonSageMakerFullAccess'
    )
    print("Attached AmazonSageMakerFullAccess policy")

    # (Optional) Create and attach a custom S3 policy.
    # The "already exists" case is handled here, separately from the role, so a
    # leftover policy from an earlier run is not misreported as an existing role
    # and the new role still gets the policy attached.
    try:
        policy_response = iam_client.create_policy(
            PolicyName=s3_policy_name,
            PolicyDocument=json.dumps(s3_policy)
        )
        s3_policy_arn = policy_response['Policy']['Arn']
    except iam_client.exceptions.EntityAlreadyExistsException:
        # The existing policy is reused as-is; its document is not compared
        # against s3_policy above.
        s3_policy_arn = f'arn:{partition}:iam::{account_id}:policy/{s3_policy_name}'
        print(f"Policy {s3_policy_name} already exists. Reusing it.")
    iam_client.attach_role_policy(
        RoleName=role_name,
        PolicyArn=s3_policy_arn
    )
    print("Attached custom S3 policy")

    # Wait briefly to ensure the role is propagated
    time.sleep(10)

except iam_client.exceptions.EntityAlreadyExistsException:
    # Reached only when create_role reports that the role already exists.
    print(f"Role {role_name} already exists. Retrieving ARN...")
    role_arn = iam_client.get_role(RoleName=role_name)['Role']['Arn']
except Exception as e:
    print(f"Error creating role: {str(e)}")
    raise

# Train the model


In [None]:
from sagemaker.jumpstart.estimator import JumpStartEstimator

# JumpStart identifier of the base model and the version range to resolve.
model_id = "meta-textgeneration-llama-3-8b"
model_version = "2.*"

# Passing accept_eula="true" in the environment signals acceptance of the model's
# end-user license agreement; review the license before running this cell.
# For Llama-3-70b, use instance_type="ml.g5.48xlarge" instead.
estimator = JumpStartEstimator(
    model_id=model_id,
    model_version=model_version,
    role=role_arn,
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    disable_output_compression=True,
    environment={"accept_eula": "true"},
)

# Instruction tuning is off by default, so it is enabled explicitly here because
# the training data is an instruction dataset (instruction / context / response).
training_hyperparameters = {
    "instruction_tuned": "True",
    "epoch": "5",
    "max_input_length": "1024",
}
estimator.set_hyperparameters(**training_hyperparameters)

# The "training" channel points at the S3 prefix holding train.jsonl and template.json.
estimator.fit({"training": train_data_location})

# Deploy the trained model

In [None]:
# Deploy the fine-tuned model; the returned predictor is used for the test
# predictions below and for deleting the model and endpoint in the final cell.
finetuned_predictor = estimator.deploy()

In [None]:
import pandas as pd
from IPython.display import display, HTML

# Held-out split produced by train_test_split; used only for evaluation.
test_dataset = train_and_test_dataset["test"]

# Lists intended to collect evaluation results (prompts, reference answers and
# model outputs), one entry per evaluated datapoint.
inputs = []
ground_truth_responses = []
responses_before_finetuning = []
responses_after_finetuning = []


def predict_and_print(datapoint):
    """Query the fine-tuned endpoint with one test example and record the result.

    The prompt is built from the module-level ``template`` using the datapoint's
    ``instruction`` and ``context`` fields, followed by the response marker. The
    request payload and the generated text are printed. After a successful
    prediction, the prompt, the reference answer and the generated text are
    appended to the module-level lists ``inputs``, ``ground_truth_responses``
    and ``responses_after_finetuning``, so the three lists stay index-aligned
    even if a request fails.

    Args:
        datapoint: Mapping with ``instruction``, ``context`` and ``response`` keys.

    Returns:
        None. Results are printed and stored in the module-level lists.
    """
    # For instruction fine-tuning, we insert a special key between input and output
    input_output_demarkation_key = "\n\n### Response:\n"

    prompt = template["prompt"].format(
        instruction=datapoint["instruction"], context=datapoint["context"]
    )
    payload = {
        "inputs": prompt + input_output_demarkation_key,
        "parameters": {"max_new_tokens": 100},
    }
    print("_"*20)

    print(payload)

    # Only the fine-tuned model is queried: this notebook deploys no pretrained
    # (baseline) endpoint, so responses_before_finetuning is left empty.
    # Fine-tuned Llama 3 endpoints do not require the accept_eula custom attribute.
    finetuned_response = finetuned_predictor.predict(payload)
    generated_text = finetuned_response.get("generated_text")
    print(generated_text)
    print("_"*20)

    # Record everything only after the prediction succeeded, keeping the lists
    # the same length.
    inputs.append(payload["inputs"])
    ground_truth_responses.append(datapoint["response"])
    responses_after_finetuning.append(generated_text)


# Run the fine-tuned model on the first five held-out examples.
evaluation_samples = test_dataset.select(range(5))
for i, datapoint in enumerate(evaluation_samples):
    predict_and_print(datapoint)

In [None]:
# Clean up the resources created by estimator.deploy() so they do not keep running.
# NOTE(review): the model is deleted before the endpoint; this order presumably
# matters (the model lookup may rely on the endpoint still existing) -- confirm
# before reordering.
finetuned_predictor.delete_model()
finetuned_predictor.delete_endpoint()