In [None]:
# You only need to run this once per machine
!pip install -q -U bitsandbytes 
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git 
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U datasets scipy ipywidgets
!pip install urllib
!pip install s3fs --upgrade
!pip install botocore --upgrade
!pip install trl

In [None]:
# Log the Hugging Face CLI in with an access token. "[Your Token]" is a placeholder:
# substitute your own token before running, and do not save a real token in the notebook.
!huggingface-cli login --token "[Your Token]"

### Select base model

In [None]:
from transformers import AutoTokenizer

# --- Dataset selection -------------------------------------------------------
# Which augmented variant of the dataset to load ('RAG', 'LLM', 'ALL' or anything
# else for the un-augmented data), whether to use the "Final" split, and the fold
# index that is interpolated into the file names and the training job name.
augmentation_type = 'LLM'
final_model = True
k = 0

# --- Prompt layout -----------------------------------------------------------
# When True (and the base model tag contains 'mistral') the instructions are placed
# inside the user turn instead of a separate system turn.
instruction_base_model = True

# --- Values forwarded unchanged to the training script as hyperparameters ----
full_instruct_model = False
response_template = "[/INST]"
save_model = False
hub_name = 'no'

# --- Model -------------------------------------------------------------------
# `base_model` is a short tag (used for the job name and the 'mistral' check);
# `model_id` is the Hugging Face Hub repository that is actually loaded.
base_model = 'biomistralFINLLM'
model_id = 'salangarica/BioMistral-LLM'
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
import sagemaker
import boto3
# Open a SageMaker session and settle on the S3 bucket used for uploading data,
# models and logs. SageMaker creates the default bucket automatically if it does
# not exist yet.
sess = sagemaker.Session()
sagemaker_session_bucket = None  # put a bucket name here to override the default
if sess is not None and sagemaker_session_bucket is None:
    # no explicit bucket was given -> fall back to the session's default bucket
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role for the training job; when get_execution_role() raises
# ValueError, look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Re-create the session bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)

### Define templates

In [None]:
# Persona / behaviour description that opens every prompt.
template_general_instruction = """You are an expert microbiologist who given an excerpt from a research paper can easily 
identify the type of relation between a microbe and a disease. Doesn't create new information, but is completely faithful to the information provided, and always gives concise answers."""

# Definition of the three labels the model may answer with.
template_instruction = """Given the following meaning of the labels, answer the following question with the appropiate label.
positive: This type is used to annotate microbe-disease entity pairs with positive correlation, such as microbe will cause or aggravate the disease, the microbe will increase when disease occurs.
negative: This type is used to annotate microbe-disease entity pairs that have a negative correlation, such as microbe can be a treatment for a disease, or microbe will decrease when disease occurs. 
na: This type is used when the relation between a microbe and a disease is not clear from the context or there is no relation. In other words, use this label if the relation is not positive and not negative."""

# Per-example part of the prompt; {evidence}, {microbe} and {disease} are filled in
# later with str.format.
template_evidence = """Based on the above description, evidence is as follows: 
{evidence}

What is the relationship between {microbe} and {disease}?"""

# System prompt = persona + label definitions.
template_system = '\n'.join([template_general_instruction, template_instruction])

# For 'mistral' instruction base models the whole system prompt is prepended to the
# user turn; otherwise the user turn only carries the evidence and the question.
if instruction_base_model and 'mistral' in base_model:
    template_user = '\n'.join([template_system, template_evidence])
else:
    template_user = template_evidence


### Getting the dataset

In [None]:
import pandas as pd 
from datasets import Dataset

def format_dataset(df):
    """Convert a labelled dataframe into chat-style training examples.

    Every row becomes one ``{'message': [...]}`` entry whose list holds the chat
    turns as ``{'content': ..., 'role': ...}`` dicts: an optional ``system`` turn,
    a ``user`` turn built from ``template_user`` and an ``assistant`` turn holding
    the row's label.

    The function reads the module-level names ``base_model``,
    ``instruction_base_model``, ``template_user`` and ``template_system``.

    Args:
        df: pandas DataFrame with the columns ``EVIDENCE``, ``MICROBE``,
            ``DISEASE`` and ``RELATION``.

    Returns:
        list[dict]: one example per row, in row order (empty for an empty frame).

    Raises:
        KeyError: if one of the required columns is missing.
    """
    # The system turn is skipped when the base model tag contains 'mistral' and
    # instruction_base_model is set; every other configuration gets one.
    use_system_turn = not ('mistral' in base_model and instruction_base_model)

    formated_dataset = []
    # Iterating the columns in lockstep avoids building a Series per row via .iloc.
    columns = (df['EVIDENCE'], df['MICROBE'], df['DISEASE'], df['RELATION'])
    for evidence, microbe, disease, relation in zip(*columns):
        example_list = []
        if use_system_turn:
            example_list.append({'content': template_system, 'role': 'system'})
        # str.format returns a new string, so the shared template needs no copy.
        example_list.append({'content': template_user.format(evidence=evidence,
                                                             microbe=microbe,
                                                             disease=disease),
                             'role': 'user'})
        example_list.append({'content': relation, 'role': 'assistant'})
        formated_dataset.append({'message': example_list})
    return formated_dataset

def _resolve_dataset_paths(augmentation_type, k, final_model):
    """Return the ``(train, validation)`` S3 CSV paths for one configuration.

    Args:
        augmentation_type: 'RAG', 'LLM' or 'ALL'; any other value selects the
            un-augmented data. 'ALL' only has a dedicated folder in the final
            split and otherwise falls back to the un-augmented data.
        k: fold index interpolated into the file names.
        final_model: truthy to read from the ``Final/`` sub-tree.

    Returns:
        tuple[str, str]: training CSV path and validation CSV path.
    """
    root = "s3://finetune-mistral/instruction_finetuning"
    if final_model:
        folder = {
            'RAG': 'Final/AUG_RAG',
            'LLM': 'Final/AUG_LLM',
            'ALL': 'Final/ALL_AUG',
        }.get(augmentation_type, 'Final/NO_AUG')
    else:
        # Outside the final split the un-augmented files live directly under the root.
        folder = {'RAG': 'AUG_RAG', 'LLM': 'AUG_LLM'}.get(augmentation_type, '')
    prefix = f"{root}/{folder}" if folder else root
    return f"{prefix}/train_k{k}.csv", f"{prefix}/val_k{k}.csv"


train_path, validation_path = _resolve_dataset_paths(augmentation_type, k, final_model)


# Read both splits from S3.
training_data, validation_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, validation_path)
)

# Collapse the 'relate' label into 'na' in both splits.
for _split in (training_data, validation_data):
    _split['RELATION'] = _split['RELATION'].replace(['relate'], 'na')

# Quick sanity checks: first rows, overall shape, and the rows labelled 'na'.
print(training_data.head())
print(training_data.shape)

na_rows = training_data['RELATION'] == 'na'
print(training_data.loc[na_rows])

### Format dataset

In [None]:
from datasets import Dataset
import copy

# Turn the dataframes into chat-formatted examples and wrap them as Hugging Face
# datasets. The formatted lists get their own names so that `training_data` and
# `validation_data` stay DataFrames and this cell can be re-run safely.
training_examples = format_dataset(training_data)
validation_examples = format_dataset(validation_data)

train_dataset = Dataset.from_list(training_examples)
validation_dataset = Dataset.from_list(validation_examples)
print(train_dataset)
print(train_dataset[0])

### Saving the processed dataset to disk

In [None]:
def _processed_dataset_uris(bucket, augmentation_type, k, final_model):
    """Return the ``(train, validation)`` S3 URIs for the processed datasets.

    Args:
        bucket: name of the S3 bucket to write to.
        augmentation_type: 'RAG' and 'LLM' get their own folder; every other
            value shares the plain ``processed`` folder.
        k: fold index interpolated into the dataset names.
        final_model: truthy to write below the ``FINAL/`` sub-tree.

    Returns:
        tuple[str, str]: training URI and validation URI.
    """
    # NOTE(review): 'ALL' shares the destination of the un-augmented data, so the
    # two configurations overwrite each other - confirm this is intended.
    folder = {'RAG': 'processed_RAG', 'LLM': 'processed_LLM'}.get(augmentation_type, 'processed')
    if final_model:
        folder = f'FINAL/{folder}'
    prefix = f's3://{bucket}/instruction_finetuning/{folder}'
    # Every configuration uses the same dataset names (the 'LLM' training name
    # used to be a mangled variant of 'train3_classes').
    return f'{prefix}/train3_classes_k{k}', f'{prefix}/validation3_classes_k{k}'


training_destination_path, validation_destination_path = _processed_dataset_uris(
    sess.default_bucket(), augmentation_type, k, final_model
)

# Persist both splits to S3.
train_dataset.save_to_disk(training_destination_path)
validation_dataset.save_to_disk(validation_destination_path)

print('Training Saved to: {}'.format(training_destination_path))
print('Validation Saved to: {}'.format(validation_destination_path))

### Training Hyperparameters

In [None]:
from huggingface_hub import HfFolder


# Hyperparameters handed to the training job.
hyperparameters = {
    # --- model and data ---
    'model_id': model_id,                                # pre-trained model
    'dataset_path': '/opt/ml/input/data/training',       # where SageMaker places the training dataset
    'val_dataset_path': '/opt/ml/input/data/validation',
    'hub_name': hub_name,
    'save_model': save_model,
    'num_train_epochs': 20,                              # number of training epochs
    'instruction_base_model': instruction_base_model,
    'full_instruct_model': full_instruct_model,
    'response_template': response_template,
    # --- optimisation ---
    'per_device_train_batch_size': 8,                    # batch size for training
    'gradient_accumulation_steps': 1,                    # number of update steps to accumulate
    'gradient_checkpointing': True,                      # saves memory at the cost of a slower backward pass
    'bf16': True,                                        # use bfloat16 precision
    'tf32': True,                                        # use tf32 precision
    'learning_rate': 2e-5,                               # learning rate
    #'max_grad_norm': 0.3,                               # maximum norm (for gradient clipping)
    'warmup_ratio': 0.03,                                # warmup ratio
    "lr_scheduler_type": "constant",                     # learning rate scheduler
    # --- checkpointing, evaluation and logging ---
    'save_strategy': "steps",
    'evaluation_strategy': "steps",
    'save_steps': 0.05,                                  # presumably a fraction of total steps - confirm in the train script
    'eval_steps': 0.05,
    "logging_steps": 10,                                 # log every x steps
    'merge_adapters': True,                              # whether to merge LoRA into the model (needs more memory)
    'use_flash_attn': True,                              # whether to use Flash Attention
    # Output directory for assets saved during training; could be used for
    # checkpointing. The final trained model is always saved to S3 at the end of training.
    'output_dir': '/tmp/run',
    'load_best_model_at_end': True,
    'save_total_limit': 1,
}

# Forward the locally stored Hugging Face token, if any, to the training job.
_hf_token = HfFolder.get_token()
if _hf_token is not None:
    hyperparameters['hf_token'] = _hf_token

In [None]:
from sagemaker.huggingface import HuggingFace

# Name of the training job: <base model tag>-3class-k<fold>.
job_name = '{}-3class-k{}'.format(base_model, k)
print(job_name)

# Settings for the SageMaker Hugging Face estimator, collected in one place.
estimator_settings = dict(
    # training code
    entry_point='train_aws.py',            # train script
    source_dir='scripts',                  # directory with all files needed for training
    # compute
    instance_type='ml.g5.12xlarge',        # instance type used for the training job
    instance_count=1,                      # number of instances used for training
    max_run=2 * 24 * 60 * 60,              # maximum runtime in seconds (days * hours * minutes * seconds)
    volume_size=300,                       # size of the EBS volume in GB
    # job identity and permissions
    base_job_name=job_name,                # name of the training job
    role=role,                             # IAM role the job uses to access AWS resources, e.g. S3
    # container versions
    transformers_version='4.28',           # transformers version used in the training job
    pytorch_version='2.0',                 # pytorch version used in the training job
    py_version='py310',                    # python version used in the training job
    # runtime configuration
    hyperparameters=hyperparameters,       # hyperparameters passed to the training job
    environment={"HUGGINGFACE_HUB_CACHE": "/tmp/.cache"},  # cache models in /tmp
    disable_output_compression=True,       # skip output compression to save training time and cost
)

# Create the estimator.
huggingface_estimator = HuggingFace(**estimator_settings)

In [None]:
# Input channels for the job: each channel name is mapped to the S3 URI of the
# processed dataset that was uploaded for it.
data = dict(
    training=training_destination_path,
    validation=validation_destination_path,
)

# Start the training job on those inputs; wait=True asks fit() to wait for it.
huggingface_estimator.fit(data, wait=True)

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics recorded for the most recent training job of the estimator
# and show the first ten rows.
latest_job = huggingface_estimator.latest_training_job
training_job_name = latest_job.job_name

job_analytics = TrainingJobAnalytics(training_job_name=training_job_name)
df = job_analytics.dataframe()
df.head(10)