# An Endpoint using BERT large model (uncased) with the MS_MARCO dataset

# Part 2: TRAINING

---

## Preparation

In [40]:
!pip install -qU --upgrade pip
!pip install -qU --upgrade boto3

[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aiobotocore 2.4.2 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.152 which is incompatible.
awscli 1.27.111 requires botocore==1.29.111, but you have botocore 1.29.152 which is incompatible.
awscli 1.27.111 requires rsa<4.8,>=3.1.2, but you have rsa 4.9 which is incompatible.
sagemaker 2.145.0 requires importlib-metadata<5.0,>=1.4.0, but you have importlib-metadata 6.3.0 which is incompatible.
sagemaker 2.145.0 requires PyYAML==5.4.1, but you have pyyaml 5.3 which is incompatible.[0m[31m
[0m

In [41]:
# cell 01
# Resolve the two SageMaker handles used throughout the notebook:
# the session's default S3 bucket and the IAM execution role.
import re

import boto3
import sagemaker
from sagemaker import get_execution_role

# Default bucket of the current SageMaker session
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# Define IAM role
role = get_execution_role()

Now let's bring in the Python libraries that we'll use

In [42]:
# cell 02
# Install Hugging Face `datasets` first: it is imported at the bottom of this cell.
!pip install -qU datasets
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker                                  # Amazon SageMaker's Python SDK provides many helper functions
import zipfile                                    # For working with zip archives
from datasets import load_dataset                 # For loading the cleaned CSV as a Hugging Face dataset

[0m

---

## Cleaned Data Importation


In [98]:
# Load the cleaned MS MARCO CSV and hold out 20% of the rows as a test split.
# A fixed seed makes the shuffle reproducible, so the row indices inspected
# later in the notebook refer to the same examples after a kernel restart.
dataset = load_dataset("csv", data_files='MS_Marco.csv', split='train').train_test_split(test_size=0.2, seed=42)
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-f1442b4f802b2eb3/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'answers', 'query', 'query_id', 'context'],
        num_rows: 79004
    })
    test: Dataset({
        features: ['Unnamed: 0', 'answers', 'query', 'query_id', 'context'],
        num_rows: 19751
    })
})

In [99]:
# Drop the "Unnamed: 0" column (presumably an index column written by the CSV
# export - confirm). The guard keeps the cell idempotent: re-running it after
# the column is already gone no longer raises.
if "Unnamed: 0" in dataset["train"].column_names:
    dataset = dataset.remove_columns("Unnamed: 0")
dataset

DatasetDict({
    train: Dataset({
        features: ['answers', 'query', 'query_id', 'context'],
        num_rows: 79004
    })
    test: Dataset({
        features: ['answers', 'query', 'query_id', 'context'],
        num_rows: 19751
    })
})

In [45]:
# Show the raw `answers` value of the first training example.
# Indexing the row first reads a single example instead of materialising the
# whole `answers` column just to take its first element.
dataset["train"][0]["answers"]

"{'text': 'For this reason, the term fiber is common descriptive term for muscle cells. Muscle cells, therefore are also muscle fibers or more specifically myofibers. There are three types of myofibers: skeletal. cardiac. smooth. Skeletal and cardiac muscles are classified as striated types while smooth is a non-striated type. The three muscle types we will view are then: Skeletal-voluntary muscles attached to skeletal elements and cartilage. Cardiac-involuntary muscles making up the myocardium of the heart. Smooth-involuntary muscles in the walls of hollow organs, blood vessels, and all other body locales where muscles perform work(i.e.', 'answer_start': [2397], 'answer_end': [3031]}"

In [46]:
import ast


def _parse_answers(example):
    """Turn the stringified `answers` field of one example into a Python object.

    The CSV stores `answers` as the repr of a dict; `ast.literal_eval` safely
    parses that literal. Values that are no longer strings (the cell has
    already been run) are passed through unchanged, so the cell is idempotent.
    """
    raw = example["answers"]
    return {"answers": ast.literal_eval(raw) if isinstance(raw, str) else raw}


dataset = dataset.map(_parse_answers)
# Row-first indexing: read one example rather than the whole column.
dataset["train"][0]["answers"]

Map:   0%|          | 0/79004 [00:00<?, ? examples/s]

Map:   0%|          | 0/19751 [00:00<?, ? examples/s]

{'answer_end': [3031],
 'answer_start': [2397],
 'text': 'For this reason, the term fiber is common descriptive term for muscle cells. Muscle cells, therefore are also muscle fibers or more specifically myofibers. There are three types of myofibers: skeletal. cardiac. smooth. Skeletal and cardiac muscles are classified as striated types while smooth is a non-striated type. The three muscle types we will view are then: Skeletal-voluntary muscles attached to skeletal elements and cartilage. Cardiac-involuntary muscles making up the myocardium of the heart. Smooth-involuntary muscles in the walls of hollow organs, blood vessels, and all other body locales where muscles perform work(i.e.'}

---
# Tokenization of the dataset

In [47]:
# Reinstall the deep-learning stack used locally for tokenization.
# The commands are order-dependent (uninstall, then install a pinned version).
!pip install -qU torchvision
!pip uninstall tensorflow --yes
!pip install -qU tensorflow==2.11.0
# NOTE(review): the recorded output shows no "Found existing installation"
# line for this command, so it appears to be a no-op; the PyPI distribution
# of PyTorch is named `torch` - confirm what was intended here.
!pip uninstall PyTorch --yes
#!pip install -qU PyTorch
!pip uninstall transformers --yes
# NOTE(review): transformers is pinned to 3.3.1 here, while the training job
# configured later in this notebook uses transformers_version='4.26.0' and the
# v4.26.0 branch of the scripts - confirm the mismatch is intentional.
!pip install -qU transformers==3.3.1

[0mFound existing installation: tensorflow 2.11.0
Uninstalling tensorflow-2.11.0:
  Successfully uninstalled tensorflow-2.11.0
[0mFound existing installation: transformers 3.3.1
Uninstalling transformers-3.3.1:
  Successfully uninstalled transformers-3.3.1
[0m

In [113]:
from transformers import AutoTokenizer

# Tokenizer for the BERT-large (uncased, whole-word-masking) checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking")


def tokenize(batch):
    """Encode the `context` and `query` fields of `batch` as a text pair.

    `context` is passed as the first segment and `query` as the second.
    Sequences are truncated (`truncation=True`) and padded with
    `padding="max_length"`. Returns whatever the tokenizer call returns.

    NOTE(review): question-answering pipelines usually pass the question
    first and the context second - confirm this order is intended.
    """
    first_segment = batch["context"]
    second_segment = batch["query"]
    return tokenizer(
        first_segment,
        second_segment,
        padding="max_length",
        truncation=True,
    )

In [114]:
# Tokenize both splits.
# `batched=True` passes lists of contexts/queries to `tokenize`, so the
# tokenizer is called once per batch instead of once per example. Every
# example is padded/truncated independently, so the resulting rows are the same.
train_dataset = dataset["train"].map(tokenize, batched=True)
test_dataset = dataset["test"].map(tokenize, batched=True)

Map:   0%|          | 0/79004 [00:00<?, ? examples/s]

Map:   0%|          | 0/19751 [00:00<?, ? examples/s]

In [117]:
# Sanity check: display the class of the tokenized training split.
type(train_dataset)

datasets.arrow_dataset.Dataset

In [126]:
import botocore

# Upload the tokenized splits to the session's default bucket.
# Layout: s3://<bucket>/samples/datasets/<dataset_name>/{train,test}
dataset_name = 'dataset'
s3_prefix = f'samples/datasets/{dataset_name}'
training_input_path = f's3://{bucket}/{s3_prefix}/train'
test_input_path = f's3://{bucket}/{s3_prefix}/test'

# `save_to_disk` resolves `s3://` URLs through fsspec/s3fs and picks up the
# default AWS credentials, exactly as the argument-less `S3FileSystem()` did.
# The `datasets.filesystems.S3FileSystem` helper and the `fs=` argument are
# deprecated (since datasets 2.8, in favour of `storage_options`), and
# `datasets` is installed unpinned in this notebook, so they are avoided here.
train_dataset.save_to_disk(training_input_path)
test_dataset.save_to_disk(test_input_path)



Saving the dataset (0/2 shards):   0%|          | 0/79004 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/19751 [00:00<?, ? examples/s]

---
# Model Training

To train our model with SageMaker, we create a Hugging Face `estimator` (from the SageMaker Python SDK) that defines how to use the container to train. This includes the configuration we need to invoke SageMaker training:

- `entry_point (str)` - the training script to run, which fine-tunes a model from the Hugging Face Hub
- `source_dir (str)` - the directory inside the git repository where this script is located
- `instance_type (str)` - the type of machine to use for training
- `instance_count (int)` - the number of machines to use for training
- `role (str)` - the SageMaker IAM role obtained previously
- `git_config (dict)` - a dictionary with the URL and the branch of the git repository containing the transformers scripts
- `transformers_version (str)` - the transformers version used to run the scripts
- `pytorch_version (str)` - the PyTorch version used to run the scripts
- `py_version (str)` - the Python version used to run the scripts
- `hyperparameters (dict)` - a dictionary containing all the hyperparameter values



In [127]:
from transformers import AutoModelForQuestionAnswering

# Load the SQuAD-fine-tuned BERT-large question-answering checkpoint locally.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook, and the training job is configured with a different checkpoint
# ('bert-large-uncased-whole-word-masking') - confirm this download is needed.
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

In [128]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters forwarded to the fine-tuning script as command-line arguments.
# NOTE(review): no `dataset_name` / `train_file` / `validation_file` entry is
# set here; the question-answering example script appears to require one of
# them - confirm how the uploaded S3 channels are meant to reach the script.
hyperparameters = {
    'model_name_or_path': 'bert-large-uncased-whole-word-masking',
    'do_train': True,
    'do_eval': True,
    'fp16': True,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    # A positive `max_steps` takes precedence over `num_train_epochs` in the
    # Hugging Face Trainer, so training stops after 100 optimizer steps.
    'max_steps': 100,
    'pad_to_max_length': True,
    'doc_stride': 128,
    'output_dir': '/opt/ml/model'
}

# configuration for running training on smdistributed Data Parallel
#distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.26.0'}

# instance configurations
#instance_type='ml.p3dn.24xlarge'
instance_type = 'ml.p3.2xlarge'
#instance_count=2
instance_count = 1
volume_size = 200

# metric definition to extract the results
# Raw strings keep `\D` as a regex token; in a normal string literal it is an
# invalid escape sequence that Python warns about. The pattern text is unchanged.
metric_definitions = [
    {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
    {'Name': 'train_samples_per_second', 'Regex': r"train_samples_per_second.*=\D*(.*?)$"},
    {'Name': 'epoch', 'Regex': r"epoch.*=\D*(.*?)$"},
    {'Name': 'f1', 'Regex': r"f1.*=\D*(.*?)$"},
    {'Name': 'exact_match', 'Regex': r"exact_match.*=\D*(.*?)$"}]

In [129]:
# estimator
# Training job definition: the script to run, where to fetch it from,
# the framework versions of the container, and the hardware to run on.
huggingface_estimator = HuggingFace(
    # training script, pulled from the transformers git repository
    git_config=git_config,
    source_dir='./examples/pytorch/question-answering',
    entry_point='run_qa.py',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    # framework versions of the training container
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    # permissions and infrastructure
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=volume_size,
    #distribution= distribution,
)


In [None]:
# starting the train job
# The two S3 prefixes are passed as the "train" and "test" input channels.
huggingface_estimator.fit(
    inputs={
        "train": training_input_path,
        "test": test_input_path,
    }
)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2023-06-12-22-41-30-557


2023-06-12 22:41:42 Starting - Starting the training job...
2023-06-12 22:42:09 Starting - Preparing the instances for training.........
2023-06-12 22:43:24 Downloading - Downloading input data..

---
# Deployment

Once the training job is complete, deploy your fine-tuned model by calling deploy() with the number of instances and instance type:

In [None]:
# Deploy the fine-tuned model to a real-time endpoint on one GPU instance.
# The instance type must be passed by keyword: a positional argument after a
# keyword argument is a SyntaxError.
predictor = huggingface_estimator.deploy(initial_instance_count=1, instance_type="ml.g4dn.xlarge")

Call predict() on your data:

In [116]:
# Show the query of test example 100, used below as the sample request.
# Row-first indexing reads one example instead of the whole `query` column.
dataset["test"][100]["query"]

'how wide is a doorway'

In [None]:
# Build a question-answering request from test example 100.
# An extractive QA model needs both the question and the passage to extract
# the answer from, so the example's `context` is sent along with its `query`.
qa_sample = dataset["test"][100]
qa_input = {
    "inputs": {
        "question": qa_sample["query"],
        "context": qa_sample["context"],
    }
}

predictor.predict(qa_input)