# An Endpoint using DistilBert (uncased) with the MS_MARCO dataset

# Part 2: TRAINING

---

## Preparation

In [2]:
!pip install -qU --upgrade pip
!pip install -qU --upgrade boto3
!pip install -qU --upgrade tensorflow
!pip install -qU torchvision
!pip uninstall tensorflow --yes
!pip install -qU tensorflow==2.11.0
!pip uninstall transformers --yes
!pip install -qU --upgrade transformers
!pip install "accelerate[sagemaker]" --upgrade

[0mFound existing installation: tensorflow 2.11.0
Uninstalling tensorflow-2.11.0:
  Successfully uninstalled tensorflow-2.11.0
[0mFound existing installation: transformers 4.30.2
Uninstalling transformers-4.30.2:
  Successfully uninstalled transformers-4.30.2
Collecting accelerate[sagemaker]
  Using cached accelerate-0.20.3-py3-none-any.whl (227 kB)
Collecting importlib-metadata<5.0,>=1.4.0 (from sagemaker->accelerate[sagemaker])
  Using cached importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting pyyaml (from accelerate[sagemaker])
  Using cached PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
Installing collected packages: pyyaml, importlib-metadata, accelerate
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 5.3
[31mERROR: Cannot uninstall 'PyYAML'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m
[0m

In [3]:
# cell 01
import sagemaker
import boto3
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::376887058029:role/sagemaker-immersion-day-SageMakerExecutionRole-IRLPTS5YE7LD
sagemaker bucket: sagemaker-us-east-1-376887058029
sagemaker session region: us-east-1


Now let's bring in the Python libraries that we'll use

In [4]:
# cell 02
!pip install -qU datasets
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import zipfile     # Amazon SageMaker's Python SDK provides many helper functions


%matplotlib inline
%config InlineBackend.figure_format='retina'
import sagemaker
from sagemaker.pytorch import PyTorch
from sagemaker.debugger import ProfilerConfig, DebuggerHookConfig, Rule, ProfilerRule, rule_configs
import sagemaker.huggingface
from sagemaker.huggingface import HuggingFace
from datasets import load_dataset


import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from textwrap import wrap

import boto3
import pprint
import time

[0m

---

## Cleaned Data Importation


In [5]:
df = pd.read_csv('MS_Marco.csv')
df = df[:15000]
df.to_csv('MS_Marco_small.csv',index=False)

In [6]:
dataset = load_dataset("csv", data_files='MS_Marco.csv',split='train').train_test_split(test_size=0.5)
dataset

Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-c803a60da302cfb3/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


DatasetDict({
    train: Dataset({
        features: ['answers', 'query', 'query_id', 'context'],
        num_rows: 19944
    })
    test: Dataset({
        features: ['answers', 'query', 'query_id', 'context'],
        num_rows: 19944
    })
})

In [7]:
dataset["train"]["answers"][1]

"{'text': '£14,250.', 'answer_start': [4242]}"

In [8]:
import ast
dataset = dataset.map(lambda x: {"answers": ast.literal_eval(x["answers"])})
dataset["train"]["answers"][1]

Map:   0%|          | 0/19944 [00:00<?, ? examples/s]

Map:   0%|          | 0/19944 [00:00<?, ? examples/s]

{'answer_start': [4242], 'text': '£14,250.'}

---
# Tokenization of the dataset

In [9]:
from transformers import AutoTokenizer

tokenizer_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def tokenize(batch):
    return tokenizer(batch["context"], batch["query"], truncation=True, padding=True)

In [10]:
max_length = 384
stride = 128

def preprocess_training(dataset):
    questions = [q.strip() for q in dataset["query"]]
    inputs = tokenizer(
        questions,
        dataset["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = dataset["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [11]:
train_dataset = dataset["train"].map(
    preprocess_training,
    batched=True,
    remove_columns=dataset["train"].column_names,)
test_dataset = dataset["test"].map(
    preprocess_training,
    batched=True,
    remove_columns=dataset["test"].column_names,)

Map:   0%|          | 0/19944 [00:00<?, ? examples/s]

Map:   0%|          | 0/19944 [00:00<?, ? examples/s]

In [12]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 61283
})

In [13]:
from datasets.filesystems import S3FileSystem
prefix = 'final_project'
s3 = S3FileSystem()

# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{prefix}/train'
train_dataset.save_to_disk(training_input_path,fs=s3)

# save test_dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{prefix}/test'
test_dataset.save_to_disk(test_input_path,fs=s3)



Saving the dataset (0/1 shards):   0%|          | 0/61283 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/61265 [00:00<?, ? examples/s]

---
# Model Training

In order to use SageMaker to fit our algorithm, we create an [`estimator`] from Huggingface library that defines how to use the container to train. This includes the configuration we need to invoke SageMaker training:

- `entry point (str)` - the script we enter to allows to fine-tune any model from huggingface hub
- `source_dir (str)` - the directory where is located this script inside the git repository
- `instance_type (str)` - the type of machine to use for training.
- `instance_count (int)` - number of machines to use for training.
- `role (str)` - SageMaker IAM role as obtained previously
- `git_config (dict)` - dictionnary that has the link and the branch of the git repository containing the transformers scripts
- `transformers_version (str)` - the transformer version to run the different scripts
- `pytorch_version (str)` - the pytorch version to run the different scripts
- `py_version (str)` - the python version to run the scripts
- `hyperparameters (dict)` - the dictionnary containg all the parameters' value



## Initialize Training

In [15]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to

In [16]:
hyperparameters={
    'epochs': 3,
    'train_batch_size': 32,
    'model_name': model_name,
    'tokenizer_name': tokenizer_name,
    'output_dir':'/opt/ml/checkpoints',
 }

In [17]:
metric_definitions=[
    {'Name': 'loss', 'Regex': "'loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'learning_rate', 'Regex': "'learning_rate': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_loss', 'Regex': "'eval_loss': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_accuracy', 'Regex': "'eval_accuracy': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_f1', 'Regex': "'eval_f1': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_precision', 'Regex': "'eval_precision': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_recall', 'Regex': "'eval_recall': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_runtime', 'Regex': "'eval_runtime': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'eval_samples_per_second', 'Regex': "'eval_samples_per_second': ([0-9]+(.|e\-)[0-9]+),?"},
    {'Name': 'epoch', 'Regex': "'epoch': ([0-9]+(.|e\-)[0-9]+),?"}]

In [18]:
from sagemaker.huggingface import HuggingFace

git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.26.0'}

huggingface_estimator = HuggingFace(entry_point='run_mlm.py',
                            source_dir='./examples/pytorch/language-modeling',
                            git_config=git_config,
                            instance_type='ml.p3.2xlarge',
                            instance_count=1,
                            role=role,
                            transformers_version='4.6', 
                            pytorch_version='1.7',
                            py_version='py36',
                            hyperparameters = hyperparameters,
                            metric_definitions=metric_definitions,
                            max_run=36000, # expected max run in seconds
                        )

## Run Training

In [14]:
model_name = 'distilbert-base-uncased'
import datetime
ct = datetime.datetime.now() 
current_time = str(ct.now()).replace(":", "-").replace(" ", "-")[:19]
training_job_name=f'finetune-{model_name}-{current_time}'
print( training_job_name )

finetune-distilbert-base-uncased-2023-06-16-01-00-25


In [19]:
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path}, wait=False, job_name=training_job_name )

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: finetune-distilbert-base-uncased-2023-06-16-01-00-25


In [20]:
sess.wait_for_job(training_job_name)


2023-06-16 01:00:44 Starting - Starting the training job....
2023-06-16 01:01:10 Starting - Preparing the instances for training................
2023-06-16 01:02:34 Downloading - Downloading input data....
2023-06-16 01:02:59 Training - Downloading the training image.............................
2023-06-16 01:05:30 Training - Training image download completed. Training in progress..........
2023-06-16 01:06:23 Uploading - Uploading generated training model
2023-06-16 01:06:29 Failed - Training job failed


UnexpectedStatusException: Error for Training job finetune-distilbert-base-uncased-2023-06-16-01-00-25: Failed. Reason: AlgorithmError: InstallRequirementsError:
Command "/opt/conda/bin/python3.6 -m pip install -r requirements.txt"
ERROR: Could not find a version that satisfies the requirement accelerate>=0.12.0 (from versions: 0.0.1, 0.1.0, 0.2.0, 0.2.1, 0.3.0, 0.4.0, 0.5.0, 0.5.1, 0.6.0, 0.6.1, 0.6.2, 0.7.0, 0.7.1, 0.8.0, 0.9.0)
ERROR: No matching distribution found for accelerate>=0.12.0, exit code: 1

In [22]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import TFAutoModelForQuestionAnswering

tf_model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.fit(
    train_tokenized,
    validation_data=test_tokenized,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForQuestionAnswering were not initialized from the PyTorch model and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it

AttributeError: 'DistilBertForQuestionAnswering' object has no attribute 'compile'

## Results

In [None]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=training_job_name).dataframe()
df.head(10)

In [None]:
evals = df[df.metric_name.isin(['eval_accuracy','eval_precision', 'eval_f1'])]
losses = df[df.metric_name.isin(['loss', 'eval_loss'])]

sns.lineplot(
    x='timestamp', 
    y='value', 
    data=evals, 
    style='metric_name',
    markers=True,
    hue='metric_name'
)

ax2 = plt.twinx()
sns.lineplot(
    x='timestamp', 
    y='value', 
    data=losses, 
    hue='metric_name',
    ax=ax2)

---
# Deployment

Once the training job is complete, deploy your fine-tuned model by calling deploy() with the number of instances and instance type:

In [None]:
predictor = huggingface_estimator.deploy(initial_instance_count=1, instance_type="ml.m5.xlarge", endpoint_name=training_job_name)

Call predict() on your data:

In [None]:
dataset["test"]["query"][100]

In [None]:
qa_input = {"inputs": dataset["test"]["query"][100]}

predictor.predict(qa_input)