# An Endpoint using BERT large model (uncased) with the MS_MARCO dataset

---

## Preparation

In [4]:
!pip install -qU --upgrade pip

[0m

In [5]:
# cell 01
import re

import boto3
import sagemaker
from sagemaker import get_execution_role

# Default S3 bucket of the current SageMaker session; the dataset paths
# further down are built from it.
bucket = sagemaker.Session().default_bucket()

# IAM execution role; it is handed to the training estimator later on.
role = get_execution_role()

Now let's bring in the Python libraries that we'll use

In [6]:
# cell 02
!pip install -qU datasets
import numpy as np                                # For matrix operations and numerical processing
import pandas as pd                               # For munging tabular data
import matplotlib.pyplot as plt                   # For charts and visualizations
from IPython.display import Image                 # For displaying images in the notebook
from IPython.display import display               # For displaying outputs in the notebook
from time import gmtime, strftime                 # For labeling SageMaker models, endpoints, etc.
import sys                                        # For writing outputs to notebook
import math                                       # For ceiling function
import json                                       # For parsing hosting outputs
import os                                         # For manipulating filepath names
import sagemaker 
import zipfile     # Amazon SageMaker's Python SDK provides many helper functions
from datasets import load_dataset

[0m

---

## Data


### STEP 1 : Importation of dataset from S3 bucket
Let's start by importing the dataset from S3 bucket

In [7]:
# All three splits live under the same key prefix and follow the layout
# s3://<bucket>/<prefix>/<split>/<split>.csv
prefix = 'final_project'
train_path, validation_path, test_path = (
    f"s3://{bucket}/{prefix}/{split}/{split}.csv"
    for split in ("train", "validation", "test")
)
print(train_path,'\n',validation_path,'\n',test_path)

s3://sagemaker-us-east-1-834242159264/final_project/train/train.csv 
 s3://sagemaker-us-east-1-834242159264/final_project/validation/validation.csv 
 s3://sagemaker-us-east-1-834242159264/final_project/test/test.csv


In [8]:
def _read_split(csv_path):
    """Read one split from its CSV location and drop the 'Unnamed: 0' column."""
    return pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])


train = _read_split(train_path)
test = _read_split(test_path)
validation = _read_split(validation_path)

In [9]:
train.head(3)

Unnamed: 0,answers,query,query_id,query_type,context
0,"Approx $4,050.",average cost of assisted living in illinois,75327,1,The average cost of Assisted Living in Illinoi...
1,It is a type of liquid that is used on asphalt...,what is sealcoating,51110,2,Sealcoating is a type of liquid that is used o...
2,A demolition order requires you to leave the p...,what is an order to demolish,95687,2,1 A demolition order requires you to leave the...


### STEP 2 : Tokenization of the dataset
Let's now tokenize the dataset.

In [12]:
!pip install -qU torchvision
!pip uninstall tensorflow --yes
!pip install -qU tensorflow==2.2.0
!pip uninstall PyTorch --yes
!pip install -qU PyTorch
!pip uninstall transformers --yes
!pip install -qU transformers==3.3.1
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

[0m  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[6 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File "<string>", line 36, in <module>
  [31m   [0m   File "<pip-setuptools-caller>", line 34, in <module>
  [31m   [0m   File "/tmp/pip-install-j7d0ie1r/pytorch_07a9461b95404b00aeb0bab34ce6af94/setup.py", line 15, in <module>
  [31m   [0m     raise Exception(message)
  [31m   [0m Exception: You tried to install "pytorch". The package named for PyTorch is "torch"
  [31m   [0m [31m[end of output][0m
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[31m  ERROR: Failed building wheel for PyTorch[0m[31m
[0m[31mERROR: Could not build wheels for PyTorch, which is required to install pyproject.toml-based projects[0m[31m
[0mFound existing in

ImportError: cannot import name 'AutoModelForQuestionAnswering' from 'transformers' (/opt/conda/lib/python3.7/site-packages/transformers/__init__.py)

In [None]:
def _tokenize_split(frame):
    """Tokenize one split, pairing each row's 'context' with its 'query'.

    Both columns are converted to plain Python lists and encoded together
    with truncation and padding enabled.
    """
    # NOTE(review): the context is passed as the first sequence and the query
    # as the second; SQuAD-style checkpoints are usually fed the question
    # first — confirm this order is intended.
    contexts = frame['context'].values.tolist()
    queries = frame['query'].values.tolist()
    return tokenizer(contexts, queries, truncation=True, padding=True)


train_encodings = _tokenize_split(train)
test_encodings = _tokenize_split(test)
validation_encodings = _tokenize_split(validation)

In [None]:
def add_answers(encodings, answers):
    """Store ``answers`` under the 'answer' key of ``encodings``, in place.

    Args:
        encodings: mutable mapping (e.g. the tokenizer output) to extend.
        answers: the value to store; an existing 'answer' entry is overwritten.

    Returns:
        None. ``encodings`` is modified in place.
    """
    encodings['answer'] = answers
# Attach each split's 'answers' column (as a plain list) to its encodings.
for _encodings, _frame in (
    (train_encodings, train),
    (test_encodings, test),
    (validation_encodings, validation),
):
    add_answers(_encodings, _frame['answers'].values.tolist())

In [None]:
import tensorflow as tf

def GetRightFormat(encodings):
    """Convert tokenizer encodings into a ``tf.data.Dataset`` of (features, labels).

    Args:
        encodings: mapping that provides the keys 'input_ids',
            'attention_mask' and 'answer'.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, labels)`` pairs:
        ``features`` holds 'input_ids' and 'attention_mask', ``labels`` holds
        'answer'.

    Raises:
        KeyError: if one of the required keys is missing from ``encodings``.
    """
    features = {key: encodings[key] for key in ['input_ids', 'attention_mask']}
    labels = {key: encodings[key] for key in ['answer']}
    # Return the dataset: the previous version built it but fell off the end of
    # the function, so every caller received None.
    return tf.data.Dataset.from_tensor_slices((features, labels))
    
# Convert the encodings of every split, keeping the train / test / validation order.
train_dataset, test_dataset, validation_dataset = [
    GetRightFormat(split_encodings)
    for split_encodings in (train_encodings, test_encodings, validation_encodings)
]

---
# Model Training

In order to use SageMaker to fit our algorithm, we create a Hugging Face `estimator` (the `HuggingFace` class from `sagemaker.huggingface`) that defines how to use the container for training. This includes the configuration we need to invoke SageMaker training:

- `entry_point (str)` - the training script to run; here, a script that fine-tunes a model from the Hugging Face Hub
- `source_dir (str)` - the directory inside the git repository where this script is located
- `instance_type (str)` - the type of machine to use for training.
- `instance_count (int)` - the number of machines to use for training.
- `role (str)` - the SageMaker IAM role obtained previously
- `git_config (dict)` - dictionary holding the URL and the branch of the git repository that contains the transformers scripts
- `transformers_version (str)` - the transformers version used to run the scripts
- `pytorch_version (str)` - the PyTorch version used to run the scripts
- `py_version (str)` - the Python version used to run the scripts
- `hyperparameters (dict)` - dictionary containing all the hyperparameter values



In [None]:
from sagemaker.huggingface import HuggingFace

# Hyperparameters handed to the estimator and, through it, to the training script.
hyperparameters = {
    'model_name_or_path': 'bert-large-uncased-whole-word-masking',
    # NOTE(review): verify that this identifier resolves on the Hugging Face Hub
    # and that the training script can consume the dataset's schema.
    'dataset_name': 'MS_Marco',
    'do_train': True,
    'do_eval': True,
    'fp16': True,
    'per_device_train_batch_size': 4,
    'per_device_eval_batch_size': 4,
    'num_train_epochs': 2,
    'max_seq_length': 384,
    # NOTE(review): both num_train_epochs and max_steps are set; in the
    # Transformers Trainer a positive max_steps presumably takes precedence —
    # confirm which limit is intended.
    'max_steps': 100,
    'pad_to_max_length': True,
    'doc_stride': 128,
    'output_dir': '/opt/ml/model'
}

# configuration for running training on smdistributed Data Parallel
#distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}

# git configuration to download our fine-tuning script
git_config = {'repo': 'https://github.com/huggingface/transformers.git', 'branch': 'v4.26.0'}

# instance configurations
#instance_type='ml.p3dn.24xlarge'
instance_type = 'ml.p3.2xlarge'
#instance_count=2
instance_count = 1
volume_size = 200

# metric definition to extract the results
# Raw strings keep the backslash in `\D` literal. In a normal string `\D` is an
# invalid escape sequence that newer Python versions warn about. The resulting
# pattern text is unchanged.
metric_definitions = [
    {"Name": "train_runtime", "Regex": r"train_runtime.*=\D*(.*?)$"},
    {'Name': 'train_samples_per_second', 'Regex': r"train_samples_per_second.*=\D*(.*?)$"},
    {'Name': 'epoch', 'Regex': r"epoch.*=\D*(.*?)$"},
    {'Name': 'f1', 'Regex': r"f1.*=\D*(.*?)$"},
    {'Name': 'exact_match', 'Regex': r"exact_match.*=\D*(.*?)$"},
]

In [None]:
# estimator
huggingface_estimator = HuggingFace(
    # training script and where to fetch it from
    entry_point='run_qa.py',
    source_dir='./examples/pytorch/question-answering',
    git_config=git_config,
    # metrics to extract from the training logs
    metric_definitions=metric_definitions,
    # compute resources
    instance_type=instance_type,
    instance_count=instance_count,
    volume_size=volume_size,
    role=role,
    # framework versions
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    #distribution= distribution,
    hyperparameters=hyperparameters,
)


In [None]:
# Start the training job.
# No input channels are passed to fit(); the hyperparameters defined earlier
# carry a 'dataset_name' entry instead.
# NOTE(review): presumably the training script loads the dataset by that name,
# so the CSV splits read from S3 above are not used by this job — confirm.
huggingface_estimator.fit()