In [2]:
!pip install "sagemaker>=2.31.0" "transformers==4.6.1" "datasets[s3]==1.5.0" --upgrade

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [25]:
import pickle
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import datasets

In [26]:
data_dir = '../data/bert' # The folder we will use for storing data
os.makedirs(data_dir, exist_ok=True)  # ensure cache directory exists

cache_dir = os.path.join("../cache", "bert")  # where to store cache files
os.makedirs(cache_dir, exist_ok=True)  # ensure cache directory exists

In [27]:
datasets.__version__

'1.5.0'

# 1. Preparing Data

## 1.1. Loading Dataset

In [28]:
from datasets import load_dataset

In [29]:
# train_df = pickle.load(open('../data/processed/train_df.pkl', 'rb'))
# val_df = pickle.load(open('../data/processed/val_df.pkl', 'rb'))
# test_df = pickle.load(open('../data/processed/test_df.pkl', 'rb'))

# train_df = pd.concat([train_df, val_df]).reset_index(drop = True)
# val_df = None

# train_df = train_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
# # val_df = val_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})
# test_df = test_df.rename(columns = {'cleaned_y': 'label', 'all_text': 'sentence'})

# train_df[['label', 'sentence']].to_csv(os.path.join(data_dir, 'train_bert.csv'), index = False, header = True)
# # val_df[['label', 'sentence']].to_csv(os.path.join(data_dir, 'val_bert.csv'), index = False, header = True)
# test_df[['label', 'sentence']].to_csv(os.path.join(data_dir, 'test_bert.csv'), index = False, header = True)

In [30]:
train_dataset = load_dataset('csv', data_files=os.path.join(data_dir, 'train_bert.csv'))['train']
test_dataset = load_dataset('csv', data_files=os.path.join(data_dir, 'test_bert.csv'))['train']

Using custom data configuration default-8b650fbfd11b1ec5
Reusing dataset csv (/home/ec2-user/.cache/huggingface/datasets/csv/default-8b650fbfd11b1ec5/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)
Using custom data configuration default-df64ce23991a7e22
Reusing dataset csv (/home/ec2-user/.cache/huggingface/datasets/csv/default-df64ce23991a7e22/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [31]:
test_dataset

Dataset({
    features: ['label', 'sentence'],
    num_rows: 15000
})

## 1.2 Process Dataset

In [32]:
# from datasets import load_dataset
from transformers import AutoTokenizer

# tokenizer used in preprocessing
tokenizer_name = 'distilbert-base-uncased'

# dataset used
# dataset_name = 'imdb'

# s3 key prefix for the data
s3_prefix = 'sagemaker/stackoverflow-question-quality'

In [33]:
# download tokenizer
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# tokenizer helper function
def tokenize(batch):
    return tokenizer(batch['sentence'], padding='max_length', truncation=True)

In [12]:
train_dataset

Dataset({
    features: ['label', 'sentence'],
    num_rows: 45000
})

In [13]:
# tokenize dataset
train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# set format for pytorch
train_dataset =  train_dataset.rename_column("label", "labels")
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset = test_dataset.rename_column("label", "labels")
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

HBox(children=(FloatProgress(value=0.0, max=45.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [14]:
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'sentence'],
    num_rows: 45000
})

## 1.3 Uploading Data

In [15]:
import sagemaker

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it not exists
sagemaker_session_bucket=None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::997893341280:role/service-role/AmazonSageMaker-ExecutionRole-20210707T190726
sagemaker bucket: sagemaker-us-east-1-997893341280
sagemaker session region: us-east-1


In [16]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()  

# save train_dataset to s3
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train_bert'
train_dataset.save_to_disk(training_input_path,fs=s3)

# save test_dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test_bert'
test_dataset.save_to_disk(test_input_path,fs=s3)

In [17]:
test_input_path

's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/test_bert'

In [18]:
# from datasets import load_from_disk

# t = load_from_disk(test_input_path)

In [19]:
# # load dataset
# dataset = load_dataset(dataset_name)

# 2. Create Model

## 2.1 Model Training

In [20]:
training_input_path

's3://sagemaker-us-east-1-997893341280/sagemaker/stackoverflow-question-quality/train_bert'

In [34]:
from sagemaker.huggingface import HuggingFace

# hyperparameters, which are passed into the training job
hyperparameters={'epochs': 1,
                 'per_device_train_batch_size': 2,
                 'model_name': 'bert-base-uncased'
                 }

# create the Estimator
huggingface_estimator = HuggingFace(
        entry_point='huggingface_train.py',
        source_dir='../src_bert',
        instance_type='ml.p2.xlarge',
        instance_count=1,
        role=role,
        transformers_version='4.4',
        pytorch_version='1.6',
        py_version='py36',
        hyperparameters = hyperparameters
)

In [35]:
# starting the train job with our uploaded datasets as input
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

2021-07-10 01:22:32 Starting - Starting the training job...
2021-07-10 01:22:54 Starting - Launching requested ML instancesProfilerReport-1625880151: InProgress
...
2021-07-10 01:23:31 Starting - Preparing the instances for training............
2021-07-10 01:25:20 Downloading - Downloading input data......
2021-07-10 01:26:16 Training - Downloading the training image......................[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-10 01:30:06,665 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-10 01:30:06,698 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-10 01:30:06,708 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-10 01:30:07,129 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m
[34m/o

UnexpectedStatusException: Error for Training job huggingface-pytorch-training-2021-07-10-01-22-31-590: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 huggingface_train.py --epochs 1 --model_name bert-base-uncased --per_device_train_batch_size 2"
Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]Downloading: 100%|ââââââââââ| 570/570 [00:00<00:00, 716kB/s]
Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]Downloading:   0%|          | 1.33M/440M [00:00<00:33, 13.1MB/s]Downloading:   1%|          | 4.81M/440M [00:00<00:27, 16.1MB/s]Downloading:   2%|â         | 8.06M/440M [00:00<00:22, 19.0MB/s]Downloading:   3%|â         | 13.2M/440M [00:00<00:18, 23.4MB/s]Downloading:   4%|â         | 18.4M/440M [00:00<00:15, 28.0MB/s]Downloading:   5%|â         | 23.7M/440M [00:00<00:12, 32.7MB/s]Downloading:   7%|â         | 29.1M/440M [00:00<00:11, 37.0MB/s]Downloading:   8%|â         | 34.5M/440M [00:00<00:09, 40.9MB/s]Downloading:   9%|â         | 39.9M/440M [00:00<00:09, 44.1MB/s]Downloading:  10%|â         | 45.3M/440M [00:01<00:08, 46.7

## 2.2 Model Evaluation

In [None]:
# container image used for training job
print(f"container image used for training job: \n{huggingface_estimator.image_uri}\n")

# s3 uri where the trained model is located
print(f"s3 uri where the trained model is located: \n{huggingface_estimator.model_data}\n")

# latest training job name for this estimator
print(f"latest training job name for this estimator: \n{huggingface_estimator.latest_training_job.name}\n")

In [None]:
huggingface_transformer = huggingface_estimator.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# evaluate result:
# open(os.path.join(args.output_data_dir, "eval_results.txt"), "w")