In [4]:
import logging
import os
import tempfile

import boto3
import pandas as pd
import sagemaker
import torch
from datasets import load_dataset
from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import Session
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast

## Setup

In [2]:
# Name of the S3 bucket this notebook uploads its datasets to
sagemaker_session_bucket = "finer-replication"
# Execution role reported by the SageMaker SDK; handed to the processing and training jobs below
role = sagemaker.get_execution_role()
# SageMaker session whose default bucket is the project bucket
# NOTE(review): `sess` is not passed to any job in the visible cells — confirm whether it is still needed
sess = Session(default_bucket=sagemaker_session_bucket)

## Uploading untokenized data

In [4]:
def group_by_sentence(df):
    """Collapse token-level rows into one row per sentence.

    Rows are grouped by ``(doc_idx, sent_idx)`` and each group's tokens and
    labels are collected into lists, keeping the row order within the group.

    Args:
        df: Token-level DataFrame with ``doc_idx``, ``sent_idx``,
            ``gold_token`` and ``gold_label`` columns. It is left unmodified.

    Returns:
        DataFrame with a ``words`` column (list of tokens) and a ``labels``
        column (list of labels), one row per sentence, ordered by
        ``(doc_idx, sent_idx)``.
    """
    # Missing tokens become the string "None" (presumably the literal token
    # "None" was parsed as a missing value upstream — TODO confirm).
    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # and re-running the calling cell stays idempotent.
    tokens_filled = df.assign(gold_token=df['gold_token'].fillna("None"))

    grouped = tokens_filled.groupby(['doc_idx', 'sent_idx']).agg({
        'gold_token': list,
        'gold_label': list
    }).reset_index()

    return grouped[['gold_token', 'gold_label']].rename(columns={
        'gold_token': 'words',
        'gold_label': 'labels'
    })

In [5]:
# Load the gtfintechlab/finer-ord dataset from the Hugging Face Hub
dataset = load_dataset("gtfintechlab/finer-ord")

# Names of the splits to export, and a dict the upload loop fills with split -> S3 URI
splits = ['train', 'validation', 'test']
s3_locations = {}

README.md:   0%|          | 0.00/5.34k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

val.csv:   0%|          | 0.00/135k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/336k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80531 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10233 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25957 [00:00<?, ? examples/s]

In [6]:
# One S3 client is enough for all uploads; creating it per split was redundant.
s3 = boto3.client('s3')

# Stage the parquet files in a temporary directory: it is removed automatically
# even if a conversion or upload fails, so no stale file is left behind.
with tempfile.TemporaryDirectory() as staging_dir:
    for split in splits:
        # Token-level rows -> one row per sentence (words / labels lists)
        df = pd.DataFrame(dataset[split])
        processed_df = group_by_sentence(df)

        # Write the split locally before uploading it
        local_path = os.path.join(staging_dir, f'{split}.parquet')
        processed_df.to_parquet(local_path)

        # Each split is stored as s3://<bucket>/data/<split>/data.parquet
        aws_path = f'data/{split}'
        s3.upload_file(local_path, sagemaker_session_bucket, f'{aws_path}/data.parquet')

        # Remember the S3 prefix of this split
        s3_locations[split] = f's3://{sagemaker_session_bucket}/{aws_path}'


## Uploading tokenized data with ProcessingJob

In [None]:
# Processor that runs preprocess.py inside a Hugging Face container.
# HuggingFaceProcessor / ProcessingInput / ProcessingOutput are imported in the
# notebook's import cell.
processor = HuggingFaceProcessor(
    role=role,
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    base_job_name='ner-preprocessing',
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39'
)

# S3 prefixes derived from the bucket configured in the Setup cell, so the
# bucket name lives in one place only.
input_data = f's3://{sagemaker_session_bucket}/data'
output_data = f's3://{sagemaker_session_bucket}/BERT-processed_data'

# Run processing job: the input prefix is made available at
# /opt/ml/processing/input and whatever the script writes to
# /opt/ml/processing/output is uploaded to `output_data`.
processor.run(
    code='preprocess.py',
    inputs=[
        ProcessingInput(
            source=input_data,
            destination='/opt/ml/processing/input'
        )
    ],
    outputs=[
        ProcessingOutput(
            output_name='processed_data',
            source='/opt/ml/processing/output',
            destination=output_data
        )
    ],
    # Command-line arguments handed to preprocess.py
    arguments=[
        '--input-data', '/opt/ml/processing/input',
        '--output-data', '/opt/ml/processing/output',
        '--model-id', 'bert-base-cased'
    ]
)

## Uploading tokenized data directly from notebook

In [3]:
# Setup logging: configure the root logger at INFO level and create a logger named after this module
# NOTE(review): `logger` is not used in the visible cells — confirm whether it is still needed
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [4]:
# S3 client (reused further down for the uploads) and the bucket holding the parquet files
s3 = boto3.client('s3')
bucket_name = 'finer-replication'

# Read the three sentence-level splits back from S3
train_df, val_df, test_df = [
    pd.read_parquet(f's3://{bucket_name}/data/{split_dir}/data.parquet')
    for split_dir in ('train', 'validation', 'test')
]

# Report how many rows each split contains
for split_label, split_df in (
    ('training', train_df),
    ('validation', val_df),
    ('test', test_df),
):
    print(f"Loaded {len(split_df)} {split_label} examples")

Loaded 3262 training examples
Loaded 402 validation examples
Loaded 1075 test examples


In [33]:
# Initialize the fast BERT tokenizer from the 'bert-base-cased' checkpoint
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')



In [11]:
# Initialize tokenizer (read as a module-level name by process_split below)
# NOTE(review): this repeats the previous cell's initialization verbatim; one of the two can be dropped
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def align_labels(labels, word_ids):
    """Expand word-level labels to one label per tokenizer output position.

    Args:
        labels: Sequence of integer label ids, one per word.
        word_ids: For every output position, the index of the word it came
            from, or ``None`` when the position belongs to no word.

    Returns:
        List with one label per entry of ``word_ids``: ``-100`` where the
        word id is ``None``, the word's own label for the first position of a
        word, and for repeated positions of the same word the label with any
        B- id (1, 3, 5) replaced by its I- counterpart (2, 4, 6).
    """
    begin2inside = {1: 2, 3: 4, 5: 6}  # B- to I- mapping
    token_labels = []
    previous = None

    for current in word_ids:
        if current is None:
            token_labels.append(-100)
        else:
            tag = labels[current]
            if current == previous:
                # Continuation piece of the same word: never repeat a B- tag
                tag = begin2inside.get(tag, tag)
            token_labels.append(tag)
        previous = current

    return token_labels

def process_split(df, desc):
    """Tokenize every sentence of a split and align its labels.

    Uses the module-level ``tokenizer`` and ``align_labels``.

    Args:
        df: DataFrame with a ``words`` and a ``labels`` column; each cell must
            support ``.tolist()`` (e.g. the numpy arrays produced by reading
            the parquet files).
        desc: Caption shown on the progress bar.

    Returns:
        List with one dict per row, holding ``input_ids``, ``attention_mask``
        and the aligned ``labels``.
    """
    examples = []
    progress = tqdm(df.iterrows(), total=len(df), desc=desc)

    for _, sentence in progress:
        # Cells arrive as array-likes; turn them into plain Python lists
        word_list = sentence['words'].tolist()
        label_list = sentence['labels'].tolist()

        # The sentence is already split into words; cap the length at 512 tokens
        encoding = tokenizer(
            word_list,
            is_split_into_words=True,
            truncation=True,
            max_length=512
        )

        examples.append({
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': align_labels(label_list, encoding.word_ids())
        })

    return examples


# Tokenize every split, in train -> validation -> test order
processed_train, processed_val, processed_test = [
    process_split(split_frame, f"Processing {split_caption}")
    for split_frame, split_caption in (
        (train_df, "train"),
        (val_df, "validation"),
        (test_df, "test"),
    )
]



Processing train:   0%|          | 0/3262 [00:00<?, ?it/s]

Processing validation:   0%|          | 0/402 [00:00<?, ?it/s]

Processing test:   0%|          | 0/1075 [00:00<?, ?it/s]

In [15]:
# Stage everything in a temporary directory: it is deleted automatically (also
# on failure), so no serialized splits or tokenizer files pile up on the
# notebook instance and fixed /tmp names cannot collide between runs.
with tempfile.TemporaryDirectory() as staging_dir:
    # Serialize each tokenized split with torch.save and upload it under its own prefix
    for split_prefix, examples in (
        ('train', processed_train),
        ('validation', processed_val),
        ('test', processed_test),
    ):
        local_file = os.path.join(staging_dir, f'processed_{split_prefix}.pt')
        torch.save(examples, local_file)
        s3.upload_file(
            local_file,
            bucket_name,
            f'BERT-processed_data/{split_prefix}/processed_data.pt'
        )

    # Save the tokenizer and upload whichever of the expected files it wrote
    tokenizer_dir = os.path.join(staging_dir, 'tokenizer')
    tokenizer.save_pretrained(tokenizer_dir)
    for filename in ['config.json', 'tokenizer.json', 'tokenizer_config.json', 'vocab.txt', 'special_tokens_map.json']:
        local_file = os.path.join(tokenizer_dir, filename)
        if os.path.exists(local_file):
            s3.upload_file(local_file, bucket_name, f'BERT-processed_data/{filename}')

print("All data processed and saved to S3!")

All data processed and saved to S3!


## Training Job

In [48]:
# Hyperparameters handed to the estimator for train.py
# NOTE(review): the train split has 3262 sentences, so one epoch at batch size 8
# is about 408 steps — fewer than warmup_steps=500. If train.py uses a
# step-based warmup, the learning rate never reaches 2e-5; confirm against train.py.
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 8,
    'eval_batch_size': 8,
    'learning_rate': 2e-5,
    'warmup_steps': 500
}

# Create HuggingFace estimator: runs train/train.py on a single ml.g4dn.xlarge
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='train',
    instance_type='ml.g4dn.xlarge',
    instance_count=1,
    role=role,
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    hyperparameters=hyperparameters,
    # Optional spot training (disabled):
    # use_spot_instances=True,  # Enable spot instances
    # max_wait=7200,  # Maximum time to wait for spot instances (in seconds)
    # max_run=3600,  # Maximum training time (in seconds)
)

# Start training on the tokenized data; the URI is derived from the bucket
# configured in the Setup cell instead of repeating the bucket name.
huggingface_estimator.fit({
    'training': f's3://{sagemaker_session_bucket}/BERT-processed_data'
})

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2024-11-14-22-06-20-450


2024-11-14 22:06:22 Starting - Starting the training job...
2024-11-14 22:06:36 Starting - Preparing the instances for training...
2024-11-14 22:07:04 Downloading - Downloading input data...
2024-11-14 22:07:29 Downloading - Downloading the training image...............
2024-11-14 22:10:27 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
  "cipher": algorithms.TripleDES,[0m
  "class": algorithms.TripleDES,[0m
[34m2024-11-14 22:10:38,800 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-11-14 22:10:38,819 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-11-14 22:10:38,833 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-11-14 22:10:38,836 sagemaker_pytorch_container.training 

## Deploying the model

In [49]:
# Environment variables for the inference container: HF_TASK is set to token classification
env = dict(HF_TASK='token-classification')

# Create a one-instance endpoint from the trained estimator
predictor = huggingface_estimator.deploy(
    instance_type='ml.g4dn.xlarge',
    initial_instance_count=1,
    env=env,
)

INFO:sagemaker:Creating model with name: huggingface-pytorch-training-2024-11-14-22-13-09-947
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-training-2024-11-14-22-13-09-947
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-training-2024-11-14-22-13-09-947


--------!

In [51]:
# Sample request: a single sentence under the "inputs" key
data = dict(inputs="Donald Trump is the United States president.")

In [52]:
# Send the sample payload to the deployed endpoint; the cell output lists the predicted entities
predictor.predict(data)

[{'entity': 'B-PER',
  'score': 0.8649447560310364,
  'index': 1,
  'word': 'Donald',
  'start': 0,
  'end': 6},
 {'entity': 'I-PER',
  'score': 0.7147530317306519,
  'index': 2,
  'word': 'Trump',
  'start': 7,
  'end': 12},
 {'entity': 'B-LOC',
  'score': 0.7302929162979126,
  'index': 5,
  'word': 'United',
  'start': 20,
  'end': 26},
 {'entity': 'I-LOC',
  'score': 0.7545739412307739,
  'index': 6,
  'word': 'States',
  'start': 27,
  'end': 33}]

In [47]:
# Tear down everything the deployment created, not just the endpoint.
# delete_model() goes first because it resolves the model through the endpoint
# configuration, which delete_endpoint() removes together with the endpoint.
predictor.delete_model()
predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: huggingface-pytorch-training-2024-11-14-21-58-08-584
INFO:sagemaker:Deleting endpoint with name: huggingface-pytorch-training-2024-11-14-21-58-08-584
