## LightGBM de SageMaker

In [17]:
from sagemaker import image_uris, model_uris, script_uris

train_model_id, train_model_version, train_scope = "lightgbm-classification-model", "*", "training"
training_instance_type = "ml.m5.12xlarge"

In [18]:
# Retrieve the docker image
train_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    model_id=train_model_id,
    model_version=train_model_version,
    image_scope=train_scope,
    instance_type=training_instance_type
)

# Retrieve the training script
train_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope=train_scope
)

train_model_uri = model_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, model_scope=train_scope
)

Using model 'lightgbm-classification-model' with wildcard version identifier '*'. You can pin to version '2.1.0' for more stable results. Note that models may have different input/output signatures after a major version upgrade.


In [19]:
# Buckets para Train data y Output
training_data_bucket = f"viamericas-datalake-dev-us-east-1-283731589572-raw/FraudModel/Data4Model"
#training_data_prefix = "training-datasets/tabular_multiclass/"

training_dataset_s3_path = f"s3://{training_data_bucket}/Train" #/exp1.parquet 
validation_dataset_s3_path = f"s3://{training_data_bucket}/Validation" #/exp1.parquet 

output_bucket = f"viamericas-datalake-dev-us-east-1-283731589572-analytics"
output_prefix = "FraudModel"

s3_output_location = f"s3://{output_bucket}/{output_prefix}/output"

In [20]:
# Defino hiperparametros
from sagemaker import hyperparameters

# Retrieve the default hyperparameters for training the model
hyperparameters = hyperparameters.retrieve_default(
    model_id=train_model_id, model_version=train_model_version
)

# [Optional] Override default hyperparameters with custom values
hyperparameters[
    "num_boost_round"
] = "500"
print(hyperparameters)

{'num_boost_round': '500', 'early_stopping_rounds': '30', 'metric': 'auto', 'learning_rate': '0.009', 'num_leaves': '67', 'feature_fraction': '0.74', 'bagging_fraction': '0.53', 'bagging_freq': '5', 'max_depth': '11', 'min_data_in_leaf': '26', 'max_delta_step': '0.0', 'lambda_l1': '0.0', 'lambda_l2': '0.0', 'boosting': 'gbdt', 'min_gain_to_split': '0.0', 'scale_pos_weight': '1.0', 'tree_learner': 'serial', 'feature_fraction_bynode': '1.0', 'is_unbalance': 'False', 'max_bin': '255', 'num_threads': '0', 'verbosity': '1', 'use_dask': 'False'}


In [21]:
from sagemaker.estimator import Estimator
from sagemaker.utils import name_from_base

training_job_name = name_from_base(f"built-in-algo-{train_model_id}-training2")

# Create SageMaker Estimator instance
tabular_estimator = Estimator(
    role="arn:aws:iam::283731589572:role/service-role/AmazonSageMaker-ExecutionRole-20231127T122316",
    image_uri=train_image_uri,
    source_dir=train_source_uri,
    model_uri=train_model_uri,
    entry_point="transfer_learning.py",
    instance_count=1, # for distributed training, specify an instance_count greater than 1
    instance_type=training_instance_type,
    max_run=360000,
    hyperparameters=hyperparameters,
    output_path=s3_output_location
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [None]:
import sagemaker

# Specify the location of your input data in S3
train_data = training_dataset_s3_path
validation_data = validation_dataset_s3_path

# Set input data channels
train_channel = sagemaker.inputs.TrainingInput(train_data, content_type='application/x-parquet')
validation_channel = sagemaker.inputs.TrainingInput(validation_data, content_type='application/x-parquet')

data_channels = {'train': train_channel, 'validation': validation_channel}

# Launch a SageMaker Training job by passing the S3 path of the training data
tabular_estimator.fit(
    {
        "train": train_channel,
        "validation": validation_channel,
    }, logs=True, job_name=training_job_name
)


INFO:sagemaker:Creating training-job with name: built-in-algo-lightgbm-classification-m-2024-03-12-14-56-09-133


2024-03-12 14:56:12 Starting - Starting the training job...
2024-03-12 14:56:28 Starting - Preparing the instances for training...
2024-03-12 14:57:01 Downloading - Downloading input data...
2024-03-12 14:57:31 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2024-03-12 14:57:58,907 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2024-03-12 14:57:58,908 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-03-12 14:57:58,917 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2024-03-12 14:57:58,919 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2024-03-12 14:57:59,439 sagemaker-training-toolkit INFO     Installing dependencies from requirements.txt:[0m


## Test model

In [None]:
import pandas as pd
s3_url = f"s3://viamericas-datalake-dev-us-east-1-283731589572-raw/FraudModel/Data4Model/Test/Test.parquet"

dtest = pd.read_parquet(s3_url)

In [None]:
import sagemaker
import joblib 
import tarfile
import boto3

# Set up S3 client
client = boto3.client('s3')
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
path = 'FraudModel/output/built-in-algo-lightgbm-classification-m-2024-03-12-14-56-09-133/output/' # Folder under analysis

In [4]:
tar_file_key = 'FraudModel/output/built-in-algo-lightgbm-classification-m-2024-03-12-14-56-09-133/output/model.tar.gz'

responde = client.get_object(Bucket=bucket_name, Key=tar_file_key)

In [5]:
tar_bytes = responde['Body'].read()

In [12]:
import io

fileobj = io.BytesIO(tar_bytes)
tarf = tarfile.open(fileobj=fileobj)

tarf.extractall()

In [23]:
!pip install lightgbm

Collecting lightgbm
  Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Using cached lightgbm-4.3.0-py3-none-manylinux_2_28_x86_64.whl (3.1 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.3.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [26]:
pip install --upgrade pandas "dask[complete]"

Collecting pandas
  Using cached pandas-2.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting dask[complete]
  Downloading dask-2024.3.0-py3-none-any.whl.metadata (3.8 kB)
Collecting importlib-metadata>=4.13.0 (from dask[complete])
  Using cached importlib_metadata-7.0.2-py3-none-any.whl.metadata (4.6 kB)
Collecting pyarrow-hotfix (from dask[complete])
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting lz4>=4.3.2 (from dask[complete])
  Using cached lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting distributed==2024.3.0 (from dask[complete])
  Downloading distributed-2024.3.0-py3-none-any.whl.metadata (3.4 kB)
Collecting dask-expr<1.1,>=1.0 (from dask[complete])
  Downloading dask_expr-1.0.1-py3-none-any.whl.metadata (2.4 kB)
Collecting zict>=3.0.0 (from distributed==2024.3.0->dask[complete])
  Using cached zict-3.0.0-py2.py3-none-any.whl.metadata (899 bytes)
Using cached

In [27]:
import lightgbm

In [None]:

#model_url = f"viamericas-datalake-dev-us-east-1-283731589572-analytics/FraudModel/output/built-in-algo-lightgbm-classification-m-2024-03-12-14-56-09-133/output"
#model_file_path = f"s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/FraudModel/output/built-in-algo-lightgbm-classification-m-2024-03-12-14-56-09-133/output/model.tar.gz"


#t = tarfile.open(f"s3://{bucket_name}/{path}/model.tar.gz", 'r:gz')
#t = tarfile.open('model.tar.gz', 'r:gz')
#t.extractall()

model = joblib.load('model.pkl')

# prediction with test data
# dtest should be a pandas DataFrame with column names feature_0, feature_1, ..., feature_d
pred = model.predict(dtest.loc[:, dtest.columns != 'target_fraudes'],) 

In [None]:
# Retrieve the inference docker container uri
deploy_image_uri = image_uris.retrieve(
    region=None,
    framework=None,
    image_scope="inference",
    model_id=train_model_id,
    model_version=train_model_version,
    instance_type=inference_instance_type,
)
# Retrieve the inference script uri
deploy_source_uri = script_uris.retrieve(
    model_id=train_model_id, model_version=train_model_version, script_scope="inference"
)

endpoint_name = name_from_base(f"jumpstart-example-{train_model_id}-")

# Use the estimator from the previous step to deploy to a SageMaker endpoint
predictor = (tuner if use_amt else tabular_estimator).deploy(
    initial_instance_count=1,
    instance_type=inference_instance_type,
    entry_point="inference.py",
    image_uri=deploy_image_uri,
    source_dir=deploy_source_uri,
    endpoint_name=endpoint_name,
)