In [35]:
import boto3
from io import StringIO
import pandas as pd

In [36]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker import image_uris

In [37]:
# Specify S3 bucket and path
region = sagemaker.Session().boto_region_name
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
file_name_abt_train = 'abt_train.csv'
file_name_abt_test = 'abt_test.csv'

path_abt_train = f's3://{bucket_name}/Forecast/data_full/{file_name_abt_train}'
path_abt_test = f's3://{bucket_name}/Forecast/data_full/{file_name_abt_test}'
output_s3_path = 's3://viamericas-datalake-dev-us-east-1-283731589572-analytics/Forecast/data_full/'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [38]:
df_train = pd.read_csv(path_abt_train)

In [39]:
len(df_train.columns)

27

In [59]:
df_train.head(3)

Unnamed: 0,date,payer,country,amount,var_rate_lag_1,var_rate_lag_2,var_rate_lag_3,var_rate_lag_4,var_rate_lag_5,var_rate_lag_6,...,payer_country_encoder,payer_country,var_tx_cancelled_lag_1,var_tx_cancelled_lag_2,var_tx_cancelled_lag_3,var_tx_cancelled_lag_4,var_tx_cancelled_lag_5,var_tx_cancelled_lag_6,weekend,special_dates
0,2023-02-24,NAFA,"GAMBIA, THE",186.0,0.0,0.0,0.0,0.0,0.0,0.0,...,186.0,"NAFA_GAMBIA, THE",0.0,0.0,0.0,0.0,-1.0,-1.0,1,0
1,2022-04-15,BANCO ATLANTIDA,HONDURAS,294352.41,0.0,0.0,0.0,0.0,0.0,0.0,...,16.0,BANCO ATLANTIDA_HONDURAS,-18.0,-12.0,0.0,-16.0,17.0,6.0,1,0
2,2023-02-19,NAFA,MALI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,190.0,NAFA_MALI,0.0,0.0,0.0,0.0,0.0,0.0,1,0


In [60]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106496 entries, 0 to 106495
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   date                    106496 non-null  object 
 1   payer                   106496 non-null  object 
 2   country                 106496 non-null  object 
 3   amount                  106496 non-null  float64
 4   var_rate_lag_1          106496 non-null  float64
 5   var_rate_lag_2          106496 non-null  float64
 6   var_rate_lag_3          106496 non-null  float64
 7   var_rate_lag_4          106496 non-null  float64
 8   var_rate_lag_5          106496 non-null  float64
 9   var_rate_lag_6          106496 non-null  float64
 10  var_rate_lag_7          106496 non-null  float64
 11  var_rate_lag_8          106496 non-null  float64
 12  var_rate_lag_9          106496 non-null  float64
 13  var_rate_lag_10         106496 non-null  float64
 14  var_rate_lag_11     

In [62]:
pd.Timestamp(df_train['date'].min(), freq=freq)

TypeError: __new__() got an unexpected keyword argument 'freq'

In [63]:
#Set up SageMaker session and role
sagemaker_session = sagemaker.Session()
#role = 'arn:aws:iam::283731589572:role/service-role/AmazonForecast-ExecutionRole-1704319864386'
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [64]:
freq = 'D'
start_dataset = pd.Timestamp(df_train['date'].min())
end_training = pd.Timestamp(df_train['date'].max())

In [65]:
# Variables dinámicas, categóricas y el objetivo
dynamic_features = df_train[['var_rate_lag_1', 'var_rate_lag_2', 'var_rate_lag_3', 'var_rate_lag_4', 'var_rate_lag_5',
       'var_rate_lag_6', 'var_rate_lag_7', 'var_rate_lag_8', 'var_rate_lag_9', 'var_rate_lag_10', 'var_rate_lag_11', 
       'var_rate_lag_12', 'var_rate_lag_13','var_tx_cancelled_lag_1', 'var_tx_cancelled_lag_2', 'var_tx_cancelled_lag_3', 
       'var_tx_cancelled_lag_4', 'var_tx_cancelled_lag_5', 'var_tx_cancelled_lag_6']]
categorical_features = df_train[['payer', 'country', 'payer_country_encoder', 'payer_country']]
target = df_train['amount']

In [66]:
# Cardinality > Auto
cardinality_payer_country = pd.read_csv(path_abt_train)['payer_country'].nunique()

In [67]:
# URI Image for AMZ DeepAR
image_name = sagemaker.image_uris.retrieve("forecasting-deepar", region)

In [68]:
# 
hyperparameters = {
    "time_freq": "D",               # Daily frequency
    "epochs": 50,                    # Number of training epochs
    "context_length": 30,            # Length of the context (historical) data
    "prediction_length": 10          # Length of the predicted future data
}

In [69]:
# DeepAR Settings
estimator = sagemaker.estimator.Estimator(
    image_uri=image_name,
    sagemaker_session=sagemaker_session,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    base_job_name='deepar-forecast',
    output_path= output_s3_path,
    use_feat_dynamic_real=True,
    use_feat_static_cat=True,
    use_feat_static_real=False,
    cardinality=[cardinality_payer_country], # Payer_country diff combinations
    num_dynamic_feat=dynamic_features.shape[1],  # Exog
)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [70]:
hyperparameters = {
    'time_freq': 'D',
    'epochs': '50',
    'early_stopping_patience': '40',
    'mini_batch_size': '64',
    'learning_rate': '5E-4',
    'context_length': '14',  # Longitud del contexto, 14 dias maso lo que vimos en el EDA
    'prediction_length': '10'  # Longitud de la predicción (si fuese una frec por hora, seria 240
}

In [71]:
estimator.set_hyperparameters(**hyperparameters)

In [None]:
# from sagemaker.tuner import HyperparameterTuner, IntegerParameter, CategoricalParameter, ContinuousParameter
# hyperparameter_ranges = {'optimizer': CategoricalParameter(['sgd', 'Adam']),
#                          'learning_rate': ContinuousParameter(0.01, 0.2),
#                          'num_epoch': IntegerParameter(10, 50)}

In [65]:
# Set up S3 client
#client = boto3.client('s3')

In [73]:
train_dataset = {
    "start": df_train['date'].min(),
    "target": target,
    "dynamic_feat": [dynamic_features.values.tolist()],
    "cat": [categorical_features.values.tolist()]
}


In [None]:
# Train the DeepAR model with the training data
estimator.fit({'train': path_abt_train})

INFO:sagemaker:Creating training-job with name: deepar-forecast-2024-01-31-15-49-04-499


2024-01-31 15:49:04 Starting - Starting the training job...
2024-01-31 15:49:29 Starting - Preparing the instances for training.........
2024-01-31 15:50:59 Downloading - Downloading input data...
2024-01-31 15:51:29 Downloading - Downloading the training image.....................
2024-01-31 15:54:45 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34mRunning custom environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[01/31/2024 15:55:06 INFO 140519086802752] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'_kvstore': 'auto', '_num_gpus': 'auto', '_num_kv_servers': 'auto', '_tuning_objective_metric': '', 'cardinality': 'auto', 'dropout_rate': '0.10', 'early_stopping_patience': '', 'embedding_dimension': '10', 'learning_rate': '0.001', 'likeli

In [None]:
# Make predictions using the deployed predictor
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name="deepAR_v1" 
)

In [None]:
test_data = pd.read_csv(path_abt_test)
predictor.predict(test_data)

In [None]:
import matplotlib.pyplot as plt

# Graficar predicciones y valores reales
plt.plot(true_values.index, true_values, label='True Values')
plt.plot(predictions.index, predictions['mean'], label='Predictions', color='red')
plt.xlabel('Fecha')
plt.ylabel('Amount')
plt.legend()
plt.show()
