In [3]:
# !pip install awswrangler
# !pip install skforecast
# !pip install pmdarima

In [4]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


### Levantando ABT y leyendo los pkls

In [62]:
# Set up S3 client
client = boto3.client('s3')
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
path = 'ABTv3/Last_releases/' # Folder under analysis

ABTversion = 'v3'
path_ABT = f'ABT{ABTversion}/' 

In [6]:
# Specify the CSV file key
csv_key = None

# List objects in the S3 path
response = client.list_objects(Bucket=bucket_name, Prefix=path_ABT)

# Find the CSV file in the S3 path
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.csv'):
        csv_key = obj['Key']
        break
        
# Check if CSV file is found
if csv_key is not None:
    # Read CSV content from S3
    csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
    csv_content = csv_response['Body'].read().decode('utf-8')

    # Transform CSV content to DataFrame
    data = pd.read_csv(StringIO(csv_content))
    data['date'] = pd.to_datetime(data['date']).dt.date
    print("CSV file loaded")
else:
    print("No CSV file found in the specified S3 path.")

CSV file loaded


In [7]:
#data = data[data['date'] < pd.to_datetime('2023-12-25').date()]

In [8]:
elements = client.list_objects(Bucket=bucket_name, Prefix=path)

#Listing pkl files
pkl_files = [obj['Key'] for obj in elements.get('Contents', []) if obj['Key'].endswith('.pkl')
                and ('MODEL_2d_' in obj['Key']) and not obj['Key'].endswith('_2d.pkl')]

In [9]:
len(pkl_files)

130

### Prueba con 1 payer

In [10]:
pkl_files[6]

'ABTv3/Last_releases/ARGENPER_ARGENTINA/MODEL_2d_ARGENPER_ARGENTINA.pkl'

In [11]:
file_key = pkl_files[6]

# Descargar el archivo pkl desde S3 y cargarlo en memoria
response = client.get_object(Bucket=bucket_name, Key=file_key)
buffer = BytesIO(response['Body'].read())
forecaster = joblib.load(buffer)

In [12]:
payer = 'ARGENPER_ARGENTINA'

#### PAYER SETTING ####
datos = data.loc[data.payer_country==payer]
datos['date'] = pd.to_datetime(datos['date'])
datos.set_index('date', inplace=True)
datos = datos.asfreq('D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datos['date'] = pd.to_datetime(datos['date'])


In [13]:
last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
test_date = pd.Timestamp('2023-12-25') # El primer dia de test seria el dia a predecir

data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)]
data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)

data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)] 
data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)


In [14]:
predictions = forecaster.predict(
                  steps            = 2,
                  exog             = data_test[forecaster.exog_col_names],
                  last_window      = data_last_window['amount'],
                  last_window_exog = data_last_window[forecaster.exog_col_names]
              )

In [15]:
predictions

2023-12-25    485.241489
2023-12-26    308.361684
Freq: D, Name: pred, dtype: float64

In [16]:
pd.DataFrame(predictions, columns=['pred']).reset_index()

Unnamed: 0,index,pred
0,2023-12-25,485.241489
1,2023-12-26,308.361684


### Bucle para procesar todos

In [17]:
# Initialize an empty DataFrame to store the results
df_temp = pd.DataFrame(columns=['date', 'pred', 'payer_country', 'model'])

In [18]:
df_temp.empty

True

In [None]:
i = 1

# Iterate over pkl files
for file_key in pkl_files:
    # Extract payer_country from file_key
    payer_country = file_key.split('/')[2]
    print(payer_country)
    
    # Download pkl file from S3 and load it into memory
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)
    
    #### PAYER SETTING ####
    # Filter data for the specific payer_country
    datos = data[data['payer_country'] == payer_country].copy()
#    datos = data.loc[data.payer_country == payer_country]
    datos['date'] = pd.to_datetime(datos['date'])
    datos.set_index('date', inplace=True)
    datos = datos.asfreq('D')

    # Predictions settings
    last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
    test_date = pd.Timestamp('2023-12-16')  # The first test day would be the day to predict

    # Extract data for last window and test period
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy()
#    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)]
    data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)
    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)].copy()
#    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)] 
    data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)

    try:
        
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
        # Make predictions
        predictions = forecaster.predict(
                          steps            = 2,
                          exog             = data_test[forecaster.exog_col_names],
                          last_window      = data_last_window['amount'],
                          last_window_exog = data_last_window[forecaster.exog_col_names]
                      )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()
#        print(df_temp)

    except:
        # If an exception occurs, set predictions to zero
        print("\033[1;31m" + f"Error processing {payer_country}" + "\033[0m")
        predictions = [0, 0]
        df_temp = pd.DataFrame({'index': [test_date, test_date + pd.Timedelta(days=1)], 'pred': predictions})


    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]

    # Concatenate df_temp with the main DataFrame
    if i:
        temp_df = df_temp.copy()
        i = 0
    else:
        temp_df = pd.concat([temp_df, df_temp], ignore_index=True)

24XORO_MEXICO
[1;31mError processing 24XORO_MEXICO[0m
ABANK (TN)_EL SALVADOR
[1;31mError processing ABANK (TN)_EL SALVADOR[0m
AFEX_CHILE


In [95]:
len(temp_df)

260

In [96]:
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model
48,2023-12-25,-148947.677845,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl
54,2023-12-25,-67191.858226,BANCO RENDIMENTO_BRAZIL,MODEL_2d_BANCO RENDIMENTO_BRAZIL.pkl
58,2023-12-25,-19169.543009,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl
70,2023-12-25,-26701.657343,BANRURAL (HONDURAS)_HONDURAS,MODEL_2d_BANRURAL (HONDURAS)_HONDURAS.pkl
108,2023-12-25,-12963.753813,ELEKTRA (BTS)_GUATEMALA,MODEL_2d_ELEKTRA (BTS)_GUATEMALA.pkl
110,2023-12-25,-95825.648347,ELEKTRA (BTS)_HONDURAS,MODEL_2d_ELEKTRA (BTS)_HONDURAS.pkl
147,2023-12-26,-6.873834,KORI_BENIN,MODEL_2d_KORI_BENIN.pkl


In [97]:
# Replace values in 'pred' column with 0 where 'pred' is less than 0
temp_df.loc[temp_df['pred'] < 0, 'pred'] = 0

In [98]:
# Convert 'index' column to date format
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date

In [99]:
# # Finding the minimum and maximum dates in the 'index' column
# min_date = temp_df['index'].min()
# max_date = temp_df['index'].max()

In [100]:
# # Adding suffixes to the minimum and maximum dates
# temp_df.loc[temp_df['index'] == min_date, 'index'] = min_date.strftime('%Y-%m-%d') + '_pred_day_0'
# temp_df.loc[temp_df['index'] == max_date, 'index'] = max_date.strftime('%Y-%m-%d') + '_pred_day_1'

In [101]:
temp_df.head()

Unnamed: 0,index,pred,payer_country,model
0,2023-12-25,36947.319758,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
1,2023-12-26,43751.589258,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
2,2023-12-25,9590.269894,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
3,2023-12-26,8998.724809,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
4,2023-12-25,5632.096572,AFEX_CHILE,MODEL_2d_AFEX_CHILE.pkl


In [105]:
temp_df.rename(columns={'index':'pred_date'}, inplace=True)

In [82]:
#df_pivot = pd.pivot_table(temp_df, index=['payer_country','model'], columns='index', values='pred', aggfunc='first', fill_value=None).reset_index()

In [27]:
## LEVANTO ULTIMA ABT ACTUALIZADA
prefix = 'abt_parquet/'

today = datetime.now().date()
yesterday = today - timedelta(days=1)

today_folder = 'dt=' + today.strftime('%Y-%m-%d') + '/'
yesterday_folder = 'dt=' + yesterday.strftime('%Y-%m-%d') + '/'

In [34]:
# objects = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix+yesterday_folder)

In [35]:
import s3fs
import pyarrow.parquet as pq

# # Check if objects were found
# if 'Contents' in objects:
#     # Get the Parquet file path within the folder
#     file_key = objects['Contents'][0]['Key']
    
#     # Create an S3 FileSystem client
#     s3_file_system = s3fs.S3FileSystem()

#     # Read the Parquet file using pyarrow
#     with s3_file_system.open(f's3://{bucket_name}/{file_key}', mode='rb') as file:
#         parquet_table = pq.read_table(file)
    
#     # Convert the Parquet table to a DataFrame (optional)
#     df = parquet_table.to_pandas()
# else:
#     print(f"No Parquet files found in the folder {prefix+yesterday_folder}.")

In [36]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check_gp'

In [37]:
df = wr.athena.read_sql_table(
    table=table_name,
    database=database_name,
)

awswrangler.athena._utils INFO  Created CTAS table "analytics"."temp_table_94197ef74104473995881fd637f526f9"


In [98]:
#parquet_table.to_pandas()

In [101]:
#df.isnull().sum().to_list()

In [44]:
df['payer_country'] = df['payer'] + '_' + df['country']

In [46]:
df_id = df[['payer_country', 'id_main_branch', 'id_country']].drop_duplicates().dropna(subset='id_main_branch')

In [47]:
#df_id.to_excel('df_id.xlsx')

In [48]:
len(df_id)

275

In [49]:
len(df['payer_country'].unique())

275

In [110]:
temp_df

Unnamed: 0,pred_date,pred,payer_country,model
0,2023-12-25,36947.319758,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
1,2023-12-26,43751.589258,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
2,2023-12-25,9590.269894,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
3,2023-12-26,8998.724809,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
4,2023-12-25,5632.096572,AFEX_CHILE,MODEL_2d_AFEX_CHILE.pkl
...,...,...,...,...
255,2023-12-26,0.000000,ZEEPAY_CAMEROON,MODEL_2d_ZEEPAY_CAMEROON.pkl
256,2023-12-25,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
257,2023-12-26,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
258,2023-12-25,0.000000,ZEEPAY_GHANA,MODEL_2d_ZEEPAY_GHANA.pkl


In [159]:
#sorted(df['payer_country'].unique())

In [111]:
df_final = pd.merge(temp_df, df_id, on='payer_country', how='left')

In [121]:
# Insertar la columna 'processing_date' al principio del DataFrame
df_final.insert(0, 'processing_date', today)

In [113]:
df_final['id_country'].isnull().sum()

0

In [114]:
df_final['id_main_branch'].isnull().sum()

0

In [115]:
### Workaround > Elimino duplicados
#df_final = df_final.drop_duplicates(subset=['payer_country'], keep='first')

In [116]:
df_final.columns

Index(['pred_date', 'pred', 'payer_country', 'model', 'id_main_branch',
       'id_country', 'processing_date'],
      dtype='object')

In [122]:
df_final

Unnamed: 0,processing_date,pred_date,pred,payer_country,model,id_main_branch,id_country
0,2024-03-20,2023-12-25,36947.319758,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl,T314,MEX
1,2024-03-20,2023-12-26,43751.589258,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl,T314,MEX
2,2024-03-20,2023-12-25,9590.269894,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl,T282,ELS
3,2024-03-20,2023-12-26,8998.724809,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl,T282,ELS
4,2024-03-20,2023-12-25,5632.096572,AFEX_CHILE,MODEL_2d_AFEX_CHILE.pkl,T089,CHI
...,...,...,...,...,...,...,...
255,2024-03-20,2023-12-26,0.000000,ZEEPAY_CAMEROON,MODEL_2d_ZEEPAY_CAMEROON.pkl,T280,CMR
256,2024-03-20,2023-12-25,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl,T280,CIV
257,2024-03-20,2023-12-26,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl,T280,CIV
258,2024-03-20,2023-12-25,0.000000,ZEEPAY_GHANA,MODEL_2d_ZEEPAY_GHANA.pkl,T280,GHA


In [126]:
# Splitting 'payer' & 'country'
df_final[['payer', 'country']] = df_final['payer_country'].str.split('_', expand=True)

In [124]:
# df_final[['processing_date', 'payer_country','payer', 'country', 'id_main_branch', 'id_country',
#     '2023-12-25_pred_day_0','2023-12-26_pred_day_1' ]]

In [125]:
#df_final.to_excel('df_pred.xlsx')

In [127]:
import sagemaker

# Obtener el IAM role de SageMaker
role = sagemaker.get_execution_role()
role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


'arn:aws:iam::283731589572:role/service-role/SageMaker-ExecutionRole-20240102T104128'