In [3]:
# !pip install awswrangler
# !pip install skforecast
# !pip install pmdarima

In [4]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


### Levantando ABT y leyendo los pkls

In [6]:
# S3 connection and the bucket every read below goes through
client = boto3.client("s3")
bucket_name = "viamericas-datalake-dev-us-east-1-283731589572-analytics"

# Prefix scanned for the ABT CSV, derived from the ABT version
ABTversion = "v3"
path_ABT = "ABT" + ABTversion + "/"

# Prefix scanned for the model .pkl files (folder under analysis)
path = "ABTv3/Tableau/"

In [7]:
# Locate the first CSV object under the ABT prefix and load it into `data`.
csv_key = None

# A single list call returns at most 1,000 keys, so walk every page of the
# listing until a CSV key turns up instead of inspecting only the first page.
paginator = client.get_paginator('list_objects_v2')
for response in paginator.paginate(Bucket=bucket_name, Prefix=path_ABT):
    csv_key = next(
        (obj['Key'] for obj in response.get('Contents', []) if obj['Key'].endswith('.csv')),
        None,
    )
    if csv_key is not None:
        break

# Fail loudly: printing and carrying on would leave `data` undefined (or stale
# from an earlier kernel run) and every later cell would fail or mislead.
if csv_key is None:
    raise FileNotFoundError(
        f"No CSV file found in the specified S3 path: s3://{bucket_name}/{path_ABT}"
    )

# Read CSV content from S3
csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
csv_content = csv_response['Body'].read().decode('utf-8')

# Transform CSV content to DataFrame; 'date' is kept as plain datetime.date objects
data = pd.read_csv(StringIO(csv_content))
data['date'] = pd.to_datetime(data['date']).dt.date
print("CSV file loaded")

CSV file loaded


In [8]:
# Sanity check: total of the 'amount' column across the loaded ABT.
data['amount'].sum()

26753064533.761703

In [7]:
#data = data[data['date'] < pd.to_datetime('2023-12-25').date()]

In [9]:
# Listing pkl files: keep keys that end in '.pkl', contain 'MODEL_2d_' and do
# not end in '_2d.pkl'.
# A single list call returns at most 1,000 keys, so page through the whole
# prefix; otherwise models beyond the first page would be silently skipped.
paginator = client.get_paginator('list_objects_v2')
pkl_files = []
for elements in paginator.paginate(Bucket=bucket_name, Prefix=path):
    pkl_files.extend(
        obj['Key'] for obj in elements.get('Contents', [])
        if obj['Key'].endswith('.pkl')
        and ('MODEL_2d_' in obj['Key'])
        and not obj['Key'].endswith('_2d.pkl')
    )

In [10]:
# How many model files matched the filter above.
len(pkl_files)

5

### Bucle para procesar todos

In [11]:
# Initialize an empty DataFrame to store the results
# NOTE(review): the prediction loop rebinds df_temp on every iteration, so this
# empty frame (and its 'date' column name) is never actually appended to.
df_temp = pd.DataFrame(columns=['date', 'pred', 'payer_country', 'model'])

In [14]:
# Run every 2-day model against the ABT and gather the forecasts in `temp_df`
# (columns: index, pred, payer_country, model).

# Loop-invariant: the first test day is the day to predict.
test_date = pd.Timestamp('2023-09-08')

# One small frame per model file; concatenated once after the loop instead of
# growing a DataFrame inside it.
prediction_frames = []

# Iterate over pkl files
for file_key in pkl_files:
    # payer_country is the third '/'-separated component of the object key
    payer_country = file_key.split('/')[2]
    print(payer_country)

    # Download pkl file from S3 and load it into memory
    # NOTE(review): joblib.load unpickles arbitrary objects and can execute
    # code; only load model files from a bucket you trust.
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)

    #### PAYER SETTING ####
    # Filter data for the specific payer_country and index it by calendar day
    datos = data[data['payer_country'] == payer_country].copy()
    datos['date'] = pd.to_datetime(datos['date'])
    datos.set_index('date', inplace=True)
    datos = datos.asfreq('D')

    exog_cols = forecaster.exog_col_names

    # The last window starts the day after the forecaster's stored window ends
    # and runs up to the day before test_date.
    last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy()
    data_last_window[exog_cols] = data_last_window[exog_cols].fillna(0)

    # Exogenous values for the two days being predicted
    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)].copy()
    data_test[exog_cols] = data_test[exog_cols].fillna(0)

    try:
        # predict() must run INSIDE the catch_warnings block; the filter is
        # restored as soon as the block exits.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            predictions = forecaster.predict(
                steps=2,
                exog=data_test[exog_cols],
                last_window=data_last_window['amount'],
                last_window_exog=data_last_window[exog_cols],
            )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()
    except Exception as exc:
        # Best effort: report the failure (with its cause) and fall back to
        # zero predictions. KeyboardInterrupt/SystemExit are no longer swallowed.
        print("\033[1;31m" + f"Error processing {payer_country}" + "\033[0m")
        print(f"    {type(exc).__name__}: {exc}")
        predictions = [0, 0]
        df_temp = pd.DataFrame({'index': [test_date, test_date + pd.Timedelta(days=1)], 'pred': predictions})

    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]

    prediction_frames.append(df_temp)

# Build the combined frame once; stays well-defined even with no model files.
if prediction_frames:
    temp_df = pd.concat(prediction_frames, ignore_index=True)
else:
    temp_df = pd.DataFrame(columns=['index', 'pred', 'payer_country', 'model'])

BANCO INDUSTRIAL_GUATEMALA
BANCOPPEL (APPRIZA)_MEXICO
BANRURAL (RYT)_GUATEMALA
BBVA - BANCOMER (BTS)_MEXICO
ELEKTRA (MEXICO)_MEXICO


In [15]:
# Number of prediction rows collected (the loop requests two steps per model file).
len(temp_df)

10

In [16]:
# Inspect any rows whose forecast is negative.
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model


In [17]:
# Floor the forecasts at zero: any negative 'pred' becomes 0, everything else is untouched
temp_df['pred'] = temp_df['pred'].mask(temp_df['pred'] < 0, 0)

In [18]:
# Convert 'index' column to date format
# (the time component is dropped: values become datetime.date objects).
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date

In [19]:
# Show the collected predictions.
temp_df

Unnamed: 0,index,pred,payer_country,model
0,2023-09-08,3372540.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl
1,2023-09-09,4034526.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl
2,2023-09-08,7579385.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl
3,2023-09-09,7809040.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl
4,2023-09-08,4733540.0,BANRURAL (RYT)_GUATEMALA,MODEL_2d_BANRURAL (RYT)_GUATEMALA.pkl
5,2023-09-09,5681799.0,BANRURAL (RYT)_GUATEMALA,MODEL_2d_BANRURAL (RYT)_GUATEMALA.pkl
6,2023-09-08,1173953.0,BBVA - BANCOMER (BTS)_MEXICO,MODEL_2d_BBVA - BANCOMER (BTS)_MEXICO.pkl
7,2023-09-09,1076527.0,BBVA - BANCOMER (BTS)_MEXICO,MODEL_2d_BBVA - BANCOMER (BTS)_MEXICO.pkl
8,2023-09-08,11618430.0,ELEKTRA (MEXICO)_MEXICO,MODEL_2d_ELEKTRA (MEXICO)_MEXICO.pkl
9,2023-09-09,12485430.0,ELEKTRA (MEXICO)_MEXICO,MODEL_2d_ELEKTRA (MEXICO)_MEXICO.pkl


In [20]:
# The 'index' column holds the forecasted day; give it an explicit name
temp_df = temp_df.rename(columns={'index': 'pred_date'})

In [21]:
## Load the latest refreshed ABT
# Build the 'dt=YYYY-MM-DD/' folder names for today and yesterday
prefix = "abt_parquet/"

today = datetime.now().date()
yesterday = today - timedelta(days=1)

today_folder = f"dt={today.isoformat()}/"
yesterday_folder = f"dt={yesterday.isoformat()}/"

In [22]:
# objects = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix+yesterday_folder)

In [24]:
# Athena / Glue catalog settings
# NOTE(review): this rebinds `bucket_name` from the bare analytics bucket name to
# an s3:// URI for a different bucket, so re-running the earlier S3 cells after
# this one would pass the URI as Bucket. Consider a dedicated name.
bucket_name = "s3://viamericas-datalake-dev-us-east-1-283731589572-athena/"
origin_name = "AwsDataCatalog"
database_name = "analytics"
table_name = "daily_check_gp"

In [25]:
# Pull the whole table from Athena into a DataFrame
df = wr.athena.read_sql_table(table=table_name, database=database_name)

awswrangler.athena._utils INFO  Created CTAS table "analytics"."temp_table_c84e114610db4add813de55eabcc58c1"


In [26]:
# Build the '<payer>_<country>' key that the predictions are later merged on.
df['payer_country'] = df['payer'] + '_' + df['country']

In [27]:
# Lookup of identifiers per payer_country: distinct rows only, and only those
# that actually carry an id_main_branch
df_id = (
    df.loc[:, ['payer_country', 'id_main_branch', 'id_country']]
    .drop_duplicates()
    .dropna(subset=['id_main_branch'])
)

In [28]:
# Number of distinct (payer_country, id_main_branch, id_country) rows.
len(df_id)

275

In [29]:
# Attach id_main_branch / id_country to every prediction row.
df_final = pd.merge(temp_df, df_id, on='payer_country', how='left')

# A left merge repeats a prediction once per matching df_id row, so a
# payer_country with several id mappings would silently duplicate forecasts.
if len(df_final) != len(temp_df):
    ambiguous = df_id.loc[
        df_id['payer_country'].duplicated(keep=False)
        & df_id['payer_country'].isin(temp_df['payer_country']),
        'payer_country',
    ].unique()
    raise ValueError(
        "Merge duplicated prediction rows; payer_country values with more than "
        f"one id mapping: {sorted(ambiguous)}"
    )

In [30]:
# Put the 'processing_date' column at the front of the DataFrame.
# DataFrame.insert raises ValueError if the column already exists, so when this
# cell is re-run the existing column is overwritten in place instead.
if 'processing_date' in df_final.columns:
    df_final['processing_date'] = today
else:
    df_final.insert(0, 'processing_date', today)

In [38]:
# Convert processing_date from date objects to pandas datetimes.
df_final['processing_date'] = pd.to_datetime(df_final['processing_date'] )

In [31]:
# Count prediction rows left without an id_country after the left merge.
df_final['id_country'].isnull().sum()

0

In [32]:
# Count prediction rows left without an id_main_branch after the left merge.
df_final['id_main_branch'].isnull().sum()

0

In [34]:
# Splitting 'payer' & 'country'
# Split on the LAST underscore only: a payer name that itself contains '_'
# would otherwise produce more than two columns and make the assignment fail.
df_final[['payer', 'country']] = df_final['payer_country'].str.rsplit('_', n=1, expand=True)

In [35]:
# Preview the enriched predictions.
df_final.head()

Unnamed: 0,processing_date,pred_date,pred,payer_country,model,id_main_branch,id_country,payer,country
0,2024-03-21,2023-09-08,3372540.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl,T085,GUA,BANCO INDUSTRIAL,GUATEMALA
1,2024-03-21,2023-09-09,4034526.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl,T085,GUA,BANCO INDUSTRIAL,GUATEMALA
2,2024-03-21,2023-09-08,7579385.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl,T239,MEX,BANCOPPEL (APPRIZA),MEXICO
3,2024-03-21,2023-09-09,7809040.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl,T239,MEX,BANCOPPEL (APPRIZA),MEXICO
4,2024-03-21,2023-09-08,4733540.0,BANRURAL (RYT)_GUATEMALA,MODEL_2d_BANRURAL (RYT)_GUATEMALA.pkl,T228,GUA,BANRURAL (RYT),GUATEMALA


In [36]:
import sagemaker

# Get the SageMaker IAM execution role
# NOTE(review): displaying `role` stores the full role ARN (AWS account id
# included) in the saved notebook output; consider clearing it before sharing.
role = sagemaker.get_execution_role()
role

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


'arn:aws:iam::283731589572:role/service-role/SageMaker-ExecutionRole-20240102T104128'

In [39]:
# Check column dtypes and non-null counts of the final frame.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   processing_date  10 non-null     datetime64[ns]
 1   pred_date        10 non-null     object        
 2   pred             10 non-null     float64       
 3   payer_country    10 non-null     object        
 4   model            10 non-null     object        
 5   id_main_branch   10 non-null     string        
 6   id_country       10 non-null     string        
 7   payer            10 non-null     object        
 8   country          10 non-null     object        
dtypes: datetime64[ns](1), float64(1), object(5), string(2)
memory usage: 848.0+ bytes
