In [3]:
# !pip install awswrangler
# !pip install skforecast
# !pip install pmdarima

In [4]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


### Levantando ABT y leyendo los pkls

In [5]:
# Set up S3 client
client = boto3.client('s3')
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
path = 'ABTv3/Last_releases_7d_8lags/' # Folder under analysis

ABTversion = 'v3'
path_ABT = f'ABT{ABTversion}/' 

In [6]:
# Specify the CSV file key
csv_key = None

# List objects in the S3 path
response = client.list_objects(Bucket=bucket_name, Prefix=path_ABT)

# Find the CSV file in the S3 path
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.csv'):
        csv_key = obj['Key']
        break
        
# Check if CSV file is found
if csv_key is not None:
    # Read CSV content from S3
    csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
    csv_content = csv_response['Body'].read().decode('utf-8')

    # Transform CSV content to DataFrame
    data = pd.read_csv(StringIO(csv_content))
    data['date'] = pd.to_datetime(data['date']).dt.date
    print("CSV file loaded")
else:
    print("No CSV file found in the specified S3 path.")

CSV file loaded


In [62]:
data['date'] = pd.to_datetime(data['date'])

In [63]:
# Marcar con 1 en 'day_of_the_dead' cuando 'date' sea 2 de noviembre
data.loc[data['date'].dt.month.eq(11) & data['date'].dt.day.eq(2), 'day_of_the_dead'] = 1

In [64]:
data['amount'].sum()

26753064533.761703

In [38]:
elements = client.list_objects(Bucket=bucket_name, Prefix=path)

#Listing pkl files
pkl_files = [obj['Key'] for obj in elements.get('Contents', []) if obj['Key'].endswith('.pkl')
                and ('MODEL_7d' in obj['Key']) and not obj['Key'].endswith('_2d.pkl')]  # File structure 

In [39]:
len(pkl_files)

131

In [40]:
today = datetime.now().date()

### Bucle para procesar todos

In [41]:
# Initialize an empty DataFrame to store the results
df_temp = pd.DataFrame(columns=['date', 'pred', 'payer_country', 'model'])

In [66]:
# Cuales son los que tengo que revisar
payer_countries_pinched = []

In [67]:
i = 1

# Iterate over pkl files
for file_key in pkl_files:
    # Extract payer_country from file_key
    payer_country = file_key.split('/')[2]
    print(payer_country)
    
    # Download pkl file from S3 and load it into memory
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)
    
    #### PAYER SETTING ####
    # Filter data for the specific payer_country
    datos = data[data['payer_country'] == payer_country].copy()
#    datos = data.loc[data.payer_country == payer_country]
    datos['date'] = pd.to_datetime(datos['date'])
    datos.set_index('date', inplace=True)
    datos = datos.asfreq('D')

    # Predictions settings
    last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
    test_date = forecaster.last_window.index[-1] + pd.Timedelta(days=7) # The first test day would be the day to predict

    # Extract data for last window and test period
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy()
    data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)
    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=7)].copy()
    data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)

    try:
        
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
        # Make predictions
        predictions = forecaster.predict(
                          steps            = 7, # Days to predict
                          exog             = data_test[forecaster.exog_col_names],
                          last_window      = data_last_window['amount'],
                          last_window_exog = data_last_window[forecaster.exog_col_names]
                      )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()

    except:
        # If an exception occurs, set predictions to zero
        print("\033[1;31m" + f"Error processing {payer_country}" + "\033[0m")
        predictions = [0, 0]
        df_temp = pd.DataFrame({'index': [test_date, test_date + pd.Timedelta(days=1)], 'pred': predictions})
        payer_countries_pinched.append(payer_country)


    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]

    # Concatenate df_temp with the main DataFrame
    if i:
        temp_df = df_temp.copy()
        i = 0
    else:
        temp_df = pd.concat([temp_df, df_temp], ignore_index=True)

24XORO_MEXICO
ABANK (TN)_EL SALVADOR
AFEX_CHILE
AFRO INTERNACIONAL_GUINEA
AFRO INTERNACIONAL_SIERRA LEONE
AIRPAK_MEXICO
ARGENPER_ARGENTINA
[1;31mError processing ARGENPER_ARGENTINA[0m
ARGENPER_BOLIVIA
[1;31mError processing ARGENPER_BOLIVIA[0m
ARGENPER_CHILE
ARGENPER_PERU
BAM - BANCO AGROMERCANTIL (UT)_GUATEMALA
BANCO AGRICOLA_EL SALVADOR
BANCO ATLANTIDA_HONDURAS
BANCO BHD LEON_DOMINICAN REPUBLIC
BANCO BISA_BOLIVIA
BANCO DAVIVIENDA SALVADORENO (BTS)_EL SALVADOR
BANCO DAYCOVAL_BRAZIL
BANCO DE CREDITO DEL PERU - BCP (UT)_PERU
BANCO DE GUAYAQUIL_ECUADOR
BANCO DE OCCIDENTE_HONDURAS
BANCO DE ORO (BDO)_PHILIPPINES
BANCO DEL AUSTRO_ECUADOR
BANCO GANADERO (BOLIVIA)_BOLIVIA
BANCO INDUSTRIAL ELS (RED CHAPINA)_EL SALVADOR
[1;31mError processing BANCO INDUSTRIAL ELS (RED CHAPINA)_EL SALVADOR[0m
BANCO INDUSTRIAL_GUATEMALA
BANCO PICHINCHA (TN)_ECUADOR
BANCO POPULAR HONDURAS (UT)_HONDURAS
[1;31mError processing BANCO POPULAR HONDURAS (UT)_HONDURAS[0m
BANCO RENDIMENTO_BRAZIL
BANCOLOMBIA_COLOMB

In [70]:
payer_countries_pinched

['ARGENPER_ARGENTINA',
 'ARGENPER_BOLIVIA',
 'BANCO INDUSTRIAL ELS (RED CHAPINA)_EL SALVADOR',
 'BANCO POPULAR HONDURAS (UT)_HONDURAS',
 'BNB_SIERRA LEONE',
 'CAJA POPULAR MEXICANA (UT)_MEXICO',
 'GIROSMEX_MEXICO',
 'KORI_BENIN',
 'KORI_BURKINA FASO',
 'MERCHANTRADE_MALAYSIA',
 'MERCHANTRADE_THAILAND',
 'MUTHOOT-BANGLADESH_BANGLADESH',
 'NAFA_LIBERIA',
 'SPEED MONEY_PHILIPPINES',
 'TRANSFERTO - THUNES_BELGIUM',
 'TRANSFERTO - THUNES_FRANCE',
 'TRANSFERTO - THUNES_GERMANY',
 'TRANSFERTO - THUNES_GHANA',
 'TRANSFERTO - THUNES_ITALY',
 'TRANSFERTO - THUNES_UNITED KINGDOM',
 'TRANSPAY_BANGLADESH',
 'TRANSPAY_BENIN',
 'TRANSPAY_CAMEROON',
 'TRANSPAY_INDONESIA',
 'TRANSPAY_SENEGAL',
 'TRANSPAY_TOGO',
 'VIAMERICAS USA DEPOSITS (RT)_UNITED STATES',
 'VTN_NIGERIA',
 'ZEEPAY_CAMEROON',
 "ZEEPAY_COTE D'IVOIRE (IVORY COAST)",
 'ZEEPAY_GHANA']

In [71]:
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model
88,2023-12-25,-1542.579388,BANCO BISA_BOLIVIA,MODEL_7dBANCO BISA_BOLIVIA.pkl
169,2023-12-25,-56778.243308,BANCO RENDIMENTO_BRAZIL,MODEL_7dBANCO RENDIMENTO_BRAZIL.pkl
225,2023-12-25,-22445.473373,BANRURAL (HONDURAS)_HONDURAS,MODEL_7dBANRURAL (HONDURAS)_HONDURAS.pkl
362,2023-12-25,-59673.578578,ELEKTRA (BTS)_HONDURAS,MODEL_7dELEKTRA (BTS)_HONDURAS.pkl
476,2023-12-25,-168.070525,JMMB_JAMAICA,MODEL_7dJMMB_JAMAICA.pkl
478,2023-12-27,-105.307597,JMMB_JAMAICA,MODEL_7dJMMB_JAMAICA.pkl
479,2023-12-28,-7.734317,JMMB_JAMAICA,MODEL_7dJMMB_JAMAICA.pkl
480,2023-12-29,-94.230627,JMMB_JAMAICA,MODEL_7dJMMB_JAMAICA.pkl
482,2023-12-31,-177.859549,JMMB_JAMAICA,MODEL_7dJMMB_JAMAICA.pkl
523,2023-12-31,-139.172401,MERCHANTRADE_NEPAL,MODEL_7dMERCHANTRADE_NEPAL.pkl


In [72]:
# Replace values in 'pred' column with 0 where 'pred' is less than 0
temp_df.loc[temp_df['pred'] < 0, 'pred'] = 0

In [73]:
# Convert 'index' column to date format
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date

In [74]:
temp_df

Unnamed: 0,index,pred,payer_country,model
0,2023-12-25,34262.482708,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl
1,2023-12-26,33697.560404,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl
2,2023-12-27,27710.331951,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl
3,2023-12-28,41453.183069,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl
4,2023-12-29,43495.725388,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl
...,...,...,...,...
757,2023-12-09,0.000000,ZEEPAY_CAMEROON,MODEL_7dZEEPAY_CAMEROON.pkl
758,2023-12-07,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_7dZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
759,2023-12-08,0.000000,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_7dZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
760,2023-12-25,0.000000,ZEEPAY_GHANA,MODEL_7dZEEPAY_GHANA.pkl


In [75]:
temp_df.rename(columns={'index':'pred_date'}, inplace=True)

In [76]:
# ## LEVANTO ULTIMA ABT ACTUALIZADA
# prefix = 'abt_parquet/'

# today = datetime.now().date()
# yesterday = today - timedelta(days=1)

# today_folder = 'dt=' + today.strftime('%Y-%m-%d') + '/'
# yesterday_folder = 'dt=' + yesterday.strftime('%Y-%m-%d') + '/'

In [77]:
# objects = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix+yesterday_folder)

In [78]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check_gp'

In [79]:
df = wr.athena.read_sql_table(
    table=table_name,
    database=database_name,
)

awswrangler.athena._utils INFO  Created CTAS table "analytics"."temp_table_80f71945daaa46a8b41b1ceb0b2531c6"


In [80]:
df['payer_country'] = df['payer'] + '_' + df['country']

In [81]:
df_id = df[['payer_country', 'id_main_branch', 'id_country']].drop_duplicates().dropna(subset='id_main_branch')

In [82]:
len(df_id)

334

In [83]:
df_final = pd.merge(temp_df, df_id, on='payer_country', how='left')

In [84]:
# Insertar la columna 'processing_date' al principio del DataFrame
df_final.insert(0, 'processing_date', today)

In [85]:
df_final['processing_date'] = pd.to_datetime(df_final['processing_date'] )

In [86]:
df_final['id_country'].isnull().sum()

0

In [87]:
df_final['id_main_branch'].isnull().sum()

0

In [88]:
# Splitting 'payer' & 'country'
df_final[['payer', 'country']] = df_final['payer_country'].str.split('_', expand=True)

In [89]:
df_final.head()

Unnamed: 0,processing_date,pred_date,pred,payer_country,model,id_main_branch,id_country,payer,country
0,2024-03-25,2023-12-25,34262.482708,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
1,2024-03-25,2023-12-26,33697.560404,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
2,2024-03-25,2023-12-27,27710.331951,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
3,2024-03-25,2023-12-28,41453.183069,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
4,2024-03-25,2023-12-29,43495.725388,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO


In [90]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 762 entries, 0 to 761
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   processing_date  762 non-null    datetime64[ns]
 1   pred_date        762 non-null    object        
 2   pred             762 non-null    float64       
 3   payer_country    762 non-null    object        
 4   model            762 non-null    object        
 5   id_main_branch   762 non-null    string        
 6   id_country       762 non-null    string        
 7   payer            762 non-null    object        
 8   country          762 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(5), string(2)
memory usage: 53.7+ KB


In [95]:
file_name = 'predictions_8d.parquet'
path_s3 = 's3://'+bucket_name+path+file_name

# Guarda el DataFrame en formato Parquet en S3
wr.s3.to_parquet(df_final, path=path_s3, dataset=True, index=False)


{'paths': ['s3://s3://viamericas-datalake-dev-us-east-1-283731589572-athena/ABTv3/Last_releases_7d_8lags/predictions_8d.parquet/fa3dd84a395a4c4a85d3f978d06f27ca.snappy.parquet'],
 'partitions_values': {}}

In [96]:
file_name = 'predictions_8d.csv'
path_s3 = 's3://'+bucket_name+path+file_name

# Guarda el DataFrame en formato Parquet en S3
wr.s3.to_csv(df_final, path=path_s3, dataset=True, index=False)

{'paths': ['s3://s3://viamericas-datalake-dev-us-east-1-283731589572-athena/ABTv3/Last_releases_7d_8lags/predictions_8d.csv/31aa71b0ac1d4b3382e922a2a658f2bc.csv'],
 'partitions_values': {}}

In [98]:
df_final.head(20)

Unnamed: 0,processing_date,pred_date,pred,payer_country,model,id_main_branch,id_country,payer,country
0,2024-03-25,2023-12-25,34262.482708,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
1,2024-03-25,2023-12-26,33697.560404,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
2,2024-03-25,2023-12-27,27710.331951,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
3,2024-03-25,2023-12-28,41453.183069,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
4,2024-03-25,2023-12-29,43495.725388,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
5,2024-03-25,2023-12-30,32616.550238,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
6,2024-03-25,2023-12-31,52192.900937,24XORO_MEXICO,MODEL_7d24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
7,2024-03-25,2023-12-25,8958.847975,ABANK (TN)_EL SALVADOR,MODEL_7dABANK (TN)_EL SALVADOR.pkl,T282,ELS,ABANK (TN),EL SALVADOR
8,2024-03-25,2023-12-26,8264.580743,ABANK (TN)_EL SALVADOR,MODEL_7dABANK (TN)_EL SALVADOR.pkl,T282,ELS,ABANK (TN),EL SALVADOR
9,2024-03-25,2023-12-27,10240.806858,ABANK (TN)_EL SALVADOR,MODEL_7dABANK (TN)_EL SALVADOR.pkl,T282,ELS,ABANK (TN),EL SALVADOR
