In [2]:
!pip install awswrangler
!pip install skforecast
!pip install pmdarima

[0m

In [3]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


### Load ABT and .pkl

In [4]:
# --- S3 configuration ---
# Bucket that is listed for both the ABT CSV and the model .pkl files.
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
# Prefix scanned for the trained model artifacts.
path = 'ABTv3/Last_releases/'

# ABT release to read ('v3_update' was the previously used value);
# the ABT prefix is derived from it.
ABTversion = 'v6'
path_ABT = 'ABT' + ABTversion + '/'

# Low-level S3 client used by the listing/download cells.
client = boto3.client('s3')

In [5]:
# Locate the first CSV object under the ABT prefix and load it into `data`.
csv_key = None

# Walk every result page: a single list call returns at most 1000 keys, so the
# CSV could be missed if the prefix holds more objects than that.
paginator = client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name, Prefix=path_ABT):
    csv_key = next(
        (obj['Key'] for obj in page.get('Contents', []) if obj['Key'].endswith('.csv')),
        None,
    )
    if csv_key is not None:
        break

# Fail fast: the following cells all need `data`, so carrying on without it
# would only surface later as a NameError far from the real cause.
if csv_key is None:
    raise FileNotFoundError(
        f"No CSV file found under s3://{bucket_name}/{path_ABT}"
    )

# Read CSV content from S3
csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
csv_content = csv_response['Body'].read().decode('utf-8')

# Transform CSV content to DataFrame; 'date' is stored as plain datetime.date objects here
data = pd.read_csv(StringIO(csv_content))
data['date'] = pd.to_datetime(data['date']).dt.date
print("CSV file loaded")

CSV file loaded


In [6]:
# Re-parse 'date' as pandas datetimes so the .dt accessor can be used on it.
data = data.assign(date=pd.to_datetime(data['date']))

In [7]:
# Set 'day_of_the_dead' to 1 on every row dated November 2nd; other rows are not touched.
is_nov_2 = (data['date'].dt.month == 11) & (data['date'].dt.day == 2)
data.loc[is_nov_2, 'day_of_the_dead'] = 1

In [8]:
data['amount'].sum()

27791166635.5917

In [9]:
data.date.max()

Timestamp('2024-02-11 00:00:00')

In [10]:
#data=data.loc[data.date<'2023-12-21']

In [11]:
data.payer_country.nunique()

133

## LOAD .pkl

In [12]:
# Collect the keys of every 2-day model artifact stored under `path`.
# A paginator is used because a single list call returns at most 1000 keys,
# which would silently drop models once the prefix grows past that size.
paginator = client.get_paginator('list_objects_v2')

# Keep .pkl objects whose key contains 'MODEL_2d', excluding keys ending in '_7d.pkl'.
pkl_files = [
    obj['Key']
    for page in paginator.paginate(Bucket=bucket_name, Prefix=path)
    for obj in page.get('Contents', [])
    if obj['Key'].endswith('.pkl')
    and 'MODEL_2d' in obj['Key']
    and not obj['Key'].endswith('_7d.pkl')
]

In [13]:
len(pkl_files)

131

In [14]:
pkl_files

['ABTv3/Last_releases/24XORO_MEXICO/MODEL_2d_24XORO_MEXICO.pkl',
 'ABTv3/Last_releases/ABANK (TN)_EL SALVADOR/MODEL_2d_ABANK (TN)_EL SALVADOR.pkl',
 'ABTv3/Last_releases/AFEX_CHILE/MODEL_2d_AFEX_CHILE.pkl',
 'ABTv3/Last_releases/AFRO INTERNACIONAL_GUINEA/MODEL_2d_AFRO INTERNACIONAL_GUINEA.pkl',
 'ABTv3/Last_releases/AFRO INTERNACIONAL_SIERRA LEONE/MODEL_2d_AFRO INTERNACIONAL_SIERRA LEONE.pkl',
 'ABTv3/Last_releases/AIRPAK_MEXICO/MODEL_2d_AIRPAK_MEXICO.pkl',
 'ABTv3/Last_releases/ARGENPER_ARGENTINA/MODEL_2d_ARGENPER_ARGENTINA.pkl',
 'ABTv3/Last_releases/ARGENPER_BOLIVIA/MODEL_2d_ARGENPER_BOLIVIA.pkl',
 'ABTv3/Last_releases/ARGENPER_CHILE/MODEL_2d_ARGENPER_CHILE.pkl',
 'ABTv3/Last_releases/ARGENPER_PERU/MODEL_2d_ARGENPER_PERU.pkl',
 'ABTv3/Last_releases/BAM - BANCO AGROMERCANTIL (UT)_GUATEMALA/MODEL_2d_BAM - BANCO AGROMERCANTIL (UT)_GUATEMALA.pkl',
 'ABTv3/Last_releases/BANCO AGRICOLA_EL SALVADOR/MODEL_2d_BANCO AGRICOLA_EL SALVADOR.pkl',
 'ABTv3/Last_releases/BANCO ATLANTIDA_HONDURAS/MOD

In [15]:
# Current local date; later stamped on the results as 'processing_date'.
today = datetime.today().date()

### Loop over every payer model and generate its 2-day predictions

In [16]:
# Empty frame with the intended result columns.
# NOTE(review): the inference loop reassigns df_temp on every iteration, so this
# initial frame does not appear to be used as an accumulator -- confirm it is still needed.
result_columns = ['date', 'pred', 'payer_country', 'model']
df_temp = pd.DataFrame(columns=result_columns)

In [17]:
# Collects the payer_country names whose prediction raised an error during inference.
payer_countries_pinched = list()

In [18]:
#data=data.loc[data.payer_country=='ELEKTRA (MEXICO)_MEXICO']

In [19]:
# Offset (in days) from the last day of each forecaster's stored window to the
# first day to predict.
# NOTE(review): the original note says this value grows by one day on every run
# and is bumped by hand -- confirm the intended value before executing.
DAYS_TO_FIRST_PREDICTION = 39
# Forecast horizon of the MODEL_2d artifacts (days predicted per payer).
PREDICTION_STEPS = 2

# One small frame per payer, concatenated once after the loop instead of
# re-copying an ever-growing frame on every iteration.
prediction_frames = []
# Start from a clean error list so re-running this cell does not duplicate entries.
payer_countries_pinched.clear()

# Iterate over pkl files
for file_key in pkl_files:
    # Keys look like '<root>/<folder>/<payer_country>/<model file>.pkl'
    payer_country = file_key.split('/')[2]
    print(payer_country)

    # Download pkl file from S3 and load it into memory.
    # SECURITY: joblib.load unpickles the payload, which can execute arbitrary
    # code -- only load artifacts from a bucket you trust.
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)
    exog_cols = forecaster.exog_col_names

    #### PAYER SETTING ####
    # Daily-frequency series for this payer only (asfreq inserts rows for missing days).
    datos = data[data['payer_country'] == payer_country].copy()
    datos['date'] = pd.to_datetime(datos['date'])
    datos = datos.set_index('date').asfreq('D')

    # Predictions settings
    last_stored_day = forecaster.last_window.index[-1]
    last_window_date = last_stored_day + pd.Timedelta(days=1)  # first day after the stored window
    test_date = last_stored_day + pd.Timedelta(days=DAYS_TO_FIRST_PREDICTION)  # first day to predict
    last_test_date = test_date + pd.Timedelta(days=PREDICTION_STEPS - 1)

    # Observed data between the stored window and the day before the first prediction;
    # missing exogenous values are filled with 0.
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy()
    data_last_window[exog_cols] = data_last_window[exog_cols].fillna(0)
    # Exogenous values for the days being predicted, missing values filled with 0.
    data_test = datos.loc[test_date:last_test_date].copy()
    data_test[exog_cols] = data_test[exog_cols].fillna(0)

    try:
        # The predict call must run INSIDE the context manager; otherwise the
        # "ignore" filter is already undone by the time the model emits warnings.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            predictions = forecaster.predict(
                steps=PREDICTION_STEPS,  # Days to predict
                exog=data_test[exog_cols],
                last_window=data_last_window['amount'],
                last_window_exog=data_last_window[exog_cols],
            )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()

    except Exception as exc:
        # Best-effort: record the failure, show its cause and fall back to zero
        # predictions. `Exception` (not a bare except) lets Ctrl-C / kernel
        # interrupts still stop the loop.
        print("\033[1;31m" + f"Error processing {payer_country}: {exc!r}" + "\033[0m")
        predictions = [0] * PREDICTION_STEPS
        df_temp = pd.DataFrame({
            'index': [test_date + pd.Timedelta(days=offset) for offset in range(PREDICTION_STEPS)],
            'pred': predictions,
        })
        payer_countries_pinched.append(payer_country)

    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]
    prediction_frames.append(df_temp)

# Build the combined result once; keep a well-formed empty frame when no model was found.
if prediction_frames:
    temp_df = pd.concat(prediction_frames, ignore_index=True)
else:
    temp_df = pd.DataFrame(columns=['index', 'pred', 'payer_country', 'model'])

24XORO_MEXICO
ABANK (TN)_EL SALVADOR
AFEX_CHILE
AFRO INTERNACIONAL_GUINEA
AFRO INTERNACIONAL_SIERRA LEONE
AIRPAK_MEXICO
ARGENPER_ARGENTINA
ARGENPER_BOLIVIA
ARGENPER_CHILE
ARGENPER_PERU
BAM - BANCO AGROMERCANTIL (UT)_GUATEMALA
BANCO AGRICOLA_EL SALVADOR
BANCO ATLANTIDA_HONDURAS
BANCO BHD LEON_DOMINICAN REPUBLIC
BANCO BISA_BOLIVIA
BANCO DAVIVIENDA SALVADORENO (BTS)_EL SALVADOR
BANCO DAYCOVAL_BRAZIL
BANCO DE CREDITO DEL PERU - BCP (UT)_PERU
BANCO DE GUAYAQUIL_ECUADOR
BANCO DE OCCIDENTE_HONDURAS
BANCO DE ORO (BDO)_PHILIPPINES
BANCO DEL AUSTRO_ECUADOR
BANCO GANADERO (BOLIVIA)_BOLIVIA
BANCO INDUSTRIAL ELS (RED CHAPINA)_EL SALVADOR
BANCO INDUSTRIAL_GUATEMALA
BANCO PICHINCHA (TN)_ECUADOR
BANCO POPULAR HONDURAS (UT)_HONDURAS
BANCO RENDIMENTO_BRAZIL
BANCOLOMBIA_COLOMBIA
BANCOPPEL (APPRIZA)_MEXICO
BANHCAFE_HONDURAS
BANK OF PHILIPPINE ISLANDS (BPI)_PHILIPPINES
BANORTE (UT)_MEXICO
BANPAIS (RED CHAPINA)_HONDURAS
BANPRO_NICARAGUA
BANRURAL (HONDURAS)_HONDURAS
BANRURAL (RYT)_GUATEMALA
BANTRAB (UT)_GUAT

In [20]:
#print(pkl_files)

In [21]:
data_last_window.index.min()
#test_date

Timestamp('2023-12-19 00:00:00')

In [22]:
payer_countries_pinched

['GIROSMEX_MEXICO']

In [23]:
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model
6,2024-01-26,-5.642385000000001e-17,AFRO INTERNACIONAL_GUINEA,MODEL_2d_AFRO INTERNACIONAL_GUINEA.pkl
82,2023-12-21,-0.0005028143,BNB_SIERRA LEONE,MODEL_2d_BNB_SIERRA LEONE.pkl
84,2024-01-26,-4.718102,CAJA POPULAR MEXICANA (UT)_MEXICO,MODEL_2d_CAJA POPULAR MEXICANA (UT)_MEXICO.pkl
85,2024-01-27,-4.718102,CAJA POPULAR MEXICANA (UT)_MEXICO,MODEL_2d_CAJA POPULAR MEXICANA (UT)_MEXICO.pkl
173,2024-01-27,-69.15966,MUTHOOT-BANGLADESH_BANGLADESH,MODEL_2d_MUTHOOT-BANGLADESH_BANGLADESH.pkl
222,2024-01-26,-406.0429,TRANSFERTO - THUNES_NIGERIA,MODEL_2d_TRANSFERTO - THUNES_NIGERIA.pkl
249,2024-01-24,-2.751725,VTN_NIGERIA,MODEL_2d_VTN_NIGERIA.pkl
260,2024-01-26,-444.8588,ZEEPAY_GHANA,MODEL_2d_ZEEPAY_GHANA.pkl
261,2024-01-27,-228.9884,ZEEPAY_GHANA,MODEL_2d_ZEEPAY_GHANA.pkl


In [24]:
# Negative predicted amounts are floored at 0; every other value is kept as is.
negative_pred = temp_df['pred'].lt(0)
temp_df['pred'] = temp_df['pred'].mask(negative_pred, 0)

In [25]:
# Keep only the calendar date (datetime.date objects) in the 'index' column.
pred_timestamps = pd.to_datetime(temp_df['index'])
temp_df['index'] = pred_timestamps.dt.date

In [26]:
temp_df.head(4)

Unnamed: 0,index,pred,payer_country,model
0,2024-01-26,42132.100326,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
1,2024-01-27,24509.257391,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
2,2024-01-26,16845.073341,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
3,2024-01-27,13952.075612,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl


In [27]:
# The former index column holds the predicted day, so name it accordingly.
temp_df = temp_df.rename(columns={'index': 'pred_date'})

In [28]:
temp_df

Unnamed: 0,pred_date,pred,payer_country,model
0,2024-01-26,42132.100326,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
1,2024-01-27,24509.257391,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl
2,2024-01-26,16845.073341,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
3,2024-01-27,13952.075612,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl
4,2024-01-26,3210.051757,AFEX_CHILE,MODEL_2d_AFEX_CHILE.pkl
...,...,...,...,...
257,2024-01-10,115.316176,ZEEPAY_CAMEROON,MODEL_2d_ZEEPAY_CAMEROON.pkl
258,2024-01-08,6.990699,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
259,2024-01-09,7.585481,ZEEPAY_COTE D'IVOIRE (IVORY COAST),MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl
260,2024-01-26,0.000000,ZEEPAY_GHANA,MODEL_2d_ZEEPAY_GHANA.pkl


In [29]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("ELEKTRA (MEXICO)_MEXICO")]

In [30]:
control

Unnamed: 0,pred_date,pred,payer_country,model
114,2024-01-26,8676394.0,ELEKTRA (MEXICO)_MEXICO,MODEL_2d_ELEKTRA (MEXICO)_MEXICO.pkl
115,2024-01-27,10162740.0,ELEKTRA (MEXICO)_MEXICO,MODEL_2d_ELEKTRA (MEXICO)_MEXICO.pkl


In [31]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("BANCOPPEL (APPRIZA)_MEXICO")]

In [32]:
control

Unnamed: 0,pred_date,pred,payer_country,model
58,2024-01-26,5383788.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl
59,2024-01-27,6068382.0,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl


In [33]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("BANRURAL (RYT)_GUATEMALA")]

In [34]:
control

Unnamed: 0,pred_date,pred,payer_country,model
72,2024-01-26,4070959.0,BANRURAL (RYT)_GUATEMALA,MODEL_2d_BANRURAL (RYT)_GUATEMALA_2d.pkl
73,2024-01-27,5436082.0,BANRURAL (RYT)_GUATEMALA,MODEL_2d_BANRURAL (RYT)_GUATEMALA_2d.pkl


In [35]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("BANCO INDUSTRIAL_GUATEMALA")]

In [36]:
control

Unnamed: 0,pred_date,pred,payer_country,model
48,2024-01-26,2760194.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl
49,2024-01-27,3563781.0,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl


In [37]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("BBVA - BANCOMER (BTS)_MEXICO")]

In [38]:
control

Unnamed: 0,pred_date,pred,payer_country,model
76,2024-01-26,854530.12912,BBVA - BANCOMER (BTS)_MEXICO,MODEL_2d_BBVA - BANCOMER (BTS)_MEXICO.pkl
77,2024-01-27,866486.138858,BBVA - BANCOMER (BTS)_MEXICO,MODEL_2d_BBVA - BANCOMER (BTS)_MEXICO.pkl


In [39]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("ENVIOS CON CLABE (APPRIZA)_MEXICO")]

In [40]:
control

Unnamed: 0,pred_date,pred,payer_country,model
120,2024-01-26,1247445.0,ENVIOS CON CLABE (APPRIZA)_MEXICO,MODEL_2d_ENVIOS CON CLABE (APPRIZA)_MEXICO.pkl
121,2024-01-27,1325423.0,ENVIOS CON CLABE (APPRIZA)_MEXICO,MODEL_2d_ENVIOS CON CLABE (APPRIZA)_MEXICO.pkl


In [41]:
# Spot-check: prediction rows of a single payer.
control = temp_df[temp_df['payer_country'].eq("BANORTE (UT)_MEXICO")]

In [42]:
control

Unnamed: 0,pred_date,pred,payer_country,model
64,2024-01-26,561341.073991,BANORTE (UT)_MEXICO,MODEL_2d_BANORTE (UT)_MEXICO.pkl
65,2024-01-27,608713.178084,BANORTE (UT)_MEXICO,MODEL_2d_BANORTE (UT)_MEXICO.pkl


### ADD ID_COUNTRY AND ID_PAYER FROM DAILY_CHECK_GP TABLE

In [43]:
# Athena / data-catalog settings for the payer id lookup table.
# The Athena S3 location gets its own name instead of overwriting `bucket_name`:
# the S3 listing/download cells use `bucket_name` as a bare bucket name, so
# re-running any of them after this cell would otherwise hit an invalid bucket.
# NOTE(review): athena_bucket and origin_name are not passed to the query cell
# that follows -- confirm whether they are still needed.
athena_bucket = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name = 'analytics'
table_name = 'daily_check_gp'

In [44]:
# Read the whole `table_name` table of `database_name` through Athena into a DataFrame.
df = wr.athena.read_sql_table(table=table_name, database=database_name)

awswrangler.athena._utils INFO  Created CTAS table "analytics"."temp_table_6b504c64b50e4ebe8140760814bdba5f"


In [45]:
# Build the '<payer>_<country>' key, the same format used to match predictions later on.
df = df.assign(payer_country=df['payer'] + '_' + df['country'])

In [46]:
# Lookup table: one row per distinct (payer_country, id_main_branch, id_country)
# combination, discarding combinations that have no id_main_branch.
id_columns = ['payer_country', 'id_main_branch', 'id_country']
df_id = (
    df[id_columns]
    .drop_duplicates()
    .dropna(subset=['id_main_branch'])
)

In [47]:
len(df_id)

370

In [48]:
# Attach the ids to every prediction row; the left join keeps all predictions,
# leaving NaN ids for payers that are absent from df_id.
# NOTE(review): if df_id holds more than one id combination for a payer_country,
# this join repeats that payer's prediction rows -- verify the key is unique.
df_final = temp_df.merge(df_id, how='left', on='payer_country')

In [49]:
# Insert the 'processing_date' column as the first column of the DataFrame,
# filled with the `today` value on every row.
df_final.insert(loc=0, column='processing_date', value=today)

In [50]:
# Rows of the Athena table for one payer with a date strictly between
# 2024-01-25 and 2024-01-28, shown for comparison against the predictions.
is_payer = df['payer_country'] == "BANCOPPEL (APPRIZA)_MEXICO"
in_window = (df['date'] > '2024-01-25') & (df['date'] < '2024-01-28')
control = df.loc[is_payer & in_window]
control

Unnamed: 0,payer,country,date,tx,amount,coupon_count,gp,id_main_branch,id_country,day,payer_country
48251,BANCOPPEL (APPRIZA),MEXICO,2024-01-26,11749,5616014.48,272,45082.8362,T239,MEX,2024-01-26,BANCOPPEL (APPRIZA)_MEXICO
82830,BANCOPPEL (APPRIZA),MEXICO,2024-01-27,13664,5934033.24,298,50307.3083,T239,MEX,2024-01-27,BANCOPPEL (APPRIZA)_MEXICO


In [51]:
# Store 'processing_date' as pandas datetimes instead of plain date objects.
df_final = df_final.assign(processing_date=pd.to_datetime(df_final['processing_date']))

In [52]:
df_final['id_country'].isnull().sum()

0

In [53]:
df_final['id_main_branch'].isnull().sum()

0

In [54]:
# Split 'payer_country' back into 'payer' and 'country'.
# The key is built as '<payer>_<country>', so split once from the right: a payer
# name containing '_' would otherwise yield more than two columns and make this
# two-column assignment fail.
# NOTE(review): assumes country names never contain '_' -- confirm against the data.
df_final[['payer', 'country']] = df_final['payer_country'].str.rsplit('_', n=1, expand=True)

In [55]:
df_final.head()

Unnamed: 0,processing_date,pred_date,pred,payer_country,model,id_main_branch,id_country,payer,country
0,2024-06-12,2024-01-26,42132.100326,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
1,2024-06-12,2024-01-27,24509.257391,24XORO_MEXICO,MODEL_2d_24XORO_MEXICO.pkl,T314,MEX,24XORO,MEXICO
2,2024-06-12,2024-01-26,16845.073341,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl,T282,ELS,ABANK (TN),EL SALVADOR
3,2024-06-12,2024-01-27,13952.075612,ABANK (TN)_EL SALVADOR,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl,T282,ELS,ABANK (TN),EL SALVADOR
4,2024-06-12,2024-01-26,3210.051757,AFEX_CHILE,MODEL_2d_AFEX_CHILE.pkl,T089,CHI,AFEX,CHILE


In [None]:
df_final.info()

In [None]:
df_final.pred.sum()

### Save the inferences
##### This file is the input for adding MAPEs to the historical MAPEs Excel file

In [None]:
# Destination settings for the inference output.
# NOTE(review): the save cell below spells out its full S3 path literally and
# does not read these two values -- confirm whether they are still needed.
path_inference = '/ABTv3_update/'  # folder under analysis
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'

In [None]:
# Upload the final predictions to S3 as a single CSV file.
# Only the first line of this call used to be commented out, which left the
# argument lines behind as invalid syntax. The call is now complete and guarded
# by an explicit flag, so "Run All" neither fails nor overwrites the file by accident.
SAVE_INFERENCES = False  # set to True to (over)write the predictions file

if SAVE_INFERENCES:
    wr.s3.to_csv(
        df=df_final,
        path='s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/ABTv3_update/Inferences_2d/predictions_2d.csv',
        dataset=False,
        index=False
    )

In [None]:
# Spot-check: final rows of the payer that was reported as failing during inference.
control = df_final[df_final['payer_country'].eq("GIROSMEX_MEXICO")]

In [None]:
control