In [30]:
# !pip install awswrangler
# !pip install skforecast
# !pip install pmdarima

In [190]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


In [6]:
# Set up S3 client
client = boto3.client('s3')
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
path = 'ABTv3/Last_releases/' # Folder under analysis

ABTversion = 'v3'
path_ABT = f'ABT{ABTversion}/' 

In [7]:
# Specify the CSV file key
csv_key = None

# List objects in the S3 path
response = client.list_objects(Bucket=bucket_name, Prefix=path_ABT)

# Find the CSV file in the S3 path
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.csv'):
        csv_key = obj['Key']
        break
        
# Check if CSV file is found
if csv_key is not None:
    # Read CSV content from S3
    csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
    csv_content = csv_response['Body'].read().decode('utf-8')

    # Transform CSV content to DataFrame
    data = pd.read_csv(StringIO(csv_content))
    data['date'] = pd.to_datetime(data['date']).dt.date
    print("CSV file loaded")
else:
    print("No CSV file found in the specified S3 path.")

CSV file loaded


In [8]:
#data = data[data['date'] < pd.to_datetime('2023-12-25').date()]

In [9]:
elements = client.list_objects(Bucket=bucket_name, Prefix=path)

#Listing pkl files
pkl_files = [obj['Key'] for obj in elements.get('Contents', []) if obj['Key'].endswith('.pkl')
                and ('MODEL_2d_' in obj['Key']) and not obj['Key'].endswith('_2d.pkl')]

In [10]:
len(pkl_files)

130

### Prueba con 1 payer

In [11]:
pkl_files[6]

'ABTv3/Last_releases/ARGENPER_ARGENTINA/MODEL_2d_ARGENPER_ARGENTINA.pkl'

In [12]:
file_key = pkl_files[6]

# Descargar el archivo pkl desde S3 y cargarlo en memoria
response = client.get_object(Bucket=bucket_name, Key=file_key)
buffer = BytesIO(response['Body'].read())
forecaster = joblib.load(buffer)

In [14]:
payer = 'ARGENPER_ARGENTINA'

#### PAYER SETTING ####
datos = data.loc[data.payer_country==payer]
datos['date'] = pd.to_datetime(datos['date'])
datos.set_index('date', inplace=True)
datos = datos.asfreq('D')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  datos['date'] = pd.to_datetime(datos['date'])


In [15]:
last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
test_date = pd.Timestamp('2023-12-25') # El primer dia de test seria el dia a predecir

data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)]
data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)

data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)] 
data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)


In [16]:
predictions = forecaster.predict(
                  steps            = 2,
                  exog             = data_test[forecaster.exog_col_names],
                  last_window      = data_last_window['amount'],
                  last_window_exog = data_last_window[forecaster.exog_col_names]
              )

In [17]:
predictions

2023-12-25    485.241489
2023-12-26    308.361684
Freq: D, Name: pred, dtype: float64

In [18]:
pd.DataFrame(predictions, columns=['pred']).reset_index()

Unnamed: 0,index,pred
0,2023-12-25,485.241489
1,2023-12-26,308.361684


### Bucle para procesar todos

In [19]:
# Initialize an empty DataFrame to store the results
df_temp = pd.DataFrame(columns=['date', 'pred', 'payer_country', 'model'])

In [20]:
df_temp.empty

True

In [194]:
i = 1

# Iterate over pkl files
for file_key in pkl_files:
    # Extract payer_country from file_key
    payer_country = file_key.split('/')[2]
    print(payer_country)
    
    # Download pkl file from S3 and load it into memory
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)
    
    #### PAYER SETTING ####
    # Filter data for the specific payer_country
    datos = data[data['payer_country'] == payer_country].copy()
#    datos = data.loc[data.payer_country == payer_country]
    datos['date'] = pd.to_datetime(datos['date'])
    datos.set_index('date', inplace=True)
    datos = datos.asfreq('D')

    # Predictions settings
    last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1)
    test_date = pd.Timestamp('2023-12-25')  # The first test day would be the day to predict

    # Extract data for last window and test period
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy()
#    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)]
    data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)
    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)].copy()
#    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)] 
    data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)

    try:
        
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
        # Make predictions
        predictions = forecaster.predict(
                          steps            = 2,
                          exog             = data_test[forecaster.exog_col_names],
                          last_window      = data_last_window['amount'],
                          last_window_exog = data_last_window[forecaster.exog_col_names]
                      )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()
#        print(df_temp)

    except:
        # If an exception occurs, set predictions to zero
        print("\033[1;31m" + f"Error processing {payer_country}" + "\033[0m")
        predictions = [0, 0]
        df_temp = pd.DataFrame({'index': [test_date, test_date + pd.Timedelta(days=1)], 'pred': predictions})


    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]

    # Concatenate df_temp with the main DataFrame
    if i:
        temp_df = df_temp.copy()
        i = 0
    else:
        temp_df = pd.concat([temp_df, df_temp], ignore_index=True)

24XORO_MEXICO
ABANK (TN)_EL SALVADOR
AFEX_CHILE
AFRO INTERNACIONAL_GUINEA
AFRO INTERNACIONAL_SIERRA LEONE
AIRPAK_MEXICO
ARGENPER_ARGENTINA
ARGENPER_BOLIVIA
ARGENPER_CHILE
ARGENPER_PERU
BAM - BANCO AGROMERCANTIL (UT)_GUATEMALA
BANCO AGRICOLA_EL SALVADOR
BANCO ATLANTIDA_HONDURAS
BANCO BHD LEON_DOMINICAN REPUBLIC
BANCO BISA_BOLIVIA
BANCO DAVIVIENDA SALVADORENO (BTS)_EL SALVADOR
BANCO DAYCOVAL_BRAZIL
BANCO DE CREDITO DEL PERU - BCP (UT)_PERU
BANCO DE GUAYAQUIL_ECUADOR
BANCO DE OCCIDENTE_HONDURAS
BANCO DE ORO (BDO)_PHILIPPINES
BANCO DEL AUSTRO_ECUADOR
BANCO GANADERO (BOLIVIA)_BOLIVIA
BANCO INDUSTRIAL ELS (RED CHAPINA)_EL SALVADOR
BANCO INDUSTRIAL_GUATEMALA
BANCO PICHINCHA (TN)_ECUADOR
BANCO POPULAR HONDURAS (UT)_HONDURAS
[1;31mError en procesar BANCO POPULAR HONDURAS (UT)_HONDURAS[0m
BANCO RENDIMENTO_BRAZIL
BANCOLOMBIA_COLOMBIA
BANCOPPEL (APPRIZA)_MEXICO
BANHCAFE_HONDURAS
BANK OF PHILIPPINE ISLANDS (BPI)_PHILIPPINES
BANORTE (UT)_MEXICO
BANPAIS (RED CHAPINA)_HONDURAS
BANPRO_NICARAGUA
BANRU

In [195]:
len(temp_df)

260

In [196]:
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model
48,2023-12-25,-148947.677845,BANCO INDUSTRIAL_GUATEMALA,MODEL_2d_BANCO INDUSTRIAL_GUATEMALA.pkl
54,2023-12-25,-67191.858226,BANCO RENDIMENTO_BRAZIL,MODEL_2d_BANCO RENDIMENTO_BRAZIL.pkl
58,2023-12-25,-19169.543009,BANCOPPEL (APPRIZA)_MEXICO,MODEL_2d_BANCOPPEL (APPRIZA)_MEXICO.pkl
70,2023-12-25,-26701.657343,BANRURAL (HONDURAS)_HONDURAS,MODEL_2d_BANRURAL (HONDURAS)_HONDURAS.pkl
108,2023-12-25,-12963.753813,ELEKTRA (BTS)_GUATEMALA,MODEL_2d_ELEKTRA (BTS)_GUATEMALA.pkl
110,2023-12-25,-95825.648347,ELEKTRA (BTS)_HONDURAS,MODEL_2d_ELEKTRA (BTS)_HONDURAS.pkl
147,2023-12-26,-6.873834,KORI_BENIN,MODEL_2d_KORI_BENIN.pkl


In [197]:
# Replace values in 'pred' column with 0 where 'pred' is less than 0
temp_df.loc[temp_df['pred'] < 0, 'pred'] = 0

In [204]:
# Convert 'index' column to date format
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date.astype(str)

In [209]:
# Adding processing date
temp_df['index'] =  temp_df['index'] + '_' + today.strftime('%Y-%m-%d')

In [211]:
temp_df = pd.pivot_table(temp_df, index=['payer_country','model'], columns='index', values='pred', aggfunc='first', fill_value=None).reset_index()

In [95]:
## LEVANTO ULTIMA ABT ACTUALIZADA
prefix = 'abt_parquet/'

today = datetime.now().date()
yesterday = today - timedelta(days=1)

today_folder = 'dt=' + today.strftime('%Y-%m-%d') + '/'
yesterday_folder = 'dt=' + yesterday.strftime('%Y-%m-%d') + '/'

In [96]:
objects = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix+yesterday_folder)

In [97]:
import s3fs
import pyarrow.parquet as pq

# Check if objects were found
if 'Contents' in objects:
    # Get the Parquet file path within the folder
    file_key = objects['Contents'][0]['Key']
    
    # Create an S3 FileSystem client
    s3_file_system = s3fs.S3FileSystem()

    # Read the Parquet file using pyarrow
    with s3_file_system.open(f's3://{bucket_name}/{file_key}', mode='rb') as file:
        parquet_table = pq.read_table(file)
    
    # Convert the Parquet table to a DataFrame (optional)
    df = parquet_table.to_pandas()
else:
    print(f"No Parquet files found in the folder {prefix+yesterday_folder}.")

In [98]:
#parquet_table.to_pandas()

In [99]:
df['id_country'].isnull().sum()

32051

In [100]:
df['id_main_branch'].isnull().sum()

19343

In [101]:
#df.isnull().sum().to_list()

In [72]:
df_id = df[['payer_country', 'id_main_branch', 'id_country']].drop_duplicates().dropna(subset='id_main_branch')

In [80]:
#df_id.to_excel('df_id.xlsx')

In [74]:
len(df_id)

98

In [78]:
len(df['payer_country'].unique())

97

In [159]:
#sorted(df['payer_country'].unique())

In [212]:
df_final = pd.merge(temp_df, df_id, on='payer_country', how='left')

In [213]:
df_final['processing_date'] = today

In [214]:
### Workaround > Elimino duplicados
df_final = df_final.drop_duplicates(subset=['payer_country'], keep='first')

In [215]:
df_final.columns

Index(['payer_country', 'model', '2023-12-25_2024-03-19',
       '2023-12-26_2024-03-19', 'id_main_branch', 'id_country',
       'processing_date'],
      dtype='object')

In [216]:
# Splitting 'payer' & 'country'
df_final[['payer', 'country']] = df_final['payer_country'].str.split('_', expand=True)

In [217]:
df_final[['processing_date', 'payer_country','payer', 'country', 'id_main_branch', 'id_country',
         'model', '2023-12-25_2024-03-19','2023-12-26_2024-03-19' ]]

Unnamed: 0,processing_date,payer_country,payer,country,id_main_branch,id_country,model,2023-12-25_2024-03-19,2023-12-26_2024-03-19
0,2024-03-19,24XORO_MEXICO,24XORO,MEXICO,T314,MEX,MODEL_2d_24XORO_MEXICO.pkl,36947.319758,43751.589258
1,2024-03-19,ABANK (TN)_EL SALVADOR,ABANK (TN),EL SALVADOR,T282,,MODEL_2d_ABANK (TN)_EL SALVADOR.pkl,9590.269894,8998.724809
2,2024-03-19,AFEX_CHILE,AFEX,CHILE,T089,,MODEL_2d_AFEX_CHILE.pkl,5632.096572,5745.325640
3,2024-03-19,AFRO INTERNACIONAL_GUINEA,AFRO INTERNACIONAL,GUINEA,T297,,MODEL_2d_AFRO INTERNACIONAL_GUINEA.pkl,406.490380,72.425212
4,2024-03-19,AFRO INTERNACIONAL_SIERRA LEONE,AFRO INTERNACIONAL,SIERRA LEONE,T297,,MODEL_2d_AFRO INTERNACIONAL_SIERRA LEONE.pkl,23457.119481,23622.173239
...,...,...,...,...,...,...,...,...,...
127,2024-03-19,WALMART (UT)_MEXICO,WALMART (UT),MEXICO,,,MODEL_2d_WALMART (UT)_MEXICO.pkl,73846.434934,252735.429911
128,2024-03-19,YES BANK_INDIA,YES BANK,INDIA,T253,IND,MODEL_2d_YES BANK_INDIA.pkl,26430.191603,21818.088564
129,2024-03-19,ZEEPAY_CAMEROON,ZEEPAY,CAMEROON,T280,,MODEL_2d_ZEEPAY_CAMEROON.pkl,0.000000,0.000000
130,2024-03-19,ZEEPAY_COTE D'IVOIRE (IVORY COAST),ZEEPAY,COTE D'IVOIRE (IVORY COAST),T280,,MODEL_2d_ZEEPAY_COTE D'IVOIRE (IVORY COAST).pkl,0.000000,0.000000
