In [None]:
import boto3

In [None]:
import pandas as pd
from io import StringIO

In [None]:
# Set up S3 client
client = boto3.client('s3')

In [None]:
folders_list =  ['forecast_1d_25_11_2022', 'forecast_1d_20_12_2022', 'forecast_1d_29_12_2022', 
                'forecast_1d_20_03_2023', 'forecast_1d_05_05_2023', 'forecast_1d_30_06_2023', 'forecast_1d_10_09_2023']

In [None]:
# Bucket name and common prefix
bucket_forecast = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
common_path = 'Forecast/Forecast_analysis/'

In [None]:
# Initialize a list to store DataFrames
dfs = []

In [None]:
# Iterate over each folder in the list
for folder_item in folders_list:
    # Build the new path for the current folder
    path = f'{common_path}{folder_item}/'
    
    # List objects in the folder
    elements = client.list_objects(Bucket=bucket_forecast, Prefix=path)

    # Iterate over each object in the folder
    for obj in elements.get('Contents', []):
        # Check if the object is a CSV file
        if obj['Key'].endswith('.csv'):
            # Read CSV content from S3
            response = client.get_object(Bucket=bucket_forecast, Key=obj['Key'])
            csv_content = response['Body'].read().decode('utf-8')

            # Transform CSV content to DataFrame
            df = pd.read_csv(StringIO(csv_content))

            
            df['folder_id'] = folder_item # 'folder_id' column to indentify the forecast

            dfs.append(df)

In [None]:
# Concatenate all DataFrames into a single one
final_df = pd.concat(dfs, ignore_index=True)

In [None]:
final_df['folder_id'].unique()

In [34]:
final_df['date'] = pd.to_datetime(final_df['date']).dt.date

In [35]:
final_df.drop('item_id', axis=1, inplace=True)

In [36]:
final_df = final_df[['folder_id', 'date','country', 'payer', 'p10', 'p50', 'p90', 'mean']]

In [37]:
final_df['date'].unique()

array([datetime.date(2022, 12, 21), datetime.date(2022, 12, 30),
       datetime.date(2023, 3, 21), datetime.date(2023, 5, 6),
       datetime.date(2023, 9, 11), datetime.date(2022, 11, 26)],
      dtype=object)

In [None]:
final_df = final_df.applymap(lambda x: x.upper() if isinstance(x, str) else x)

In [None]:
final_df.head()

In [19]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check'

In [21]:
#pip install pyathena

In [22]:
from pyathena import connect

#Connection
conn = connect(s3_staging_dir=f'{bucket_name}queryresults/',
               region_name='us-east-1')
df_daily_check = pd.read_sql(f'SELECT * FROM {origin_name}.{database_name}.{table_name};', conn)

  df_daily_check = pd.read_sql(f'SELECT * FROM {origin_name}.{database_name}.{table_name};', conn)


In [23]:
df_daily_check['date'] = pd.to_datetime(df_daily_check['date']).dt.date

In [24]:
df_daily_check.head()

Unnamed: 0,payer,country,date,tx,amount,coupon_count
0,ECUAGIROS,ECUADOR,2022-06-27,144,106780.51,3
1,BANCO BISA,BOLIVIA,2023-07-04,8,3650.0,0
2,COOPERATIVA FRONTERIZA (UT),HONDURAS,2022-07-28,29,12747.0,1
3,ABANK (TN),EL SALVADOR,2022-02-01,73,16275.0,1
4,COOPERATIVA CACIL (UT),HONDURAS,2022-08-14,29,10516.0,0


In [40]:
# Merge both df, keeping only daily_check data related to forecast
df_forecast = pd.merge(df_daily_check, final_df, on=['date','payer','country'], how='inner') 

In [41]:
df_forecast['date'].nunique()

6

In [42]:
df_forecast = df_forecast.loc[df_forecast.amount!=0] # To process

In [43]:
df_forecast["pe"] = abs((df_forecast['amount'] - df_forecast['mean']) / df_forecast['amount'])

In [44]:
df_forecast.pe.min(), df_forecast.pe.max()

(0.0010471080255033559, 37.006381835938)

In [45]:
df_forecast

Unnamed: 0,payer,country,date,tx,amount,coupon_count,folder_id,p10,p50,p90,mean,pe
0,ENVICON (ANTERIORMENTE ENVIOS CONFIANZA),MEXICO,2023-03-21,5,5425.79,0,FORECAST_1D_20_03_2023,900.414218,2532.237232,3852.664767,2479.142827,0.543082
1,ENVICON (ANTERIORMENTE ENVIOS CONFIANZA),MEXICO,2023-03-21,5,5425.79,0,FORECAST_1D_20_03_2023,900.414218,2532.237232,3852.664767,2479.142827,0.543082
2,ARGENPER,PERU,2023-05-06,35,9074.60,0,FORECAST_1D_05_05_2023,3395.548091,5836.441886,8225.175661,5792.348341,0.361697
3,ARGENPER,PERU,2023-05-06,35,9074.60,0,FORECAST_1D_05_05_2023,3395.548091,5836.441886,8225.175661,5792.348341,0.361697
4,AFEX,CHILE,2022-12-21,15,4594.00,0,FORECAST_1D_20_12_2022,1710.034373,4590.164825,6774.867471,4481.421242,0.024506
...,...,...,...,...,...,...,...,...,...,...,...,...
1074,INTERMEX,MEXICO,2023-03-21,750,277484.17,5,FORECAST_1D_20_03_2023,248249.706987,291045.540367,326468.436511,290278.960944,0.046110
1075,INTERMEX,MEXICO,2023-03-21,750,277484.17,5,FORECAST_1D_20_03_2023,248249.706987,291045.540367,326468.436511,290278.960944,0.046110
1076,BANRURAL (HONDURAS),HONDURAS,2023-03-21,412,131286.35,3,FORECAST_1D_20_03_2023,104483.991729,131091.985160,153001.197092,130787.387522,0.003801
1077,BANRURAL (HONDURAS),HONDURAS,2023-03-21,412,131286.35,3,FORECAST_1D_20_03_2023,104483.991729,131091.985160,153001.197092,130787.387522,0.003801


In [46]:
df_forecast = df_forecast.sort_values(['folder_id', 'country', 'payer','date'])

In [47]:
df_forecast.head(20)

Unnamed: 0,payer,country,date,tx,amount,coupon_count,folder_id,p10,p50,p90,mean,pe
445,ARGENPER,ARGENTINA,2023-05-06,3,370.75,0,FORECAST_1D_05_05_2023,82.403453,721.255706,1373.7174,707.977127,0.909581
446,ARGENPER,ARGENTINA,2023-05-06,3,370.75,0,FORECAST_1D_05_05_2023,82.403453,721.255706,1373.7174,707.977127,0.909581
398,BANCO BISA,BOLIVIA,2023-05-06,22,7935.0,0,FORECAST_1D_05_05_2023,2677.524762,8941.877138,15033.117704,8877.098098,0.118727
399,BANCO BISA,BOLIVIA,2023-05-06,22,7935.0,0,FORECAST_1D_05_05_2023,2677.524762,8941.877138,15033.117704,8877.098098,0.118727
339,BANCO GANADERO (BOLIVIA),BOLIVIA,2023-05-06,9,5392.12,0,FORECAST_1D_05_05_2023,198.62595,3630.581908,7263.355936,3691.235851,0.315439
340,BANCO GANADERO (BOLIVIA),BOLIVIA,2023-05-06,9,5392.12,0,FORECAST_1D_05_05_2023,198.62595,3630.581908,7263.355936,3691.235851,0.315439
607,BANCO DAYCOVAL,BRAZIL,2023-05-06,234,140443.04,1,FORECAST_1D_05_05_2023,176263.323237,218070.415233,257561.645166,217647.226795,0.549719
608,BANCO DAYCOVAL,BRAZIL,2023-05-06,234,140443.04,1,FORECAST_1D_05_05_2023,176263.323237,218070.415233,257561.645166,217647.226795,0.549719
350,BANCO RENDIMENTO,BRAZIL,2023-05-06,159,116839.74,1,FORECAST_1D_05_05_2023,20496.736489,84796.048108,148818.934281,84432.546514,0.277364
351,BANCO RENDIMENTO,BRAZIL,2023-05-06,159,116839.74,1,FORECAST_1D_05_05_2023,20496.736489,84796.048108,148818.934281,84432.546514,0.277364


In [48]:
df_forecast['payer'].unique()

array(['ARGENPER', 'BANCO BISA', 'BANCO GANADERO (BOLIVIA)',
       'BANCO DAYCOVAL', 'BANCO RENDIMENTO', 'MSBB MONEY', 'TRANSPAY',
       'ZEEPAY', 'AFEX', 'BANCOLOMBIA', 'DAVIVIENDA', 'GRUPO EXITO (TN)',
       'BANCO BHD LEON', 'CARIBE EXPRESS', 'BANCO DE GUAYAQUIL',
       'BANCO DEL AUSTRO', 'EASY PAGOS', 'ECUAGIROS', 'GLOBAL ENVIOS',
       'ABANK (TN)', 'BANCO AGRICOLA',
       'BANCO DAVIVIENDA SALVADORENO (BTS)', 'CREDOMATIC',
       'CUSCA (CITI - RECIBA NETWORKS)', 'FEDECACES (EL SALVADOR)',
       'FEDECREDITO (RYT)', 'PROMERICA', 'NAFA',
       'BAM - BANCO AGROMERCANTIL (UT)', 'BANCO INDUSTRIAL',
       'BANRURAL (RYT)', 'BANTRAB (UT)', 'ELEKTRA (BTS)',
       'G Y T CONTINENTAL', 'MICOOPE-FENACOAC (RED CHAPINA)',
       'PROMERICA (GUA)', 'UNITRANSFER', 'BANCO ATLANTIDA',
       'BANCO DE OCCIDENTE', 'BANCO POPULAR HONDURAS (UT)', 'BANHCAFE',
       'BANPAIS (RED CHAPINA)', 'BANRURAL (HONDURAS)',
       'COOPERATIVA CACIL (UT)', 'COOPERATIVA FRONTERIZA (UT)',
       'DAV

In [50]:
# ELEKTRA MEXICO
df_forecast[df_forecast['payer'] == 'ELEKTRA (MEXICO)'].groupby('folder_id')['pe'].mean()

folder_id
FORECAST_1D_05_05_2023    0.126028
FORECAST_1D_10_09_2023    0.212293
FORECAST_1D_20_03_2023    0.157090
FORECAST_1D_20_12_2022    0.002657
FORECAST_1D_25_11_2022    0.052759
FORECAST_1D_29_12_2022    0.081638
Name: pe, dtype: float64

In [51]:
# ELEKTRA MEXICO
df_forecast[df_forecast['country'] == 'MEXICO'].groupby(['folder_id', 'country'])['pe'].mean()

folder_id               country
FORECAST_1D_05_05_2023  MEXICO     0.176420
FORECAST_1D_10_09_2023  MEXICO     0.267668
FORECAST_1D_20_03_2023  MEXICO     0.309759
FORECAST_1D_20_12_2022  MEXICO     0.098781
FORECAST_1D_25_11_2022  MEXICO     0.169074
FORECAST_1D_29_12_2022  MEXICO     0.164888
Name: pe, dtype: float64

In [122]:
df_mex = df_forecast[df_forecast['country'] == 'MEXICO']

In [123]:
df_first_grouped = df_first.sort_values(['folder_id','country']).groupby(['folder_id', 'country'])['pe'].mean().reset_index()

In [127]:
pd.set_option('display.float_format', '{:.2f}'.format)
df_first_grouped[df_first_grouped['country'] == 'ARGENTINA']

Unnamed: 0,folder_id,country,pe
0,FORECAST_7D_05_05_2023,ARGENTINA,0.44
39,FORECAST_7D_10_09_2023,ARGENTINA,0.29
84,FORECAST_7D_20_03_2023,ARGENTINA,0.1
119,FORECAST_7D_20_12_2022,ARGENTINA,0.25
153,FORECAST_7D_25_11_2022,ARGENTINA,0.36
187,FORECAST_7D_29_12_2022,ARGENTINA,0.52
221,FORECAST_7D_30_06_2023,ARGENTINA,0.46


In [126]:
pd.set_option('display.float_format', '{:.2f}'.format)
df_first_grouped[df_first_grouped['country'] == 'MEXICO']

Unnamed: 0,folder_id,country,pe
24,FORECAST_7D_05_05_2023,MEXICO,0.15
69,FORECAST_7D_10_09_2023,MEXICO,0.29
105,FORECAST_7D_20_03_2023,MEXICO,0.3
139,FORECAST_7D_20_12_2022,MEXICO,0.13
173,FORECAST_7D_25_11_2022,MEXICO,0.2
207,FORECAST_7D_29_12_2022,MEXICO,0.18
250,FORECAST_7D_30_06_2023,MEXICO,0.18


In [129]:
df_mex.head()

Unnamed: 0,payer,country,date,tx,amount,coupon_count,folder_id,p10,p50,p90,mean,pe
3050,AIRPAK,MEXICO,2023-05-06,282,107151.17,3,FORECAST_7D_05_05_2023,70957.97,100672.45,130386.93,100672.45,0.06
4367,AIRPAK,MEXICO,2023-05-07,396,160340.3,6,FORECAST_7D_05_05_2023,80323.75,114335.59,148347.43,114335.59,0.29
1535,AIRPAK,MEXICO,2023-05-08,412,130867.52,3,FORECAST_7D_05_05_2023,62449.74,89185.2,115920.67,89185.2,0.32
3632,AIRPAK,MEXICO,2023-05-09,396,122977.91,4,FORECAST_7D_05_05_2023,49373.96,70742.7,92111.44,70742.7,0.42
820,AIRPAK,MEXICO,2023-05-10,318,85153.38,7,FORECAST_7D_05_05_2023,46803.01,67278.42,87753.84,67278.42,0.21


In [132]:
df_mex_7d=df_forecast.loc[df_forecast.country=='MEXICO']
df_mex_7d.head()

Unnamed: 0,payer,country,date,tx,amount,coupon_count,folder_id,p10,p50,p90,mean,pe
3050,AIRPAK,MEXICO,2023-05-06,282,107151.17,3,FORECAST_7D_05_05_2023,70957.97,100672.45,130386.93,100672.45,0.06
4367,AIRPAK,MEXICO,2023-05-07,396,160340.3,6,FORECAST_7D_05_05_2023,80323.75,114335.59,148347.43,114335.59,0.29
1535,AIRPAK,MEXICO,2023-05-08,412,130867.52,3,FORECAST_7D_05_05_2023,62449.74,89185.2,115920.67,89185.2,0.32
3632,AIRPAK,MEXICO,2023-05-09,396,122977.91,4,FORECAST_7D_05_05_2023,49373.96,70742.7,92111.44,70742.7,0.42
820,AIRPAK,MEXICO,2023-05-10,318,85153.38,7,FORECAST_7D_05_05_2023,46803.01,67278.42,87753.84,67278.42,0.21


In [138]:
results_mex = df_mex_7d.groupby(['country', 'date']).agg({'amount': 'sum', 'mean': 'sum'}).reset_index()


In [139]:
results_mex.head()

Unnamed: 0,country,date,amount,mean
0,MEXICO,2022-11-26,16392615.77,19134014.07
1,MEXICO,2022-11-27,15367715.39,18280479.36
2,MEXICO,2022-11-28,14468737.03,15052783.7
3,MEXICO,2022-11-29,12165362.6,12484108.97
4,MEXICO,2022-11-30,11639041.43,11576455.73


In [149]:
results_mex['date'] = pd.to_datetime(results_mex['date'])

In [150]:
results_mex['forecast_error']= results_mex.amount - results_mex['mean']


In [161]:
results_mex.head(2)

Unnamed: 0,country,date,amount,mean,forecast_error,abs_forecast_error
0,MEXICO,2022-11-26,16392615.77,19134014.07,-2741398.31,2741398.31
1,MEXICO,2022-11-27,15367715.39,18280479.36,-2912763.97,2912763.97


In [155]:
results_mex['abs_forecast_error'] = results_mex['forecast_error'].abs()

In [160]:
results_mex.head(15)

Unnamed: 0,country,date,amount,mean,forecast_error,abs_forecast_error
0,MEXICO,2022-11-26,16392615.77,19134014.07,-2741398.31,2741398.31
1,MEXICO,2022-11-27,15367715.39,18280479.36,-2912763.97,2912763.97
2,MEXICO,2022-11-28,14468737.03,15052783.7,-584046.67,584046.67
3,MEXICO,2022-11-29,12165362.6,12484108.97,-318746.37,318746.37
4,MEXICO,2022-11-30,11639041.43,11576455.73,62585.69,62585.69
5,MEXICO,2022-12-01,11656846.26,11953925.2,-297078.94,297078.94
6,MEXICO,2022-12-02,17738682.7,17442805.64,295877.06,295877.06
7,MEXICO,2022-12-21,14385666.55,13802305.4,583361.15,583361.15
8,MEXICO,2022-12-22,16016125.68,14175883.17,1840242.51,1840242.51
9,MEXICO,2022-12-23,22185561.56,19998555.54,2187006.02,2187006.02


In [158]:
results_mex.abs_forecast_error.mean()

2424944.9285719595

In [159]:
results_mex.amount.mean()

15958012.678420408

In [162]:
results_mex.abs_forecast_error.sum()

118822301.50002602

In [163]:
results_mex.amount.sum()

781942621.2426

In [164]:
print(results_mex.abs_forecast_error.sum()/results_mex.amount.sum())

0.15195782691983617


In [None]:
###SIN DATOS DE 2022##

In [168]:
results_mex_2023=results_mex.loc[results_mex.date>='2023-01-01']

In [169]:
results_mex_2023.head()

Unnamed: 0,country,date,amount,mean,forecast_error,abs_forecast_error
16,MEXICO,2023-01-01,4004948.21,12035763.95,-8030815.74,8030815.74
17,MEXICO,2023-01-02,15195059.45,13840919.03,1354140.42,1354140.42
18,MEXICO,2023-01-03,12807895.02,12005806.89,802088.13,802088.13
19,MEXICO,2023-01-04,11630571.28,11618820.64,11750.64,11750.64
20,MEXICO,2023-01-05,10658682.47,12049546.5,-1390864.03,1390864.03


In [170]:
results_mex_2023.abs_forecast_error.mean()

2470668.787897782

In [171]:
results_mex_2023.amount.mean()

16791953.8284

In [None]:
results_mex_2023.abs_forecast_error.sum()

In [172]:
results_mex_2023.amount.sum()

554134476.3372

In [173]:
print(results_mex_2023.abs_forecast_error.sum()/results_mex_2023.amount.sum())

0.1471340865479974
