In [2]:
import boto3

In [3]:
import pandas as pd
from io import StringIO

In [4]:
# Create the boto3 S3 client used to list and download the forecast files
client = boto3.client(service_name='s3')

In [5]:
# Forecast folders to load; each name is appended to common_path to build the S3 prefix
# and is also stored in the 'folder_id' column of the rows read from it.
folders_list =  ['level_country_10_09_2023']

In [6]:
# S3 bucket that is listed/read for forecast CSVs, and the key prefix shared by all forecast folders
bucket_forecast = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
common_path = 'Forecast/Forecast_analysis/'  # keep the trailing slash: folder names are appended directly

In [7]:
# Accumulator holding one DataFrame per forecast CSV read from S3;
# the frames are concatenated into a single table once loading is done.
dfs = []

In [33]:
# Load every forecast CSV found under each folder in folders_list.
# Start from an empty list on every run so that re-executing this cell
# cannot append the same files twice.
dfs = []

# A single list call returns at most 1,000 keys; the paginator follows the
# continuation tokens so large folders are read completely.
paginator = client.get_paginator('list_objects_v2')

for folder_item in folders_list:
    # S3 prefix of the current forecast folder
    path = f'{common_path}{folder_item}/'

    for page in paginator.paginate(Bucket=bucket_forecast, Prefix=path):
        # A page for an empty prefix carries no 'Contents' key
        for obj in page.get('Contents', []):
            key = obj['Key']
            if not key.endswith('.csv'):
                continue

            # Download the object and parse its text as CSV
            response = client.get_object(Bucket=bucket_forecast, Key=key)
            csv_content = response['Body'].read().decode('utf-8')
            df = pd.read_csv(StringIO(csv_content))

            # 'folder_id' column to identify the forecast the rows came from
            df['folder_id'] = folder_item

            dfs.append(df)

In [34]:
# Stack the per-file frames row-wise into one table with a fresh 0..n-1 index
final_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [35]:
final_df['folder_id'].unique()

array(['level_country_10_09_2023'], dtype=object)

In [36]:
# Parse the 'date' column and keep only the calendar date (datetime.date objects)
forecast_dates = pd.to_datetime(final_df['date'])
final_df['date'] = forecast_dates.dt.date

In [37]:
final_df

Unnamed: 0,item_id,date,p10,p50,p90,mean,folder_id
0,philippines,2023-09-11,763147.187500,901665.062500,1.017391e+06,8.923422e+05,level_country_10_09_2023
1,philippines,2023-09-12,758869.875000,909481.187500,1.069623e+06,9.004291e+05,level_country_10_09_2023
2,philippines,2023-09-13,816989.437500,934749.437500,1.106833e+06,9.426761e+05,level_country_10_09_2023
3,philippines,2023-09-14,837347.625000,956794.750000,1.131879e+06,9.639903e+05,level_country_10_09_2023
4,philippines,2023-09-15,871680.875000,997848.375000,1.143093e+06,1.000493e+06,level_country_10_09_2023
...,...,...,...,...,...,...,...
639,united states,2023-09-13,90348.882812,110202.460938,1.404780e+05,1.120975e+05,level_country_10_09_2023
640,united states,2023-09-14,81936.453125,103233.921875,1.344908e+05,1.047100e+05,level_country_10_09_2023
641,united states,2023-09-15,106647.898438,130330.734375,1.589139e+05,1.312919e+05,level_country_10_09_2023
642,united states,2023-09-16,101069.945312,123758.804688,1.486414e+05,1.257572e+05,level_country_10_09_2023


In [39]:
# The forecast's 'item_id' column is relabelled 'country'
final_df = final_df.rename(columns={'item_id': 'country'})

In [40]:
# Keep only the analysis columns, in the order used by the rest of the notebook
column_order = ['folder_id', 'date', 'country', 'p10', 'p50', 'p90', 'mean']
final_df = final_df[column_order]

In [41]:
final_df['date'].nunique()

7

In [42]:
def _upper_if_str(value):
    """Return ``value`` upper-cased when it is a str; any other object is returned unchanged."""
    return value.upper() if isinstance(value, str) else value


# Upper-case every string cell; non-string values (numbers, date objects) are left untouched.
# Series.map is applied column by column instead of DataFrame.applymap, which is
# deprecated in recent pandas releases and emits a FutureWarning.
final_df = final_df.apply(lambda column: column.map(_upper_if_str))

  final_df = final_df.applymap(lambda x: x.upper() if isinstance(x, str) else x)


In [43]:
final_df.head()

Unnamed: 0,folder_id,date,country,p10,p50,p90,mean
0,LEVEL_COUNTRY_10_09_2023,2023-09-11,PHILIPPINES,763147.1875,901665.0625,1017391.0,892342.2
1,LEVEL_COUNTRY_10_09_2023,2023-09-12,PHILIPPINES,758869.875,909481.1875,1069623.0,900429.1
2,LEVEL_COUNTRY_10_09_2023,2023-09-13,PHILIPPINES,816989.4375,934749.4375,1106833.0,942676.1
3,LEVEL_COUNTRY_10_09_2023,2023-09-14,PHILIPPINES,837347.625,956794.75,1131879.0,963990.3
4,LEVEL_COUNTRY_10_09_2023,2023-09-15,PHILIPPINES,871680.875,997848.375,1143093.0,1000493.0


In [19]:
# Athena settings
# NOTE: despite its name, bucket_name is a full S3 URI with a trailing slash;
# the query staging directory is built by appending 'queryresults/' to it.
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
# The three names below are joined as origin.database.table in the SELECT statement
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check'

In [22]:
# Dependency: pyathena (install once in the kernel environment with: %pip install pyathena)

In [23]:
from pyathena import connect

# Open the Athena connection; query results are staged under '<bucket_name>queryresults/'
athena_query = f'SELECT * FROM {origin_name}.{database_name}.{table_name};'
conn = connect(
    region_name='us-east-1',
    s3_staging_dir=f'{bucket_name}queryresults/',
)

# Pull the whole table into a DataFrame
df_daily_check = pd.read_sql(athena_query, conn)

  df_daily_check = pd.read_sql(f'SELECT * FROM {origin_name}.{database_name}.{table_name};', conn)


In [24]:
# Parse the 'date' column and keep only the calendar date (datetime.date objects),
# the same representation used for the forecast dates
daily_check_dates = pd.to_datetime(df_daily_check['date'])
df_daily_check['date'] = daily_check_dates.dt.date

In [25]:
df_daily_check.head()

Unnamed: 0,payer,country,date,tx,amount,coupon_count
0,ECUAGIROS,ECUADOR,2022-06-27,144,106780.51,3
1,BANCO BISA,BOLIVIA,2023-07-04,8,3650.0,0
2,COOPERATIVA FRONTERIZA (UT),HONDURAS,2022-07-28,29,12747.0,1
3,ABANK (TN),EL SALVADOR,2022-02-01,73,16275.0,1
4,COOPERATIVA CACIL (UT),HONDURAS,2022-08-14,29,10516.0,0


In [29]:
# Collapse the per-payer rows into one row per (date, country), summing the three measures
grouped_daily = (
    df_daily_check
    .groupby(['date', 'country'], as_index=False)
    .agg(
        amount=('amount', 'sum'),
        tx=('tx', 'sum'),
        coupon_count=('coupon_count', 'sum'),
    )
)

In [31]:
grouped_daily['amount'].sum()

25110624454.414898

In [79]:
# Inner-join actuals with forecasts on (date, country): only daily_check rows that have a
# matching forecast row survive; exact duplicate rows are then removed
df_forecast = (
    grouped_daily
    .merge(final_df, how='inner', on=['date', 'country'])
    .drop_duplicates()
)

In [80]:
df_forecast['date'].nunique()

7

In [81]:
df_forecast['country'].nunique()

45

In [83]:
# Keep only rows whose actual amount is non-zero; the percentage errors divide by 'amount'
df_forecast = df_forecast[df_forecast['amount'].ne(0)]

In [84]:
# Absolute percentage error of the 'mean' forecast relative to the actual amount
df_forecast["pe_mean"] = (
    df_forecast['amount'].sub(df_forecast['mean']).div(df_forecast['amount']).abs()
)

In [92]:
# Absolute percentage error of the 'p50' forecast relative to the actual amount
df_forecast["pe_P50"] = (
    df_forecast['amount'].sub(df_forecast['p50']).div(df_forecast['amount']).abs()
)

In [105]:
# Absolute percentage error of the 'p10' forecast relative to the actual amount
df_forecast["pe_P10"] = (
    df_forecast['amount'].sub(df_forecast['p10']).div(df_forecast['amount']).abs()
)

In [93]:
df_forecast.pe_mean.min(), df_forecast.pe_mean.max()

(0.0011624069905307145, 9.697691870517213)

In [94]:
df_forecast.pe_P50.min(), df_forecast.pe_P50.max()

(0.0009499706076898692, 9.35990030257459)

In [107]:
# Show the row(s) with the largest mean-based percentage error.
# The maximum is computed from the data rather than compared against a float literal
# pasted from an earlier output, which would silently return nothing once the data changes.
df_forecast[df_forecast['pe_mean'] == df_forecast['pe_mean'].max()]

Unnamed: 0,date,country,amount,tx,coupon_count,folder_id,p10,p50,p90,mean,pe_mean,pe_P50,pe_P10
278,2023-09-14,SENEGAL,122.0,1,0,LEVEL_COUNTRY_10_09_2023,-348.5,1263.91,2950.64,1305.12,9.7,9.36,3.86


In [96]:
# Order rows by forecast folder, then country, then date (all ascending)
df_forecast = df_forecast.sort_values(
    by=['folder_id', 'country', 'date'],
    ascending=True,
)

In [106]:
df_forecast[df_forecast['country'] == 'MEXICO']

Unnamed: 0,date,country,amount,tx,coupon_count,folder_id,p10,p50,p90,mean,pe_mean,pe_P50,pe_P10
48,2023-09-11,MEXICO,17073946.44,34597,593,LEVEL_COUNTRY_10_09_2023,17972578.0,19857966.0,21445312.0,19734292.0,0.16,0.16,0.05
124,2023-09-12,MEXICO,13249850.38,26801,424,LEVEL_COUNTRY_10_09_2023,14772725.0,16132289.0,17629524.0,16086806.0,0.21,0.22,0.11
192,2023-09-13,MEXICO,12225455.26,24402,397,LEVEL_COUNTRY_10_09_2023,13836153.0,14990697.0,16621203.0,15097825.0,0.23,0.23,0.13
264,2023-09-14,MEXICO,13377976.16,26891,474,LEVEL_COUNTRY_10_09_2023,15637608.0,16862650.0,18724292.0,16942434.0,0.27,0.26,0.17
336,2023-09-15,MEXICO,21357505.72,46491,891,LEVEL_COUNTRY_10_09_2023,23547386.0,25324260.0,27435166.0,25336734.0,0.19,0.19,0.1
408,2023-09-16,MEXICO,23379643.53,52205,976,LEVEL_COUNTRY_10_09_2023,25199712.0,27278920.0,29663218.0,27412840.0,0.17,0.17,0.08
478,2023-09-17,MEXICO,24184710.24,52207,850,LEVEL_COUNTRY_10_09_2023,23965270.0,26159272.0,28612824.0,26294434.0,0.09,0.08,0.01


In [98]:
df_forecast['country'].nunique()

45

In [101]:
# First (earliest-date) row of every (folder_id, country) pair.
# The sort is done here so that "first" does not depend on whether a separate
# sorting cell was executed beforehand.
df_first = (
    df_forecast
    .sort_values(['folder_id', 'country', 'date'])
    .drop_duplicates(subset=['folder_id', 'country'], keep='first')
)

In [61]:
# Subset of the forecast comparison restricted to Mexico
df_mex = df_forecast.loc[df_forecast['country'].eq('MEXICO')]

In [63]:
# Average first-row percentage error per forecast folder and country.
# df_first has no 'pe' column (only pe_mean / pe_P50 / pe_P10 are created), so the
# original lookup fails on a fresh run. Aggregate 'pe_mean' and expose it under the
# 'pe' name shown by the inspection cells.
# NOTE(review): assumes the mean-based error is the intended metric — confirm.
df_first_grouped = (
    df_first
    .groupby(['folder_id', 'country'], as_index=False)
    .agg(pe=('pe_mean', 'mean'))
)

In [64]:
# Render floats with two decimals, then show the aggregated error for Argentina
pd.options.display.float_format = '{:.2f}'.format
df_first_grouped.loc[df_first_grouped['country'].eq('ARGENTINA')]

Unnamed: 0,folder_id,country,pe
0,LEVEL_COUNTRY_10_09_2023,ARGENTINA,0.48


In [65]:
# Render floats with two decimals, then show the aggregated error for Mexico
pd.options.display.float_format = '{:.2f}'.format
df_first_grouped.loc[df_first_grouped['country'].eq('MEXICO')]

Unnamed: 0,folder_id,country,pe
30,LEVEL_COUNTRY_10_09_2023,MEXICO,0.16
