In [2]:
import boto3

In [3]:
import pandas as pd
from io import StringIO

In [4]:
# Create the S3 client used below to list and download the forecast CSV files.
# No region or credentials are passed explicitly here.
client = boto3.client('s3')

In [5]:
# Forecast folders (relative to `common_path`) to load; each name is also written to the `folder_id` column.
folders_list =  ['level_country_10_09_2023']

In [6]:
# S3 bucket that holds the forecast outputs, and the key prefix shared by every forecast folder.
# The per-folder prefix is built as f'{common_path}{folder}/'.
bucket_forecast = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
common_path = 'Forecast/Forecast_analysis/'

<a data-toggle="collapse" href="#collapseExample" role="button" aria-expanded="false" aria-controls="collapseExample">
  <span style="color:green; font-size:2em; font-weight:bold;">Generating DF</span>
</a>

In [7]:
# Accumulator for the per-file DataFrames read from S3; concatenated into `final_df` afterwards.
dfs = []

In [8]:
# Load every CSV found under each forecast folder into `dfs`, tagging rows with their folder name.

# Reset the accumulator so re-running this cell does not append the same files a second time.
dfs = []

# A single list call returns at most 1,000 keys; paginate so large folders are not silently truncated.
paginator = client.get_paginator('list_objects_v2')

for folder_item in folders_list:
    # Key prefix of the current forecast folder
    path = f'{common_path}{folder_item}/'

    for page in paginator.paginate(Bucket=bucket_forecast, Prefix=path):
        # 'Contents' is absent when a page holds no objects
        for obj in page.get('Contents', []):
            # Only CSV objects are forecast outputs; skip everything else
            if not obj['Key'].endswith('.csv'):
                continue

            # Download the object and decode it as UTF-8 text
            response = client.get_object(Bucket=bucket_forecast, Key=obj['Key'])
            csv_content = response['Body'].read().decode('utf-8')

            # Parse the CSV text into a DataFrame
            df = pd.read_csv(StringIO(csv_content))

            # 'folder_id' column identifies which forecast folder the rows came from
            df['folder_id'] = folder_item

            dfs.append(df)

In [9]:
# Concatenate all per-file DataFrames into a single one with a fresh 0..n-1 index.
# pd.concat raises ValueError if `dfs` is empty (i.e. no CSV was found).
final_df = pd.concat(dfs, ignore_index=True)

In [10]:
# Sanity check: distinct forecast folders present in the combined frame.
final_df['folder_id'].unique()

array(['level_country_10_09_2023'], dtype=object)

In [11]:
# Parse the `date` column and keep only the calendar date (datetime.date objects, no time part).
parsed_forecast_dates = pd.to_datetime(final_df['date'])
final_df['date'] = parsed_forecast_dates.dt.date

In [12]:
# The forecast files call the series key `item_id`; expose it as `country`.
final_df = final_df.rename(columns={'item_id': 'country'})

In [13]:
# Upper-case every string cell; non-string values (numbers, dates) pass through unchanged.
# The country filter that follows compares against the upper-case literal 'MEXICO'.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use `map` when the installed pandas provides it and fall back to `applymap` otherwise.
_elementwise = final_df.map if hasattr(final_df, 'map') else final_df.applymap
final_df = _elementwise(lambda x: x.upper() if isinstance(x, str) else x)

  final_df = final_df.applymap(lambda x: x.upper() if isinstance(x, str) else x)


In [14]:
# Keep only the Mexico rows and the columns needed for the accuracy analysis.
forecast_columns = ['folder_id', 'date', 'country', 'p10', 'p50', 'p90', 'mean']
is_mexico = final_df['country'] == 'MEXICO'
final_df = final_df.loc[is_mexico, forecast_columns]

In [15]:
# Number of distinct forecast dates in the frame.
final_df['date'].nunique()

7

<div class="collapse" id="collapseExample">

In [16]:
# Display the forecast frame.
final_df

Unnamed: 0,folder_id,date,country,p10,p50,p90,mean
80,LEVEL_COUNTRY_10_09_2023,2023-09-11,MEXICO,18202400.0,21408620.0,24614830.0,21408620.0
81,LEVEL_COUNTRY_10_09_2023,2023-09-12,MEXICO,14653100.0,17330280.0,20007460.0,17330280.0
218,LEVEL_COUNTRY_10_09_2023,2023-09-11,MEXICO,17972580.0,19857970.0,21445310.0,19734290.0
219,LEVEL_COUNTRY_10_09_2023,2023-09-12,MEXICO,14772720.0,16132290.0,17629520.0,16086810.0
220,LEVEL_COUNTRY_10_09_2023,2023-09-13,MEXICO,13836150.0,14990700.0,16621200.0,15097820.0
221,LEVEL_COUNTRY_10_09_2023,2023-09-14,MEXICO,15637610.0,16862650.0,18724290.0,16942430.0
222,LEVEL_COUNTRY_10_09_2023,2023-09-15,MEXICO,23547390.0,25324260.0,27435170.0,25336730.0
223,LEVEL_COUNTRY_10_09_2023,2023-09-16,MEXICO,25199710.0,27278920.0,29663220.0,27412840.0
224,LEVEL_COUNTRY_10_09_2023,2023-09-17,MEXICO,23965270.0,26159270.0,28612820.0,26294430.0


In [17]:
# Athena settings.
# `bucket_name` is an S3 URI (with trailing slash) used to build the query-results staging location;
# the other three values form the fully qualified table name <origin>.<database>.<table>.
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check'

In [21]:
#pip install pyathena

In [22]:
from pyathena import connect

# Open the Athena connection; query results are staged under `queryresults/` in `bucket_name`.
athena_staging_dir = f'{bucket_name}queryresults/'
conn = connect(s3_staging_dir=athena_staging_dir, region_name='us-east-1')

# Read the entire daily_check table into a DataFrame.
daily_check_query = f'SELECT * FROM {origin_name}.{database_name}.{table_name};'
df_daily_check = pd.read_sql(daily_check_query, conn)

  df_daily_check = pd.read_sql(f'SELECT * FROM {origin_name}.{database_name}.{table_name};', conn)


In [23]:
# Parse `date` and keep only the calendar date (datetime.date objects), as done for the forecast frame.
parsed_daily_dates = pd.to_datetime(df_daily_check['date'])
df_daily_check['date'] = parsed_daily_dates.dt.date

In [24]:
# Preview the first rows of the Athena extract.
df_daily_check.head()

Unnamed: 0,payer,country,date,tx,amount,coupon_count
0,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,2023-04-24,977,536661.67,22
1,ELEKTRA (BTS),GUATEMALA,2021-02-27,157,24916.09,121
2,OXXO (TN),MEXICO,2021-10-05,210,15696.39,37
3,TRANSFER DIRECTO,MEXICO,2023-08-12,123,80301.68,2
4,BANCO DE GUAYAQUIL,ECUADOR,2021-12-31,58,15688.22,2


In [25]:
# Collapse the per-payer rows into one row per (date, country), summing the three measures.
grouped_daily = df_daily_check.groupby(['date', 'country'], as_index=False).agg(
    amount=('amount', 'sum'),
    tx=('tx', 'sum'),
    coupon_count=('coupon_count', 'sum'),
)

In [26]:
# Sanity check: total amount across every date and country in the aggregated frame.
grouped_daily['amount'].sum()

25110624454.414898

In [27]:
# Join the daily actuals with the forecast rows on (date, country).
# The inner join keeps only keys present in both frames; fully identical rows are then dropped.
# NOTE(review): the forecast frame displayed earlier holds two rows for some dates, so those dates
# appear twice here with different forecast values - confirm this is intended before averaging errors.
merge_keys = ['date', 'country']
df_forecast = grouped_daily.merge(final_df, how='inner', on=merge_keys).drop_duplicates()

In [28]:
# Number of distinct dates that have both an actual and a forecast.
df_forecast['date'].nunique()

7

In [29]:
# Display the merged actual-vs-forecast frame.
df_forecast

Unnamed: 0,date,country,amount,tx,coupon_count,folder_id,p10,p50,p90,mean
0,2023-09-11,MEXICO,17073946.44,34597,593,LEVEL_COUNTRY_10_09_2023,18202400.0,21408620.0,24614830.0,21408620.0
1,2023-09-11,MEXICO,17073946.44,34597,593,LEVEL_COUNTRY_10_09_2023,17972580.0,19857970.0,21445310.0,19734290.0
2,2023-09-12,MEXICO,13249850.38,26801,424,LEVEL_COUNTRY_10_09_2023,14653100.0,17330280.0,20007460.0,17330280.0
3,2023-09-12,MEXICO,13249850.38,26801,424,LEVEL_COUNTRY_10_09_2023,14772720.0,16132290.0,17629520.0,16086810.0
4,2023-09-13,MEXICO,12225455.26,24402,397,LEVEL_COUNTRY_10_09_2023,13836150.0,14990700.0,16621200.0,15097820.0
5,2023-09-14,MEXICO,13377976.16,26891,474,LEVEL_COUNTRY_10_09_2023,15637610.0,16862650.0,18724290.0,16942430.0
6,2023-09-15,MEXICO,21357505.72,46491,891,LEVEL_COUNTRY_10_09_2023,23547390.0,25324260.0,27435170.0,25336730.0
7,2023-09-16,MEXICO,23379643.53,52205,976,LEVEL_COUNTRY_10_09_2023,25199710.0,27278920.0,29663220.0,27412840.0
8,2023-09-17,MEXICO,24184710.24,52207,850,LEVEL_COUNTRY_10_09_2023,23965270.0,26159270.0,28612820.0,26294430.0


In [30]:
# Absolute percentage error of each forecast statistic against the actual amount:
#   |amount - forecast| / amount
actual_amount = df_forecast['amount']
for error_column, forecast_column in (('pe_mean', 'mean'), ('pe_P10', 'p10'), ('pe_P50', 'p50')):
    relative_error = (actual_amount - df_forecast[forecast_column]) / actual_amount
    df_forecast[error_column] = relative_error.abs()

In [31]:
# Smallest and largest absolute percentage error of the mean forecast.
df_forecast.pe_mean.min(), df_forecast.pe_mean.max()

(0.08723378279350441, 0.307960271392282)

In [32]:
# Display the merged frame including the percentage-error columns.
df_forecast

Unnamed: 0,date,country,amount,tx,coupon_count,folder_id,p10,p50,p90,mean,pe_mean,pe_P10,pe_P50
0,2023-09-11,MEXICO,17073946.44,34597,593,LEVEL_COUNTRY_10_09_2023,18202400.0,21408620.0,24614830.0,21408620.0,0.253876,0.066092,0.253876
1,2023-09-11,MEXICO,17073946.44,34597,593,LEVEL_COUNTRY_10_09_2023,17972580.0,19857970.0,21445310.0,19734290.0,0.155813,0.052632,0.163057
2,2023-09-12,MEXICO,13249850.38,26801,424,LEVEL_COUNTRY_10_09_2023,14653100.0,17330280.0,20007460.0,17330280.0,0.30796,0.105907,0.30796
3,2023-09-12,MEXICO,13249850.38,26801,424,LEVEL_COUNTRY_10_09_2023,14772720.0,16132290.0,17629520.0,16086810.0,0.214112,0.114935,0.217545
4,2023-09-13,MEXICO,12225455.26,24402,397,LEVEL_COUNTRY_10_09_2023,13836150.0,14990700.0,16621200.0,15097820.0,0.23495,0.13175,0.226187
5,2023-09-14,MEXICO,13377976.16,26891,474,LEVEL_COUNTRY_10_09_2023,15637610.0,16862650.0,18724290.0,16942430.0,0.266442,0.168907,0.260478
6,2023-09-15,MEXICO,21357505.72,46491,891,LEVEL_COUNTRY_10_09_2023,23547390.0,25324260.0,27435170.0,25336730.0,0.186315,0.102534,0.185731
7,2023-09-16,MEXICO,23379643.53,52205,976,LEVEL_COUNTRY_10_09_2023,25199710.0,27278920.0,29663220.0,27412840.0,0.172509,0.077848,0.166781
8,2023-09-17,MEXICO,24184710.24,52207,850,LEVEL_COUNTRY_10_09_2023,23965270.0,26159270.0,28612820.0,26294430.0,0.087234,0.009074,0.081645


In [33]:
# OVERALL ACCURACY
# Mean absolute percentage error over all rows for the mean, p10 and p50 forecasts (lower is better).
df_forecast['pe_mean'].mean(), df_forecast['pe_P10'].mean(), df_forecast['pe_P50'].mean()

(0.20880135162364455, 0.09218653358508652, 0.2070289921933232)

In [34]:
# Forecast Precision
# Average width of the p10-p90 interval over all rows (narrower interval = more precise forecast).
(df_forecast['p90'] - df_forecast['p10']).mean()

4107432.8427375373

In [35]:
# Accuracy and precision for a single (date, country) row: the row labelled 0 in df_forecast.
specific_date_country = df_forecast.loc[0]

# Each value below is a scalar taken from that row; `.mean()` on it returns the same number.
(
    specific_date_country_accuracy_mean,
    specific_date_country_accuracy_P10,
    specific_date_country_accuracy_P50,
) = (specific_date_country[error_column].mean() for error_column in ('pe_mean', 'pe_P10', 'pe_P50'))

# Width of the p10-p90 interval for this row.
interval_width = specific_date_country['p90'] - specific_date_country['p10']
specific_date_country_precision = interval_width.mean()
specific_date_country_precision

6412429.7432798445

In [36]:
import sagemaker
from sagemaker import get_execution_role

# Set up SageMaker session and role.
# The execution role ARN is hard-coded here; `get_execution_role` is imported above but not called.
sagemaker_session = sagemaker.Session()
role = 'arn:aws:iam::283731589572:role/service-role/AmazonSageMaker-ExecutionRole-20231127T122316'

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


In [40]:
#pip install awswrangler

In [41]:
#pip install awswrangler[redshift]

In [42]:
import os

import awswrangler as wr

# Redshift target settings
database = 'viamericas'
# NOTE(review): this value has the same form as the S3 bucket name used earlier - confirm it is the real cluster identifier.
cluster_id = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
sql_table = 'forecast_mexico'
# Kept for reference: neither connect_temp nor to_sql takes an IAM role argument.
redshift_role_arn = 'arn:aws:iam::283731589572:role/redshift-role'

# wr.redshift.connect only accepts a Glue connection name or a Secrets Manager secret id,
# which is why passing cluster_identifier/iam_role failed. connect_temp is the cluster-based
# variant: it requests temporary credentials for the given database user.
# The user is read from the environment so that it is not hard-coded in the notebook;
# a missing variable raises KeyError before any connection is attempted.
redshift_user = os.environ['REDSHIFT_DB_USER']

con = wr.redshift.connect_temp(
    cluster_identifier=cluster_id,
    user=redshift_user,
    database=database,
)
try:
    # Create the table, or replace it if it already exists (mode='overwrite').
    # to_sql takes the frame as `df` and an open connection as `con`; it has no
    # dataframe/database/cluster_identifier/iam_role/if_exists parameters.
    # NOTE(review): the schema is set to the database name, as in the original call - confirm the target schema.
    wr.redshift.to_sql(
        df=df_forecast,
        con=con,
        table=sql_table,
        schema=database,
        mode='overwrite',
        index=False,
    )
finally:
    # Close the Redshift connection even if the write fails
    # (the awswrangler.redshift module itself has no close() function).
    con.close()