In [12]:
import boto3
import pandas as pd
from io import BytesIO

In [13]:
# S3 client shared by the listing / reading helpers defined below
s3 = boto3.client(service_name='s3')

In [22]:
# Bucket name and key prefix under which the per-payer folders are listed
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
prefix_ABTv3 = 'ABTv3/Last_releases_7d_8lags/'

In [23]:
# List the folder names found directly under a prefix of the bucket
def list_folders(bucket_name, prefix, s3_client=None):
    """Return the names of the "folders" located directly under ``prefix``.

    S3 has no real folders: with ``Delimiter='/'`` the listing groups keys
    by their next path segment and reports each group in ``CommonPrefixes``.
    The listing is paginated so that prefixes holding more entries than fit
    in a single response page are not silently truncated.

    Parameters
    ----------
    bucket_name : str
        Name of the bucket to list.
    prefix : str
        Key prefix to list under (expected to end with ``'/'``).
    s3_client : optional
        Client object exposing ``get_paginator``. Defaults to the
        notebook-level ``s3`` client.

    Returns
    -------
    list of str
        Last path segment of every common prefix, in listing order.
        Empty when nothing is found under ``prefix``.
    """
    client = s3 if s3_client is None else s3_client
    paginator = client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')

    folders = []
    for page in pages:
        # 'CommonPrefixes' is missing from pages that contain no grouped keys.
        for common_prefix in page.get('CommonPrefixes', []):
            # 'a/b/c/' splits into [..., 'c', ''], so the name is the second-to-last item.
            folders.append(common_prefix['Prefix'].split('/')[-2])
    return folders

In [24]:
# Collect the folder names found under the ABTv3 prefix
folders_v2 = list_folders(bucket_name=bucket_name, prefix=prefix_ABTv3)

In [31]:
# Read one xlsx per folder and consolidate everything into a single DataFrame

def read_files(bucket_name, prefix, folders=None, s3_client=None):
    """Download the first matching xlsx of every folder and stack them.

    For each folder name, the keys under ``f'{prefix}{folder_name}/'`` are
    listed; keys ending in ``.xlsx`` that do not contain ``'2d'`` are kept
    and the first one is loaded with pandas. A ``folder_name`` column is
    added to each frame before concatenation.

    Parameters
    ----------
    bucket_name : str
        Bucket holding the files.
    prefix : str
        Key prefix that contains the folders (expected to end with ``'/'``).
    folders : iterable of str, optional
        Folder names to visit. Defaults to the notebook-level ``folders_v2``
        so existing two-argument calls behave as before.
    s3_client : optional
        Client object exposing ``list_objects_v2`` and ``get_object``.
        Defaults to the notebook-level ``s3`` client.

    Returns
    -------
    pandas.DataFrame or None
        All frames concatenated with a fresh index, or ``None`` when no
        folder yielded an xlsx file.
    """
    client = s3 if s3_client is None else s3_client
    folder_names = folders_v2 if folders is None else folders

    dfs = []

    # Visit each folder
    for folder_name in folder_names:
        response = client.list_objects_v2(Bucket=bucket_name, Prefix=f'{prefix}{folder_name}/')
        # 'Contents' is absent from the response when no key matches the prefix,
        # so fall back to an empty list instead of raising KeyError.
        objects = response.get('Contents', [])

        # Keep xlsx files. NOTE: the '2d' check runs against the full key, so a
        # folder whose name contains '2d' would have all of its files excluded.
        excel_objects = [
            obj for obj in objects
            if obj['Key'].endswith('.xlsx') and '2d' not in obj['Key']
        ]

        if not excel_objects:
            print(f"No se encontraron archivos xlsx en la carpeta {folder_name}.")
            continue

        print(excel_objects)
        # Only the first xlsx found in the folder is read
        obj = client.get_object(Bucket=bucket_name, Key=excel_objects[0]['Key'])
        # Bytes are pulled into memory first (original author's note: pandas
        # could not read the file directly from S3).
        excel_data = obj['Body'].read()

        df = pd.read_excel(BytesIO(excel_data))
        # Tag every row with the folder it came from
        df['folder_name'] = folder_name
        dfs.append(df)

    if not dfs:
        print("No se encontraron archivos xlsx en ninguna de las carpetas especificadas.")
        return None

    # Stack all per-folder frames
    return pd.concat(dfs, ignore_index=True)

In [32]:
# Build the consolidated DataFrame from the xlsx found in each folder
df = read_files(bucket_name=bucket_name, prefix=prefix_ABTv3)

[{'Key': 'ABTv3/Last_releases_7d_8lags/24XORO_MEXICO/predicciones_test_7d24XORO_MEXICO.xlsx', 'LastModified': datetime.datetime(2024, 3, 14, 17, 30, 17, tzinfo=tzlocal()), 'ETag': '"85310c769ee752299e464fbcd51ae0b4"', 'Size': 15915, 'StorageClass': 'STANDARD'}]
[{'Key': 'ABTv3/Last_releases_7d_8lags/ABANK (TN)_EL SALVADOR/predicciones_test_7dABANK (TN)_EL SALVADOR.xlsx', 'LastModified': datetime.datetime(2024, 3, 14, 16, 43, 42, tzinfo=tzlocal()), 'ETag': '"96ae811dc32f6f53cced50e6c11903f4"', 'Size': 15348, 'StorageClass': 'STANDARD'}]
[{'Key': 'ABTv3/Last_releases_7d_8lags/AFEX_CHILE/predicciones_test_7dAFEX_CHILE.xlsx', 'LastModified': datetime.datetime(2024, 3, 14, 18, 33, 17, tzinfo=tzlocal()), 'ETag': '"ebccfe37f5598a53d27b9c4fc3a31c47"', 'Size': 15405, 'StorageClass': 'STANDARD'}]
[{'Key': 'ABTv3/Last_releases_7d_8lags/AFRO INTERNACIONAL_GUINEA/predicciones_test_7dAFRO INTERNACIONAL_GUINEA.xlsx', 'LastModified': datetime.datetime(2024, 3, 15, 12, 39, 52, tzinfo=tzlocal()), 'ETag'

In [33]:
## Cut the data at 2023-12-18 (inclusive) ##
# .copy() makes the filtered frame independent of the original, so the column
# assignments done in later cells do not operate on a slice
# (which raises SettingWithCopyWarning).
df = df.loc[df.date <= '2023-12-18'].copy()
df.date.max()

Timestamp('2023-12-18 00:00:00')

In [35]:
# Preview the first three rows
df.head(n=3)

Unnamed: 0,date,valor_real,valor_predicho,mape,error_abs,folder_name
0,2023-06-22,40879.68,30687.138736,24.933026,10192.541264,24XORO_MEXICO
1,2023-06-23,45534.13,25007.22153,45.080269,20526.90847,24XORO_MEXICO
2,2023-06-24,36969.46,33851.723411,8.433276,3117.736589,24XORO_MEXICO


In [100]:
# WE NEED TO COMPUTE THE AVERAGE DAILY MAPE AT TWO DAYS AND THE DAILY ABSOLUTE ERROR AT 2D
# BY PAYER AND BY COUNTRY

In [101]:
## NEGATIVE PREDICTED VALUES ARE CORRECTED ##

In [37]:
# Value substituted for every negative prediction
valor_a_reemplazar = 0
# Vectorized replacement: entries below zero take the substitute, the rest are kept
df['valor_predicho'] = df['valor_predicho'].mask(df['valor_predicho'] < 0, valor_a_reemplazar)

In [103]:
#df.to_excel('predicciones_all_payers_2d.xlsx')

In [38]:
import numpy as np
def mean_without_inf(x):
    """Mean of a Series ignoring +inf, -inf and NaN entries.

    Infinite values are first turned into NaN and then dropped together with
    any pre-existing NaN; the mean of what remains is returned (NaN when
    nothing remains).
    """
    infinities = [np.inf, -np.inf]
    finite_values = x.replace(infinities, np.nan).dropna()
    return finite_values.mean()

# Per-folder summary: inf-safe mean of mape, mean absolute error, total real amount
df_ABTv3 = (
    df.groupby('folder_name')
    .agg({'mape': mean_without_inf, 'error_abs': 'mean', 'valor_real': 'sum'})
    .reset_index()
)


In [39]:
# Give the per-payer summary its reporting column names
df_ABTv3 = df_ABTv3.rename(
    columns={
        'folder_name': 'payer_country',
        'mape': 'mape_v3',
        'error_abs': 'error_abs_mean',
        'valor_real': 'sum_amount',
    }
)

In [40]:
# Order payers from lowest to highest MAPE
df_ABTv3 = df_ABTv3.sort_values(by='mape_v3', ascending=True)

In [41]:
#df_ABTv3.to_excel('mapes_prom_diarios_f_8d_14_03_24.xlsx')

In [42]:
# Preview the first three rows of the row-level data
df.head(n=3)

Unnamed: 0,date,valor_real,valor_predicho,mape,error_abs,folder_name
0,2023-06-22,40879.68,30687.138736,24.933026,10192.541264,24XORO_MEXICO
1,2023-06-23,45534.13,25007.22153,45.080269,20526.90847,24XORO_MEXICO
2,2023-06-24,36969.46,33851.723411,8.433276,3117.736589,24XORO_MEXICO


#### By Country

In [43]:
# Country is the text after the last underscore of the folder name
df['country'] = df['folder_name'].map(lambda name: name.rpartition('_')[2])
# Make sure the date column is a datetime before grouping by it
df['date'] = pd.to_datetime(df['date'])
# Renumber the rows 0..n-1, discarding the old index
df = df.reset_index(drop=True)

In [44]:
# Preview the first five rows, now including the country column
df.head(n=5)

Unnamed: 0,date,valor_real,valor_predicho,mape,error_abs,folder_name,country
0,2023-06-22,40879.68,30687.138736,24.933026,10192.541264,24XORO_MEXICO,MEXICO
1,2023-06-23,45534.13,25007.22153,45.080269,20526.90847,24XORO_MEXICO,MEXICO
2,2023-06-24,36969.46,33851.723411,8.433276,3117.736589,24XORO_MEXICO,MEXICO
3,2023-06-25,59696.11,59939.758076,0.408147,243.648076,24XORO_MEXICO,MEXICO
4,2023-06-26,36271.54,43805.782795,20.771775,7534.242795,24XORO_MEXICO,MEXICO


In [45]:
# Daily totals of real and predicted amounts per country.
# The aggregation spec previously listed 'valor_real' twice; a dict keeps only
# one entry per key, so the duplicate was dead text and has been removed.
df_country = (
    df.groupby(['country', 'date'])
    .agg({'valor_real': 'sum', 'valor_predicho': 'sum'})
    .reset_index()
)

In [46]:
### Absolute deviation per day ###
df_country['error_abs'] = (df_country['valor_predicho'] - df_country['valor_real']).abs()

### Daily MAPE per country (as a fraction of the real amount) ###
df_country['mape'] = df_country['error_abs'] / df_country['valor_real']

In [47]:
# Per-country summary: inf-safe mean of daily mape, mean absolute error, total real amount
promedio_mape_por_country = (
    df_country.groupby('country')
    .agg({'mape': mean_without_inf, 'error_abs': 'mean', 'valor_real': 'sum'})
    .reset_index()
)

In [48]:
# The summed real amount is reported as 'sum_amount'
promedio_mape_por_country = promedio_mape_por_country.rename(
    columns={'valor_real': 'sum_amount'}
)

In [49]:
# Express mape as a percentage.
# NOTE: this overwrites the column it reads, so re-running the cell scales it again.
promedio_mape_por_country['mape'] = promedio_mape_por_country['mape'].mul(100)

In [50]:
# Show countries ordered from lowest to highest MAPE (display only, nothing is reassigned)
promedio_mape_por_country.sort_values('mape')

Unnamed: 0,country,mape,error_abs,sum_amount
14,EL SALVADOR,6.397689,92367.48,259962400.0
22,HONDURAS,7.433348,147473.6,366324400.0
12,DOMINICAN REPUBLIC,8.097179,34990.74,76806300.0
30,MEXICO,8.448203,1478754.0,3367170000.0
19,GUATEMALA,8.489455,647456.3,1469487000.0
32,NICARAGUA,10.10157,70541.32,134682600.0
13,ECUADOR,10.592713,57999.06,98958770.0
34,PERU,12.238467,17512.95,28412910.0
9,COLOMBIA,14.438955,94258.0,132534700.0
35,PHILIPPINES,14.554763,97608.15,136222400.0


In [51]:
# (rows, columns) of the per-country summary table
promedio_mape_por_country.shape

(44, 4)

In [None]:
## UP TO HERE: OUTPUT OF THE FORECAST MODELS IN BACKTESTING AT 2D ##

In [52]:
# Persist the per-country summary, ordered from lowest to highest MAPE
promedio_mape_por_country = promedio_mape_por_country.sort_values('mape')
promedio_mape_por_country.to_excel('mapes_promedio_por_country_14_03_2024_f_8d.xlsx')