In [2]:
import boto3
import pandas as pd
from io import BytesIO

In [3]:
# Config cliente de S3
s3 = boto3.client('s3')

In [4]:
# Bbucket y prefijo de carpetas
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
prefix_ABTv2 = 'ABTv2/'
prefix_v1 = 'prueba_sk_forecast/'
#root_list = ['prueba_sk_forecast/', 'ABTv2/']

In [5]:
# Obtengo lista de carpetas en el bucket
def list_folders(bucket_name, prefix):
    response = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
    folders = [prefix['Prefix'].split('/')[-2] for prefix in response.get('CommonPrefixes', [])]
    return folders

In [6]:
# Llamo funcion para listar carpetas
folders_v2 = list_folders(bucket_name, prefix_ABTv2)

In [7]:
# Llamo funcion para listar carpetas
folders_v1 = list_folders(bucket_name, prefix_v1)

In [11]:
# Leo los xlsx y consolido

def read_files(bucket_name, prefix, folders):
    dfs = []
    
    # Iterar sobre cada carpeta
    for folder_name in folders:
        # Obtener la lista de objetos en la carpeta
        objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=f'{prefix}{folder_name}/')['Contents']
        
        # Buscar archivos xlsx en la carpeta
        excel_objects = [obj for obj in objects if obj['Key'].endswith('.xlsx')]
        
        if excel_objects:
#            print(excel_objects)
            # Leer el primer archivo xlsx encontrado
            obj = s3.get_object(Bucket=bucket_name, Key=excel_objects[0]['Key'])
            excel_data = obj['Body'].read() # No se puede leer con pandas directo de S3
            
            df = pd.read_excel(BytesIO(excel_data))
            # Agrego columna con el nombre de la carpeta
            df['folder_name'] = folder_name
            # Agregar el df a la lista
            dfs.append(df)
        else:
            print(f"No se encontraron archivos xlsx en la carpeta {folder_name}.")
    
    if dfs:
        # Concateno todo
        consolidated_df = pd.concat(dfs, ignore_index=True)
        return consolidated_df
    else:
        print("No se encontraron archivos xlsx en ninguna de las carpetas especificadas.")
        return None

In [12]:
# Llamo funcion para leer xlsx y consolidar en un unico DF
df_v2 = read_files(bucket_name, prefix_ABTv2, folders_v2)

No se encontraron archivos xlsx en la carpeta FEDECREDITO (RYT)_EL SALVADOR.
No se encontraron archivos xlsx en la carpeta TOP 10 PAYERS (ENERO 2021-OCT 2023).
No se encontraron archivos xlsx en la carpeta jerarquico.


In [13]:
df_v1 = read_files(bucket_name, prefix_v1, folders_v1)

No se encontraron archivos xlsx en la carpeta AFRO INTERNACIONAL_SIERRA LEONE.
No se encontraron archivos xlsx en la carpeta BANCO PICHINCHA (TN)_ECUADOR.
No se encontraron archivos xlsx en la carpeta MASTERCARD ATM NETWORK_UNITED STATES (ATM & DEPOSITS).
No se encontraron archivos xlsx en la carpeta TELEDOLAR-NIC (TN)_NICARAGUA.
No se encontraron archivos xlsx en la carpeta UNITRANSFER_HAITI.
No se encontraron archivos xlsx en la carpeta jerarquico.


In [14]:
df_merged = pd.merge(df_v1, df_v2, on=['date','folder_name', 'valor_real'], how='outer', suffixes=('_v1', '_v2'))

In [19]:
df_merged[df_merged['folder_name'] == 'WALMART (UT)_MEXICO'].tail(20) # Revisar por que no trae algunos valores

Unnamed: 0,date,valor_real,valor_predicho_v1,mape_v1,error_abs_v1,folder_name,valor_predicho_v2,mape_v2,error_abs_v2
15190,2023-08-29,283785.23,,,,WALMART (UT)_MEXICO,297066.820645,4.680156,13281.590645
15191,2023-09-02,656004.54,,,,WALMART (UT)_MEXICO,648752.341396,1.10551,7252.198604
15192,2023-09-07,357475.0,,,,WALMART (UT)_MEXICO,382581.587235,7.023313,25106.587235
15193,2023-10-01,447440.45,,,,WALMART (UT)_MEXICO,494629.104065,10.546354,47188.654065
15194,2023-10-03,352144.74,,,,WALMART (UT)_MEXICO,294924.159897,16.249165,57220.580103
15195,2023-10-04,341449.25,,,,WALMART (UT)_MEXICO,319080.633851,6.551081,22368.616149
15196,2023-10-06,604532.85,,,,WALMART (UT)_MEXICO,586342.551187,3.008984,18190.298813
15197,2023-10-07,597692.7,,,,WALMART (UT)_MEXICO,627733.816196,5.026181,30041.116196
15198,2023-10-08,436568.94,,,,WALMART (UT)_MEXICO,490118.775137,12.266066,53549.835137
15199,2023-10-09,330574.13,,,,WALMART (UT)_MEXICO,364419.100758,10.238239,33844.970758


In [21]:
df_merged.rename(columns={'folder_name':'payer_country'}, inplace=True)

In [27]:
# Agrupar por payer y calcular el promedio de MAPE
df_payer_v1 = df_merged.groupby('payer_country')['mape_v1'].mean().reset_index()
df_payer_v2 = df_merged.groupby('payer_country')['mape_v2'].mean().reset_index()

In [29]:
df_payer_full = pd.merge(df_payer_v1, df_payer_v2, on='payer_country')

In [31]:
df_payer_full.dropna(subset=['mape_v1', 'mape_v2']) # ACA PARA VER SOLO LOS QUE TIENEN VALORES EN AMBOS

Unnamed: 0,payer_country,mape_v1,mape_v2
9,BANCO AGRICOLA_EL SALVADOR,7.552671,7.317878
10,BANCO ATLANTIDA_HONDURAS,8.616765,8.473772
17,BANCO DE OCCIDENTE_HONDURAS,9.259279,9.895282
18,BANCO DE ORO (BDO)_PHILIPPINES,21.448455,15.727868
22,BANCO INDUSTRIAL_GUATEMALA,9.34231,9.143188
23,BANCO RENDIMENTO_BRAZIL,18.61089,19.017267
24,BANCOLOMBIA_COLOMBIA,14.149273,12.729791
25,BANCOPPEL (APPRIZA)_MEXICO,8.71575,8.483816
28,BANORTE (UT)_MEXICO,10.396381,10.562635
32,BANRURAL (RYT)_GUATEMALA,8.007167,7.975607


#### By Country

In [34]:
df_merged['country'] = df_merged['payer_country'].apply(lambda x: x.split('_')[-1])
df_merged['date']=pd.to_datetime(df_merged['date'])

In [35]:
df_merged.head()

Unnamed: 0,date,valor_real,valor_predicho_v1,mape_v1,error_abs_v1,payer_country,valor_predicho_v2,mape_v2,error_abs_v2,country
0,2023-06-22,40879.68,37418.168826,8.467559,3461.511174,24XORO_MEXICO,,,,MEXICO
1,2023-06-23,45534.13,33711.662003,25.96397,11822.467997,24XORO_MEXICO,,,,MEXICO
2,2023-06-24,36969.46,43115.96231,16.625892,6146.50231,24XORO_MEXICO,,,,MEXICO
3,2023-06-25,59696.11,57171.042589,4.229869,2525.067411,24XORO_MEXICO,,,,MEXICO
4,2023-06-26,36271.54,44938.698015,23.895203,8667.158015,24XORO_MEXICO,,,,MEXICO


In [36]:
df_country=df_merged.groupby(['country','date']).agg({'valor_real': 'sum', 
                                                      'valor_predicho_v1': 'sum',
                                                      'valor_predicho_v2':'sum' }).reset_index()

In [38]:
###DESVIO ABSOLUTO POR DIA##
df_country['error_abs_v1']= abs(df_country.valor_predicho_v1 - df_country.valor_real)
df_country['error_abs_v2']= abs(df_country.valor_predicho_v2 - df_country.valor_real)
###MAPE DIA COUNTRY##
df_country['mape_v1']= df_country.error_abs_v1/df_country.valor_real
df_country['mape_v2']= df_country.error_abs_v2/df_country.valor_real

In [39]:
promedio_mape_por_country_v1 = df_country.groupby('country')['mape_v1'].mean().reset_index()
promedio_mape_por_country_v2 = df_country.groupby('country')['mape_v2'].mean().reset_index()

In [40]:
df_country_full = pd.merge(promedio_mape_por_country_v1, promedio_mape_por_country_v2, on='country')

In [42]:
df_country_full.dropna(subset=['mape_v1', 'mape_v2'])

Unnamed: 0,country,mape_v1,mape_v2
0,BANGLADESH,inf,1.0
1,BELGIUM,inf,1.0
2,BENIN,inf,1.0
3,BOLIVIA,0.330126,1.0
4,BRAZIL,0.168769,0.788449
5,BURKINA FASO,inf,1.0
6,CAMEROON,inf,1.0
7,CHILE,0.500514,1.0
8,COLOMBIA,0.254719,0.459272
9,COSTA RICA,0.177238,1.0


In [43]:
df_viam = pd.read_excel('Book7 (1).xlsx')

In [44]:
df_viam['date'] = pd.to_datetime(df_viam['Date'], errors='coerce')
df_viam.dropna(subset='Date', inplace=True)
df_viam.drop('Date', axis=1, inplace=True)

In [45]:
# Voy a quitar los registros de los dias sabados y domingos
df_viam['day_of_week'] = df_viam['date'].dt.dayofweek
df_viam = df_viam[~df_viam['day_of_week'].isin([5, 6])]

In [46]:
# Genero un df con los datos diarios de MEXICO para comparar
df_mex= df_country.loc[df_country.country=="MEXICO"]
df_mex = df_mex.sort_values('date').reset_index(drop=True)
df_mex.drop('country', axis=1, inplace=True)

In [47]:
df_mex['day_of_week'] = df_mex['date'].dt.dayofweek

In [50]:
df_mex

Unnamed: 0,date,valor_real,valor_predicho_v1,valor_predicho_v2,error_abs_v1,error_abs_v2,mape_v1,mape_v2,day_of_week
0,2023-06-22,2.157071e+07,1.423297e+07,1.331009e+07,7.337744e+06,8.260617e+06,0.340172,0.382955,3
1,2023-06-23,2.236626e+07,2.160235e+07,2.074806e+07,7.639038e+05,1.618195e+06,0.034154,0.072350,4
2,2023-06-24,3.472768e+07,2.449457e+07,2.297985e+07,1.023311e+07,1.174783e+07,0.294667,0.338284,5
3,2023-06-25,3.799082e+07,2.257500e+07,2.122194e+07,1.541582e+07,1.676888e+07,0.405777,0.441393,6
4,2023-06-26,2.916572e+07,1.724518e+07,1.679584e+07,1.192054e+07,1.236988e+07,0.408718,0.424124,0
...,...,...,...,...,...,...,...,...,...
117,2023-10-17,2.683396e+07,1.441947e+07,1.529698e+07,1.241448e+07,1.153697e+07,0.462641,0.429939,1
118,2023-10-18,3.065460e+07,1.256923e+07,1.228065e+07,1.808537e+07,1.837395e+07,0.589972,0.599387,2
119,2023-10-19,3.244510e+07,1.495051e+07,1.469414e+07,1.749459e+07,1.775096e+07,0.539206,0.547108,3
120,2023-10-20,5.021071e+07,2.368869e+07,2.288284e+07,2.652202e+07,2.732787e+07,0.528214,0.544264,4


In [51]:
import numpy as np

def sumar_valores_fines_de_semana(row):
    # Verificar si el día es viernes
    if row['day_of_week'] == 4:
        # Obtener el índice de la fila actual
        current_index = row.name
        
        if current_index + 2 < len(df_mex):
        # Sumar los valores de 'valor_real' y 'valor_predicho' de la fila actual,
        # la siguiente fila (sábado) y la fila después de la siguiente (domingo)
            row['valor_real'] += df_mex.loc[current_index + 1, 'valor_real'] + df_mex.loc[current_index + 2, 'valor_real']
            row['valor_predicho_v1'] += df_mex.loc[current_index + 1, 'valor_predicho_v1'] + df_mex.loc[current_index + 2, 'valor_predicho_v1']
            row['valor_predicho_v2'] += df_mex.loc[current_index + 1, 'valor_predicho_v2'] + df_mex.loc[current_index + 2, 'valor_predicho_v2']
    return row

In [52]:
df_mex = df_mex.apply(sumar_valores_fines_de_semana, axis=1)

In [53]:
df_mex = df_mex[~df_mex['day_of_week'].isin([5, 6])]

In [55]:
df_compare = pd.merge(df_viam, df_mex, on=['date', 'day_of_week'], how='inner')

In [56]:
###DESVIO ABSOLUTO POR DIA##
df_compare['error_abs_v1']= abs(df_compare.valor_predicho_v1 - df_compare.valor_real)
df_compare['error_abs_v2']= abs(df_compare.valor_predicho_v2 - df_compare.valor_real)

###MAPE DIA COUNTRY##
df_compare['mape_v1']= df_compare.error_abs_v1/df_compare.valor_real
df_compare['mape_v2']= df_compare.error_abs_v2/df_compare.valor_real

In [57]:
###DESVIO ABSOLUTO POR DIA##
df_compare['error_abs_viam']= abs(df_compare['Final Estimate'] - df_compare.valor_real)

###MAPE DIA COUNTRY##
df_compare['mape_viam']= df_compare.error_abs_viam/df_compare.valor_real

In [58]:
df_compare

Unnamed: 0,Estimate,Final Estimate,Vol Sold,date,day_of_week,valor_real,valor_predicho_v1,valor_predicho_v2,error_abs_v1,error_abs_v2,mape_v1,mape_v2,error_abs_viam,mape_viam
0,13500000.0,13500000.0,15076661.88,2023-06-22,3.0,2.157071e+07,1.423297e+07,1.331009e+07,7.337744e+06,8.260617e+06,0.340172,0.382955,8.070711e+06,0.374151
1,71978850.0,71978850.0,67556932.92,2023-06-23,4.0,9.508476e+07,6.867192e+07,6.494985e+07,2.641283e+07,3.013491e+07,0.277782,0.316927,2.310591e+07,0.243003
2,15830400.0,15830400.0,16833570.69,2023-06-26,0.0,2.916572e+07,1.724518e+07,1.679584e+07,1.192054e+07,1.236988e+07,0.408718,0.424124,1.333532e+07,0.457226
3,12985875.0,12985875.0,12991152.96,2023-06-27,1.0,1.300551e+07,1.463971e+07,1.398579e+07,1.634197e+06,9.802792e+05,0.125654,0.075374,1.963938e+04,0.001510
4,12500000.0,12500000.0,11500048.56,2023-06-28,2.0,1.144005e+07,1.226122e+07,1.191130e+07,8.211693e+05,4.712524e+05,0.071780,0.041193,1.059950e+06,0.092653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,17000000.0,17000000.0,16686425.97,2023-10-16,0.0,3.237077e+07,1.811846e+07,1.828333e+07,1.425231e+07,1.408744e+07,0.440283,0.435190,1.537077e+07,0.474835
83,13000000.0,13000000.0,13531110.03,2023-10-17,1.0,2.683396e+07,1.441947e+07,1.529698e+07,1.241448e+07,1.153697e+07,0.462641,0.429939,1.383396e+07,0.515539
84,13080000.0,13080000.0,15593882.79,2023-10-18,2.0,3.065460e+07,1.256923e+07,1.228065e+07,1.808537e+07,1.837395e+07,0.589972,0.599387,1.757460e+07,0.573310
85,16000000.0,16000000.0,16500528.70,2023-10-19,3.0,3.244510e+07,1.495051e+07,1.469414e+07,1.749459e+07,1.775096e+07,0.539206,0.547108,1.644510e+07,0.506859
