In [3]:
# !pip install awswrangler
# !pip install holidays

In [4]:
import awswrangler as wr
import pandas as pd
import datetime as dt
from datetime import datetime
import holidays

In [5]:
#TIMEFRAME
start_date = '2021-01-01'
end_date = '2023-12-31'

### DFs generation

In [6]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'viamericas'
table_name = 'daily_check_gp'

In [7]:
df = wr.athena.read_sql_table(
    table=table_name,
    database=database_name,
)

In [8]:
df.day.max() # Solo para chequear

'2024-02-03'

In [9]:
# Convert the 'day' column to datetime format
df['day'] = pd.to_datetime(df['day'])
# Build the 'payer_country' key ("<payer>_<country>"); the rest of the notebook groups at this level
df['payer_country'] = df['payer'] + '_' + df['country']
# Margin = gp / tx per row; rows with tx == 0 get a margin of 0 instead of dividing by zero
df['margin'] = df.apply(lambda row: row['gp'] / row['tx'] if row['tx'] != 0 else 0, axis=1)
# Cast every margin to a plain float, then round to 4 decimals
df['margin'] = df['margin'].apply(lambda x: float(x)).round(4)

In [10]:
# Specify date range
df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

In [11]:
#Filtros
df = df[df['payer'] != 'EXPIRED ORDERS']
df = df[df['amount'] != 0] # Excluding 0 (flag A & Flag C), defined in EDA

In [12]:
df['amount'].sum()

Decimal('27565805252.2325')

In [13]:
#df.isna().sum() # Reviso si hay valores nulos

In [14]:
def fill_missing_dates(df, start_date, end_date):
    """
    Insert zero-valued rows for every calendar day on which a 'payer_country' has no record.

    Each 'payer_country' is expanded to a continuous daily series that runs from its own
    first observed 'date' to its own last observed 'date' (not the global start/end).
    On the inserted rows the numeric columns are 0, 'payer'/'country' are forward-filled
    and 'day' takes the value of 'date'.

    The input frame is not modified: the work is done on a copy.

    NOTE: a later cell of this notebook defines a second function with this same name
    (used for the cancellations table); after that cell runs, this definition is shadowed.

    Args:
        df (pandas.DataFrame): Input with columns 'date', 'day', 'payer', 'country',
            'payer_country', 'amount', 'coupon_count', 'tx', 'gp' and 'margin'.
        start_date (str or datetime-like): Fallback start used only when a group's
            minimum date is missing.
        end_date (str or datetime-like): Fallback end used only when a group's
            maximum date is missing.

    Returns:
        pandas.DataFrame: One row per ('payer_country', day) inside each group's own
        date span; an empty DataFrame when the input has no groups.
    """
    # Work on a copy so the caller's frame (often a filtered slice) is never mutated
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # First and last observed date of every 'payer_country'
    payer_country_ranges = df.groupby('payer_country')['date'].agg(['min', 'max']).reset_index()
    payer_country_ranges['min'] = payer_country_ranges['min'].fillna(pd.to_datetime(start_date))
    payer_country_ranges['max'] = payer_country_ranges['max'].fillna(pd.to_datetime(end_date))

    numeric_columns = ['amount', 'coupon_count', 'tx', 'gp', 'margin']

    # Collect the per-group frames and concatenate once at the end
    # (concatenating inside the loop re-copies the accumulated result every iteration)
    frames = []
    for _, row in payer_country_ranges.iterrows():
        payer_country = row['payer_country']
        df_payer = df[df['payer_country'] == payer_country]

        # Every calendar day between this group's first and last record
        date_range_payer = pd.date_range(start=row['min'], end=row['max'])
        date_combinations = pd.DataFrame({'date': date_range_payer, 'payer_country': payer_country})
        df_combined = pd.merge(date_combinations, df_payer, on=['date', 'payer_country'], how='left')

        # Days without a record carry no activity
        df_combined[numeric_columns] = df_combined[numeric_columns].fillna(0)

        # The first row of a group always comes from real data, so a forward fill is enough
        df_combined[['payer', 'country']] = df_combined[['payer', 'country']].ffill()

        # Inserted rows have no 'day'; reuse the calendar date
        df_combined['day'] = df_combined['day'].fillna(df_combined['date'])

        frames.append(df_combined)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [15]:
# Fill missing dates in df_filtered
df_filled = fill_missing_dates(df, start_date, end_date)

In [16]:
df_filled['amount'].sum()

Decimal('27565805252.2325')

In [17]:
#df_filled[df_filled['tx'] == 0] # Hay casos donde tx es 0, pero GP no es cero

In [18]:
#Connection to daily_forex 
forex_table = 'last_daily_forex_country'

rates = wr.athena.read_sql_table(
    table=forex_table,
    database=database_name)

In [19]:
# FOREX - Selecting columns & renaming
rates['day'] = pd.to_datetime(rates['day'])
rates = rates[['day','country','max_feed_price']]

### UNIVERSE

In [20]:
# AGING FILTER

def aging_filter(df, min_age_months=3, max_inactive_months=3,
                 min_total_amount=10000, min_total_transactions=50):
    """
    Select the 'payer_country' groups that pass the aging criteria (see aging.ipynb).

    Ages are measured in 30-day "months" relative to a limit date that is one day
    before the latest 'day' found in the sample.

    Args:
        df (pandas.DataFrame): Input with columns 'day' (datetime), 'payer_country',
            'amount' and 'tx'.
        min_age_months (float): Minimum time between the group's first record and the
            limit date. Defaults to 3.
        max_inactive_months (float): Maximum time between the group's last record and
            the limit date. Defaults to 3.
        min_total_amount (float): Total amount must be strictly greater than this.
            Defaults to 10000.
        min_total_transactions (float): Total 'tx' must be strictly greater than this.
            Defaults to 50.

    Returns:
        pandas.DataFrame: One row per qualifying 'payer_country' with the columns
        first_date, last_date, total_amount, total_transactions, age_payer,
        active_time and inactive_time, sorted by total_amount (descending).
    """
    # Limit date: one day before the last date in the sample
    last_date_sample = df['day'].max()
    limit_date = last_date_sample - pd.Timedelta(days=1)

    # Per-group span and totals
    result = (
        df.groupby('payer_country')
        .agg(
            first_date=('day', 'min'),
            last_date=('day', 'max'),
            total_amount=('amount', 'sum'),
            total_transactions=('tx', 'sum')
        )
        .reset_index()
    )

    # All durations are expressed in 30-day months, rounded to 2 decimals
    result['age_payer'] = ((limit_date - result['first_date']).dt.days / 30).round(2)
    result['active_time'] = ((result['last_date'] - result['first_date']).dt.days / 30).round(2)
    result['inactive_time'] = ((limit_date - result['last_date']).dt.days / 30).round(2)

    # Largest payers first
    result = result.sort_values(by='total_amount', ascending=False)

    # Keep groups that are old enough, recently active and large enough
    aging_universe = result.loc[
        (result.age_payer >= min_age_months) &
        (result.inactive_time <= max_inactive_months) &
        (result.total_amount > min_total_amount) &
        (result.total_transactions > min_total_transactions)
    ]

    return aging_universe

In [21]:
# Defining Universe
df_aging = aging_filter(df_filled)  # 'payer_country' values that pass the aging criteria
# .copy() makes df_filtered an independent frame, so the column assignment below
# does not raise SettingWithCopyWarning (it would when writing into a filtered slice)
df_filtered = df_filled[df_filled['payer_country'].isin(df_aging['payer_country'])].copy()
df_filtered['day'] = pd.to_datetime(df_filtered['day'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['day'] = pd.to_datetime(df_filtered['day'])


In [22]:
df_filtered['amount'].sum(), df_filtered.shape

(Decimal('26753064533.7617'), (114347, 10))

In [23]:
df_filtered.sort_values(['payer_country','date'])

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day,margin
0,2023-02-28,24XORO_MEXICO,24XORO,MEXICO,1,20.0000,0,3.5600000,2023-02-28,3.5600
1,2023-03-01,24XORO_MEXICO,24XORO,MEXICO,1,25.0000,0,6.2600000,2023-03-01,6.2600
2,2023-03-02,24XORO_MEXICO,24XORO,MEXICO,0,0,0,0,2023-03-02,0.0000
3,2023-03-03,24XORO_MEXICO,24XORO,MEXICO,0,0,0,0,2023-03-03,0.0000
4,2023-03-04,24XORO_MEXICO,24XORO,MEXICO,0,0,0,0,2023-03-04,0.0000
...,...,...,...,...,...,...,...,...,...,...
143855,2023-12-17,ZEEPAY_GHANA,ZEEPAY,GHANA,1,173.0000,0,10.5000000,2023-12-17,10.5000
143856,2023-12-18,ZEEPAY_GHANA,ZEEPAY,GHANA,7,3176.0000,0,27.7000000,2023-12-18,3.9571
143857,2023-12-19,ZEEPAY_GHANA,ZEEPAY,GHANA,4,585.4300,0,14.9000000,2023-12-19,3.7250
143858,2023-12-20,ZEEPAY_GHANA,ZEEPAY,GHANA,3,580.0100,0,11.5900000,2023-12-20,3.8633


### VARIABLES

In [24]:
def generate_lag_and_variation(df, num_lags):
    """
    Add lagged exchange-rate columns and their consecutive differences, in place.

    The lags are taken per 'country' in the current row order, so the caller must
    sort the frame chronologically within each country before calling.

    Args:
        df (pandas.DataFrame): Frame with columns 'country' and 'max_feed_price'.
            It is modified in place.
        num_lags (int): Number of lag columns to create.

    Returns:
        pandas.DataFrame: The same frame, with 'rate_lag_1'..'rate_lag_<num_lags>' and
        'var_rate_lag_1'..'var_rate_lag_<num_lags - 1>' added, where
        var_rate_lag_k = rate_lag_k - rate_lag_(k + 1).
    """
    lag_names = [f'rate_lag_{k}' for k in range(1, num_lags + 1)]

    # rate_lag_k: the 'max_feed_price' found k rows earlier within the same country
    for k, lag_name in enumerate(lag_names, start=1):
        df[lag_name] = df.groupby('country')['max_feed_price'].shift(k)

    # var_rate_lag_k: change between two neighbouring lags
    for k, (newer, older) in enumerate(zip(lag_names, lag_names[1:]), start=1):
        df[f'var_rate_lag_{k}'] = df[newer] - df[older]

    return df


In [25]:
rates_number = 30
rates = rates.sort_values(['country','day'])
rates = generate_lag_and_variation(rates, rates_number)

In [26]:
# Primera fusion: traigo rates al df donde filtré el universo
df1 = pd.merge(df_filtered, rates, on=['day', 'country'], how='left')

In [27]:
df1['date'] = pd.to_datetime(df1['date'])

In [28]:
df1['amount'].sum(), df1.shape

(Decimal('26753064533.7617'), (114347, 70))

In [29]:
### EFFECT OF CANCELED TRANSACTIONS ###
# This table differs from daily_check: daily_check applies some filters, this one does not
# Use a dedicated name for this database instead of overwriting the global `database_name`,
# so re-running the earlier read cells still points at the original database
canc_database_name = 'analytics'
table2_name = 'daily_sales_count_cancelled_v2'  # base table that includes cancellations

df_canc = wr.athena.read_sql_table(
    table=table2_name,
    database=canc_database_name)

In [30]:
#df_canc['date'].max()

In [31]:
df_canc['date'] = pd.to_datetime(df_canc['date'])
df_canc['payer_country'] = df_canc['payer'] +'_'+ df_canc['country']
# Specific date range
df_canc = df_canc[(df_canc['date'] >= start_date) & (df_canc['date'] <= end_date)]

In [32]:
df_canc['amount'].sum(), df_canc.shape, df_canc['payer_country'].nunique()

(Decimal('27565805252.2325'), (110714, 7), 326)

In [33]:
df_canc.head()

Unnamed: 0,payer,country,date,amount,tx_cancelled,day,payer_country
0,MICOOPE-FENACOAC (RED CHAPINA),GUATEMALA,2021-09-18,120143.28,5,2021-09-18,MICOOPE-FENACOAC (RED CHAPINA)_GUATEMALA
1,FEDECREDITO (RYT),EL SALVADOR,2021-09-18,209394.0,53,2021-09-18,FEDECREDITO (RYT)_EL SALVADOR
2,DAVIVIENDA,COLOMBIA,2021-09-18,44320.47,17,2021-09-18,DAVIVIENDA_COLOMBIA
3,PROMERICA,EL SALVADOR,2021-09-18,31312.0,11,2021-09-18,PROMERICA_EL SALVADOR
4,ABANK (TN),EL SALVADOR,2021-09-18,21180.0,11,2021-09-18,ABANK (TN)_EL SALVADOR


In [34]:
def fill_missing_dates(df, start_date, end_date):
    """
    Expand every 'payer_country' to the full daily calendar between start_date and end_date.

    Days on which a group has no record are added with 'amount' = 0 and
    'tx_cancelled' = 0; 'payer', 'country' and 'payer_country' are propagated to the
    added rows, and any other column stays missing there. Records of the input whose
    date falls outside the calendar are kept.

    Both the calendar and the input dates are handled as datetime64 so that the two
    indexes align regardless of how the pandas version treats mixed
    date/Timestamp indexes. The input frame is not modified.

    NOTE: this definition reuses the name of the function defined earlier in this
    notebook (which fills within each group's own date span) and shadows it from here on.

    Args:
        df (pandas.DataFrame): Input with columns 'date', 'payer', 'country',
            'payer_country', 'amount' and 'tx_cancelled' (one row per group and date).
        start_date (str or datetime-like): First day of the calendar.
        end_date (str or datetime-like): Last day of the calendar.

    Returns:
        pandas.DataFrame: The expanded frame with 'date' as datetime64; an empty
        DataFrame when the input has no groups.
    """
    # Zero-valued calendar used to fill the days a group is missing
    calendar = pd.DataFrame(
        {'date': pd.date_range(start=start_date, end=end_date), 'amount': 0, 'tx_cancelled': 0}
    ).set_index('date')

    # Same dtype as the calendar index, then a stable processing order
    df = df.assign(date=pd.to_datetime(df['date'])).sort_values(by=['country', 'payer', 'date'])

    columns_to_fill = ['payer', 'country', 'payer_country']

    # Collect the per-group frames and concatenate once at the end
    frames = []
    for payer_country in df['payer_country'].unique():
        df_aux = df[df['payer_country'] == payer_country]

        # Values from the group win; the calendar only supplies the missing days
        merged_df = df_aux.set_index('date').combine_first(calendar).reset_index()

        # Added rows have no identifiers; copy them from the neighbouring real rows
        merged_df[columns_to_fill] = merged_df[columns_to_fill].ffill().bfill()

        frames.append(merged_df)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [35]:
# Call the function with the specified start_date and end_date
df_full = fill_missing_dates(df_canc, start_date, end_date)

In [36]:
df_full['amount'].sum(), df_full.shape

(Decimal('27565805252.2325'), (356970, 7))

In [37]:
df_full

Unnamed: 0,date,amount,country,day,payer,payer_country,tx_cancelled
0,2021-01-01,0,ARGENTINA,,AFEX,AFEX_ARGENTINA,0
1,2021-01-02,0,ARGENTINA,,AFEX,AFEX_ARGENTINA,0
2,2021-01-03,0,ARGENTINA,,AFEX,AFEX_ARGENTINA,0
3,2021-01-04,0,ARGENTINA,,AFEX,AFEX_ARGENTINA,0
4,2021-01-05,0,ARGENTINA,,AFEX,AFEX_ARGENTINA,0
...,...,...,...,...,...,...,...
356965,2023-12-27,0,ZIMBABWE,,TRANSFERTO - THUNES,TRANSFERTO - THUNES_ZIMBABWE,0
356966,2023-12-28,0,ZIMBABWE,,TRANSFERTO - THUNES,TRANSFERTO - THUNES_ZIMBABWE,0
356967,2023-12-29,0,ZIMBABWE,,TRANSFERTO - THUNES,TRANSFERTO - THUNES_ZIMBABWE,0
356968,2023-12-30,0,ZIMBABWE,,TRANSFERTO - THUNES,TRANSFERTO - THUNES_ZIMBABWE,0


In [38]:
def generate_tx_lags_and_variation(df, tx_count):
    """
    Add lagged 'tx_cancelled' columns and the differences between consecutive lags.

    Args:
    - df: DataFrame with columns 'country', 'payer', 'date' and 'tx_cancelled'
    - tx_count: Number of lag columns to create

    Returns:
    - A copy of df sorted by country, payer and date, with
      'tx_cancelled_lag_1'..'tx_cancelled_lag_<tx_count>' and
      'var_tx_cancelled_lag_1'..'var_tx_cancelled_lag_<tx_count - 1>' added, where
      var_tx_cancelled_lag_k = tx_cancelled_lag_k - tx_cancelled_lag_(k + 1)
    """
    # Chronological order inside every (country, payer) series
    ordered = df.sort_values(by=['country', 'payer', 'date'])

    lag_names = [f'tx_cancelled_lag_{step}' for step in range(1, tx_count + 1)]

    # Lag k: value observed k rows earlier within the same (country, payer)
    for step, lag_name in enumerate(lag_names, start=1):
        ordered[lag_name] = ordered.groupby(['country', 'payer'])['tx_cancelled'].shift(step)

    # Variation k: difference between lag k and lag k + 1
    for step, (newer, older) in enumerate(zip(lag_names, lag_names[1:]), start=1):
        ordered[f'var_tx_cancelled_lag_{step}'] = ordered[newer] - ordered[older]

    return ordered

In [39]:
# Call the function and assign the result back to df2
tx_cancelled_lags = 30
df2 = generate_tx_lags_and_variation(df_full, tx_cancelled_lags)
df2['day'] = pd.to_datetime(df2['day'])

In [40]:
df2['amount'].sum(), df2.shape # Acá voy a tener mas filas porque relleno

(Decimal('27565805252.2325'), (356970, 66))

In [41]:
# Coupon ratio: coupons per transaction.
# Rows with tx == 0 get a ratio of 0 (same convention as 'margin') instead of the
# inf/NaN a plain division produces; inf would survive the final fillna(0).
df1['ratio_coupon_tx'] = (df1['coupon_count'] / df1['tx'].where(df1['tx'] != 0)).fillna(0)

In [42]:
df1.columns.to_list()

['date',
 'payer_country',
 'payer',
 'country',
 'tx',
 'amount',
 'coupon_count',
 'gp',
 'day',
 'margin',
 'max_feed_price',
 'rate_lag_1',
 'rate_lag_2',
 'rate_lag_3',
 'rate_lag_4',
 'rate_lag_5',
 'rate_lag_6',
 'rate_lag_7',
 'rate_lag_8',
 'rate_lag_9',
 'rate_lag_10',
 'rate_lag_11',
 'rate_lag_12',
 'rate_lag_13',
 'rate_lag_14',
 'rate_lag_15',
 'rate_lag_16',
 'rate_lag_17',
 'rate_lag_18',
 'rate_lag_19',
 'rate_lag_20',
 'rate_lag_21',
 'rate_lag_22',
 'rate_lag_23',
 'rate_lag_24',
 'rate_lag_25',
 'rate_lag_26',
 'rate_lag_27',
 'rate_lag_28',
 'rate_lag_29',
 'rate_lag_30',
 'var_rate_lag_1',
 'var_rate_lag_2',
 'var_rate_lag_3',
 'var_rate_lag_4',
 'var_rate_lag_5',
 'var_rate_lag_6',
 'var_rate_lag_7',
 'var_rate_lag_8',
 'var_rate_lag_9',
 'var_rate_lag_10',
 'var_rate_lag_11',
 'var_rate_lag_12',
 'var_rate_lag_13',
 'var_rate_lag_14',
 'var_rate_lag_15',
 'var_rate_lag_16',
 'var_rate_lag_17',
 'var_rate_lag_18',
 'var_rate_lag_19',
 'var_rate_lag_20',
 'var_rat

In [43]:
def generate_coupon_tx_lags(df, tx_count):
    """
    Add lagged copies of the 'ratio_coupon_tx' column.

    Args:
    - df: DataFrame with columns 'country', 'payer', 'date' and 'ratio_coupon_tx'
    - tx_count: Number of lag columns to create

    Returns:
    - A copy of df sorted by country, payer and date, with
      'ratio_coupon_tx_lag_1'..'ratio_coupon_tx_lag_<tx_count>' added
    """
    # Chronological order inside every (country, payer) series
    ordered = df.sort_values(by=['country', 'payer', 'date'])

    # Lag k: value observed k rows earlier within the same (country, payer)
    lag_columns = {
        f'ratio_coupon_tx_lag_{step}': ordered.groupby(['country', 'payer'])['ratio_coupon_tx'].shift(step)
        for step in range(1, tx_count + 1)
    }

    return ordered.assign(**lag_columns)

In [44]:
# Call the function and assign the result back to df1
tx_ratio_coupon_tx_lags = 30
df1 = generate_coupon_tx_lags(df1, tx_ratio_coupon_tx_lags)

In [45]:
df1['amount'].sum(), df1.shape # Acá voy a tener mas filas porque relleno

(Decimal('26753064533.7617'), (114347, 101))

In [46]:
def generate_tx_lags(df, tx_count):
    """
    Add lagged copies of the 'tx' column.

    Args:
    - df: DataFrame with columns 'country', 'payer', 'date' and 'tx'
    - tx_count: Number of lag columns to create

    Returns:
    - A copy of df sorted by country, payer and date, with
      'tx_lag_1'..'tx_lag_<tx_count>' added
    """
    # Chronological order inside every (country, payer) series
    ordered = df.sort_values(by=['country', 'payer', 'date'])

    # Lag k: value observed k rows earlier within the same (country, payer)
    lag_columns = {
        f'tx_lag_{step}': ordered.groupby(['country', 'payer'])['tx'].shift(step)
        for step in range(1, tx_count + 1)
    }

    return ordered.assign(**lag_columns)

In [47]:
# Call the function and assign the result back to df1
tx_lags = 30
df1 = generate_tx_lags(df1, tx_lags)

In [48]:
df1['amount'].sum(), df1.shape # Acá voy a tener mas filas porque relleno

(Decimal('26753064533.7617'), (114347, 131))

In [49]:
def generate_margin_lags(df, margin_lags):
    """
    Add lagged copies of the 'margin' column.

    Args:
    - df: DataFrame with columns 'country', 'payer', 'date' and 'margin'
    - margin_lags: Number of lag columns to create

    Returns:
    - A copy of df sorted by country, payer and date, with
      'margin_lag_1'..'margin_lag_<margin_lags>' added
    """
    # Chronological order inside every (country, payer) series
    ordered = df.sort_values(by=['country', 'payer', 'date'])

    # Lag k: value observed k rows earlier within the same (country, payer)
    lag_columns = {
        f'margin_lag_{step}': ordered.groupby(['country', 'payer'])['margin'].shift(step)
        for step in range(1, margin_lags + 1)
    }

    return ordered.assign(**lag_columns)

In [50]:
# Call the function and assign the result back to df1
margin_lags = 10
df1 = generate_margin_lags(df1, margin_lags)

In [51]:
df1['amount'].sum(), df1.shape # Acá voy a tener mas filas porque relleno

(Decimal('26753064533.7617'), (114347, 141))

In [52]:
#### Workaround ####

# # Fusionar df1 y df2 basándonos en 'date' y 'payer_country'
# df_merged = pd.merge(df1, df2, on=['date', 'payer_country'], how='left', suffixes=('_df1', '_df2'))

# # Seleccionar las columnas de df2 que no están en df1 y coinciden en 'date' y 'payer_country'
# columns_to_add = [col for col in df2.columns if col not in df1.columns]
# df_addition = df_merged.loc[:, columns_to_add]

# # Unir las columnas seleccionadas con df1
# df_final = pd.concat([df1, df_addition], axis=1)

In [53]:
# Attach the cancellation features (df2) to the main feature table (df1).
# Inner join: only rows present in both tables with identical key values are kept.
# NOTE(review): 'amount' is part of the join key, so a row is silently dropped whenever the
# two source tables disagree on the amount for the same date/payer/country - confirm this is intended.
# Both frames carry a 'day' column that is not in the key, so it comes out as 'day_x' / 'day_y'.
df_final = pd.merge(df1, df2, on=['date', 'payer', 'country', 'payer_country', 'amount'], how='inner')
# Make sure 'date' is datetime64 for the .dt-based holiday flags computed later
df_final['date'] = pd.to_datetime(df_final['date'])

In [54]:
df_final['amount'].sum(), df_final.shape # Acá voy a tener mas filas porque relleno

(Decimal('26753064533.7617'), (114347, 202))

In [55]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2022-07-04')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_20,var_tx_cancelled_lag_21,var_tx_cancelled_lag_22,var_tx_cancelled_lag_23,var_tx_cancelled_lag_24,var_tx_cancelled_lag_25,var_tx_cancelled_lag_26,var_tx_cancelled_lag_27,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29
77046,2022-07-04,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,8364,4152238.92,90,54173.7271,2022-07-04,6.477,...,-76,74,-8,156,80,23,-71,-91,-29,-30


### DUMMIES

In [56]:
def mark_us_holidays(df):
    """
    Flag the rows whose 'date' is a US holiday.

    Holidays named "Washington's Birthday" or "Columbus Day", and any holiday whose
    name contains "observed" (case-insensitive), are not flagged.

    Args:
        df (DataFrame): DataFrame containing a 'date' column in datetime format.
            It is modified in place.

    Returns:
        DataFrame: The same DataFrame with an 'is_holiday' column (1 = US holiday, 0 = otherwise).
    """
    # Years to load: from the first year in the data up to (and including) the last one
    first_year = df['date'].min().year
    year_after_last = df['date'].max().year + 1
    print(first_year, year_after_last)

    # US holiday calendar for those years
    us_calendar = holidays.US(years=range(first_year, year_after_last))

    # Holiday names that must not be flagged
    skipped_names = ["Washington's Birthday", "Columbus Day"]

    # Keep the date of every holiday that is neither skipped nor an "observed" substitute
    kept_dates = []
    for holiday_date, holiday_name in us_calendar.items():
        if holiday_name in skipped_names:
            continue
        if 'observed' in holiday_name.lower():
            continue
        kept_dates.append(holiday_date)

    # 1 where the row's date is one of the kept holidays
    df['is_holiday'] = df['date'].isin(kept_dates).astype(int)

    return df

In [110]:
#Applying holiday function 
df_final = mark_us_holidays(df_final)

2021 2024


In [87]:
def calculate_var_30ds(window, row, df_final):
    """
    Relative deviation of a holiday's amount from the recent average of the same 'payer_country'.

    For a row flagged as holiday, the reference is the mean 'amount' of the same
    'payer_country' over the `window` days before the row's date (the row's own day
    excluded), and the result is amount / mean - 1.

    Parameters:
    window (int): Look-back window size in days.
    row (pandas.Series): Current observation; needs 'is_holiday', 'payer_country', 'date' and 'amount'.
    df_final (pandas.DataFrame): Full dataset with 'payer_country', 'date' and 'amount'.

    Returns:
    float or None: None when the row is not a holiday; 0 when there is no history in the
    window, or when the mean or the row's amount is 0; otherwise amount / mean - 1.
    """
    # Only holidays get a value
    if row['is_holiday'] != 1:
        return None

    # Same payer_country, dates in [date - window, date)
    in_window = (
        (df_final['payer_country'] == row['payer_country'])
        & (df_final['date'] >= (row['date'] - pd.Timedelta(days=window)))
        & (df_final['date'] < row['date'])
    )
    history = df_final.loc[in_window, 'amount']

    # No previous days to compare against: no measurable variation
    if history.empty:
        return 0

    avg_amount = history.mean()

    # A missing or zero reference (or a zero amount) cannot produce a meaningful ratio
    if pd.isna(avg_amount) or avg_amount == 0 or row['amount'] == 0:
        return 0

    # float() keeps the division valid when the amounts are Decimal objects
    return float(row['amount']) / float(avg_amount) - 1

In [88]:
# Apply calculate_var_30ds to every row of the DataFrame.
# Each holiday row triggers a filter over the whole frame, so this cell can be slow.
window = 30
df_final['var_30ds'] = df_final.apply(lambda row: calculate_var_30ds(window, row, df_final), axis=1)
# Rows for which the function returned no value (non-holidays) get 0
df_final['var_30ds'] = df_final['var_30ds'].fillna(0)

In [89]:
def mark_post_holiday(df):
    """
    Mark the rows that come right after a holiday (sales tend to rise on those days).

    "Right after" means the previous row, by position. When the frame has a
    'payer_country' column the previous row is looked up inside the same
    'payer_country', so the first row of one series is never marked because of the
    last row of another series. Rows are expected to be in chronological order
    (within each 'payer_country' when that column exists), one row per day.

    Args:
        df (DataFrame): DataFrame containing a 'is_holiday' column indicating holidays,
            and optionally a 'payer_country' column. It is modified in place.

    Returns:
        DataFrame: The same DataFrame with a 'post_holiday' column, where 1 marks a
        non-holiday row whose previous row is a holiday.
    """
    is_holiday = df['is_holiday']

    # Holiday flag of the previous row (positional, so any index works)
    if 'payer_country' in df.columns:
        previous = df.groupby('payer_country', sort=False)['is_holiday'].shift(1)
    else:
        previous = is_holiday.shift(1)

    # A holiday itself is never a post-holiday; rows without a previous row compare as False
    df['post_holiday'] = ((previous == 1) & (is_holiday != 1)).astype(int)

    return df

In [112]:
df_final = mark_post_holiday(df_final)

In [114]:
def mark_fourth_july(df):
    """
    Flag the rows whose 'date' falls on July 4th.

    Args:
        df (DataFrame): DataFrame containing a 'date' column in datetime format.
            It is modified in place.

    Returns:
        DataFrame: The same DataFrame with an 'is_fourth_of_july' column (1 on July 4th, else 0).
    """
    date_parts = df['date'].dt

    # True only when both the month and the day match July 4th
    on_july_fourth = (date_parts.month == 7) & (date_parts.day == 4)
    df['is_fourth_of_july'] = on_july_fourth.astype(int)

    return df

In [115]:
df_final = mark_fourth_july(df_final)

In [116]:
def mark_christmas_day(df):
    """
    Marks Christmas Day (December 25th) in the DataFrame.

    Every row whose 'date' is a December 25th, in any year, gets 1 in the
    'christmas_day' column; all other rows (including rows with a missing date) get 0.

    Args:
    df (DataFrame): The DataFrame containing a datetime 'date' column. It is modified in place.

    Returns:
    DataFrame: The same DataFrame with the 'christmas_day' column added.

    Raises:
    ValueError: If the DataFrame does not contain a 'date' column.
    """
    # The 'date' column is required
    if 'date' not in df.columns:
        raise ValueError("DataFrame debe contener una columna 'date'.")

    # Month/day comparison covers every year at once and is False for missing dates
    date_parts = df['date'].dt
    df['christmas_day'] = ((date_parts.month == 12) & (date_parts.day == 25)).astype(int)

    return df

In [117]:
df_final = mark_christmas_day(df_final)

In [118]:
def mark_new_year_day(df):
    """
    Marks New Year's Day (January 1st) in the DataFrame.

    Every row whose 'date' is a January 1st, in any year, gets 1 in the
    'new_year_day' column; all other rows (including rows with a missing date) get 0.

    Args:
    df (DataFrame): The DataFrame containing a datetime 'date' column. It is modified in place.

    Returns:
    DataFrame: The same DataFrame with the 'new_year_day' column added.

    Raises:
    ValueError: If the DataFrame does not contain a 'date' column.
    """
    # The 'date' column is required
    if 'date' not in df.columns:
        raise ValueError("DataFrame debe contener una columna 'date'.")

    # Month/day comparison covers every year at once and is False for missing dates
    date_parts = df['date'].dt
    df['new_year_day'] = ((date_parts.month == 1) & (date_parts.day == 1)).astype(int)

    return df

In [119]:
df_final = mark_new_year_day(df_final)

In [120]:
from datetime import timedelta

def thanksgiving_date(year):
    """
    Compute the date of Thanksgiving (fourth Thursday of November) for a given year.

    Args:
    year (int): Year for which the Thanksgiving date is wanted.

    Returns:
    datetime: Thanksgiving date of that year.
    """
    november_first = datetime(year, 11, 1)

    # Days from November 1st to the first Thursday (weekday() == 3 is Thursday)
    days_to_first_thursday = (3 - november_first.weekday()) % 7

    # The fourth Thursday is three weeks after the first one
    return november_first + timedelta(days=days_to_first_thursday) + timedelta(weeks=3)

def mark_thanksgiving_day(df):
    """
    Mark Thanksgiving Day in the DataFrame.

    For every year present in 'date', the rows falling on that year's Thanksgiving
    get 1 in 'thanksgiving_day'. On those same rows 'is_holiday' is set to 0, so the
    day is carried only by its dedicated flag.

    Args:
    df (DataFrame): The DataFrame containing a datetime 'date' column. It is modified in place.

    Returns:
    DataFrame: The same DataFrame with the 'thanksgiving_day' column added.

    Raises:
    ValueError: If the DataFrame does not contain a 'date' column.
    """
    # The 'date' column is required
    if 'date' not in df.columns:
        raise ValueError("El DataFrame debe contener una columna 'date'.")

    # Start with no row marked
    df['thanksgiving_day'] = 0

    date_parts = df['date'].dt
    for year in date_parts.year.unique():
        holiday = thanksgiving_date(year)

        # Rows of this year that fall on its Thanksgiving (always in November)
        is_thanksgiving = (
            (date_parts.year == year)
            & (date_parts.month == 11)
            & (date_parts.day == holiday.day)
        )

        df.loc[is_thanksgiving, 'thanksgiving_day'] = 1
        # Thanksgiving has its own flag, so it is removed from the generic holiday flag
        df.loc[is_thanksgiving, 'is_holiday'] = 0

    return df

In [121]:
df_final = mark_thanksgiving_day(df_final)

In [122]:
df_final[df_final['thanksgiving_day'] == 1]['date'].unique()

<DatetimeArray>
['2021-11-25 00:00:00', '2022-11-24 00:00:00', '2023-11-23 00:00:00']
Length: 3, dtype: datetime64[ns]

In [123]:
# Reviso
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2023-11-23')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77553,2023-11-23,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,9762,4957891.25,174,31914.6590447,2023-11-23,3.2693,...,50,-55,0,0,-0.417957,0,0,1,0,0


In [124]:
# Day of the Dead: November 2nd, flagged only for rows whose country is MEXICO
is_mexico = df_final['country'] == 'MEXICO'
is_november_second = (df_final['date'].dt.month == 11) & (df_final['date'].dt.day == 2)
mask = is_mexico & is_november_second

# Every row starts at 0; the rows selected by the mask are then set to 1
df_final['day_of_the_dead'] = 0
df_final.loc[mask, 'day_of_the_dead'] = 1

In [125]:
# Reviso
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2022-11-02')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77167,2022-11-02,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,8591,4475729.67,86,48866.6414,2022-11-02,5.6881,...,49,-173,0,0,0.0,0,0,0,0,1


In [126]:
def correcting_holidays(df_final, separate_flags):
    """
    Clear 'is_holiday' on the rows that are already covered by a dedicated holiday flag.

    For every column named in `separate_flags`, rows where that column equals 1 get
    'is_holiday' set to 0, so each special day is represented by a single flag.

    Args:
        df_final (DataFrame): DataFrame containing a 'date' column, the 'is_holiday'
            column and every column listed in `separate_flags`. It is modified in place.
        separate_flags (list): Names of the dedicated holiday flag columns.

    Returns:
        DataFrame: The same DataFrame with 'is_holiday' adjusted.

    Raises:
        ValueError: If 'date' or any column listed in `separate_flags` is missing.
    """
    # Check if the 'date' column exists in the DataFrame
    if 'date' not in df_final.columns:
        raise ValueError("The DataFrame must contain a 'date' column.")

    # Use the flags passed by the caller (not a global list that happens to exist)
    for flag in separate_flags:
        if flag not in df_final.columns:
            raise ValueError(f"The column '{flag}' does not exist in the DataFrame.")

        # A row carrying the dedicated flag must not also count as a generic holiday
        df_final.loc[df_final[flag] == 1, 'is_holiday'] = 0

    return df_final

In [127]:
#Lista con los holidays a excluir (nombres de las columnas)
holidays_to_exclude =  ['is_fourth_of_july', 'christmas_day', 'new_year_day', 
                        'thanksgiving_day']

# Aplicar la función
df_final = correcting_holidays(df_final, holidays_to_exclude)

In [128]:
# Reviso
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2022-07-05')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77047,2022-07-05,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,14557,8005870.41,178,63192.3037,2022-07-05,4.341,...,-91,-29,0,0,0.0,0,0,0,1,0


### Chequeos

In [129]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2022-12-26')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77221,2022-12-26,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,15975,7564267.22,146,55560.4611,2022-12-26,3.478,...,109,-124,0,0,0.0,0,0,0,1,0


In [130]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2022-01-02')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
76863,2022-01-02,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,11433,4883713.86,540,51938.8805,2022-01-02,4.5429,...,-30,63,0,0,0.0,0,0,0,1,0


In [131]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2023-07-04')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77411,2023-07-04,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,7090,3795075.98,105,40150.9144067,2023-07-04,5.663,...,-74,-40,0,1,-0.554127,0,0,0,0,0


In [132]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2023-09-04')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77473,2023-09-04,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,14724,8005536.4,248,122203.0235271,2023-09-04,8.2996,...,-45,-54,1,0,-0.09106,0,0,0,0,0


In [133]:
df_final.loc[(df_final['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') & (df_final['date'] == '2023-01-01')]

Unnamed: 0,date,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,is_fourth_of_july,var_30ds,christmas_day,new_year_day,thanksgiving_day,post_holiday,day_of_the_dead
77227,2023-01-01,ELEKTRA (MEXICO)_MEXICO,ELEKTRA (MEXICO),MEXICO,4559,2018279.42,14,10556.7577189,2023-01-01,2.3156,...,34,154,0,0,-0.73953,0,1,0,0,0


In [134]:
# Filling NaN in exogenous and lags: every missing value in df_final becomes 0, in place.
# NOTE(review): this applies to ALL columns, not only numeric ones - a datetime column with
# missing values (likely 'day_y', which is empty on the calendar-filled cancellation rows)
# would end up mixing timestamps and 0. Confirm, or restrict the fill to numeric columns.
# NOTE(review): fillna does not replace +/-inf; verify that no division-derived column
# (e.g. 'ratio_coupon_tx') carries inf into the saved table.
df_final.fillna(0, inplace=True)

  df_final.fillna(0, inplace=True)


### Guardo en S3

In [138]:
# Write the final table to S3

bucket = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
prefix_abt = 'ABTv4'
file_name = 'ABTv4.parquet'

# S3 path: s3://<bucket>/<prefix_abt>/<file_name>
s3_path = f"s3://{bucket}/{prefix_abt}/{file_name}"

# NOTE(review): the output recorded under this cell reports a path ending in 'ABTv4.csv',
# so it was produced by an earlier version of the cell - re-run to confirm the Parquet write works.
wr.s3.to_parquet(df_final, path=s3_path, index=False)

{'paths': ['s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/ABTv4/ABTv4.csv'],
 'partitions_values': {}}

In [136]:
df_final['amount'].sum()

Decimal('26753064533.7617')

In [137]:
df_final_filtered = df_final.loc[df_final['date'] <= '2023-10-20']
df_final_filtered['amount'].sum()

Decimal('24239232368.7517')

In [84]:
#df_final.isna().sum().to_list()