In [2]:
# !pip install awswrangler
# !pip install holidays

In [3]:
import awswrangler as wr
import pandas as pd
import datetime as dt
from datetime import datetime
import holidays

In [4]:
#TIMEFRAME - Only for dev purpose
start_date = '2021-01-01'
#end_date = '2023-12-31'
end_date = '2024-02-03'

### DFs generation

In [5]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check_gp'

In [6]:
df = wr.athena.read_sql_table(
    table=table_name,
    database=database_name,
)

In [19]:
#Connection to daily_forex 
forex_table = 'last_daily_forex_country'

rates = wr.athena.read_sql_table(
    table=forex_table,
    database=database_name)

In [30]:
### EFFECT OF CANCELED TRANSACTIONS ###
# ES DISTINTA PORQUE DAILY_CHECK TIENE ALGUNOS FILTROS Y ESTA NO
database_name= 'analytics'
table2_name = 'daily_sales_count_cancelled_v2'## WE LOAD THE BASE WITH CANCELLATIONS

df_canc = wr.athena.read_sql_table(
    table=table2_name,
    database=database_name)

### DATA PREP

In [8]:
# Convert the 'date' column to datetime format
df['day'] = pd.to_datetime(df['day'])
# Grouping by 'payer' and 'country' concatenated for this level of granularity
df['payer_country'] = df['payer'] + '_' + df['country']
# Margin (when tx !=0)
df['margin'] = df.apply(lambda row: row['gp'] / row['tx'] if row['tx'] != 0 else 0, axis=1)
df['margin'] = df['margin'].apply(lambda x: float(x)).round(4)

In [10]:
# Specify date range
df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]

In [12]:
#Filtering data
df = df[df['payer'] != 'EXPIRED ORDERS']
df = df[df['amount'] != 0] # Excluding 0 (flag A & Flag C), defined in EDA

In [13]:
#df.isna().sum() # Reviso si hay valores nulos

In [20]:
# FOREX - Selecting columns & renaming
rates['day'] = pd.to_datetime(rates['day'])
rates = rates[['day','country','max_feed_price']]

In [32]:
# CANCELLATIONS 
df_canc['date'] = pd.to_datetime(df_canc['date'])
df_canc['payer_country'] = df_canc['payer'] +'_'+ df_canc['country']
# Specific date range
df_canc = df_canc[(df_canc['date'] >= start_date) & (df_canc['date'] <= end_date)]

In [14]:
def fill_missing_dates(df, start_date, end_date):
    """
    Fill missing dates in the DataFrame with zero values and ensure all date ranges are covered.

    Args:
        df (pandas.DataFrame): Input DataFrame with columns 'date', 'amount', 'tx_cancelled', 'payer_country', etc.
        start_date (str or datetime.date): Start date of the desired date range.
        end_date (str or datetime.date): End date of the desired date range.

    Returns:
        pandas.DataFrame: DataFrame with missing dates filled and all date ranges covered.
    """
    # Convert the 'date' column to datetime type if it's not already
    df['date'] = pd.to_datetime(df['date'])
    
    # Define the desired date range
    date_range = pd.date_range(start=start_date, end=end_date)
    
    # Get the minimum and maximum date range for each 'payer_country', 'id_country', 'id_main_branch'
    
    # Group by 'payer_country', 'id_country', 'id_main_branch' and aggregate min and max dates
    payer_country_ranges = df.groupby(['payer_country','id_country','id_main_branch'])['date'].agg(['min', 'max']).reset_index()
    
    # Fill missing 'min' and 'max' dates with start_date and end_date respectively
    payer_country_ranges['min'] = payer_country_ranges['min'].fillna(pd.to_datetime(start_date))
    payer_country_ranges['max'] = payer_country_ranges['max'].fillna(pd.to_datetime(end_date))
    
    # Combine the original DataFrame with the DataFrame of all date combinations
    df_filled = pd.DataFrame()
    for index, row in payer_country_ranges.iterrows():
        payer_country = row['payer_country']
        start_payer = row['min']
        end_payer = row['max']
        payer_id_country = row['id_country']
        payer_id_main_branch = row['id_main_branch']
        
        # Filter the original DataFrame by 'payer_country'
        df_payer = df[df['payer_country'] == payer_country]
        
        # Fill missing values in 'payer_country' date range
        date_range_payer = pd.date_range(start=start_payer, end=end_payer)
        date_combinations = pd.DataFrame({'date': date_range_payer, 'payer_country': payer_country, 'id_country': payer_id_country, 'id_main_branch':payer_id_main_branch})
        df_combined = pd.merge(date_combinations, df_payer, on=['date', 'payer_country'], how='left')
        
        # Fill missing numeric values with zero
        numeric_columns = ['amount', 'coupon_count', 'tx', 'gp', 'margin']
        df_combined[numeric_columns] = df_combined[numeric_columns].fillna(0)
        
        # Fill missing 'payer' and 'country' values using the ffill method
        df_combined[['payer', 'country','id_country','id_main_branch']] = df_combined[['payer', 'country','id_country_x','id_main_branch_x']].ffill()
        
        # Fill missing 'day' values with 'date' values when NaN
        df_combined['day'] = df_combined['day'].fillna(df_combined['date'])
        
        df_filled = pd.concat([df_filled, df_combined], ignore_index=True)
    
    # Remove redundant columns
    df_filled = df_filled.drop(columns=['id_country_x','id_country_y','id_main_branch_x','id_main_branch_y'])
    
    return df_filled

In [15]:
# Fill missing dates in df_filtered
df_filled = fill_missing_dates(df, start_date, end_date)

### UNIVERSE

In [21]:
# AGING FILTER

def aging_filter(df):
    """
    Filter a DataFrame based on aging criteria described in aging.ipynb

    Args:
        df (pandas.DataFrame): Input DataFrame with columns 'date', 'payer_country', 'amount', and 'tx'.

    Returns:
        pandas.DataFrame: Filtered DataFrame containing only the rows that meet the aging criteria.
    """
    # Find the last date in the sample
    last_date_sample = df['day'].max()

    # Calculate the limit date, one day before the last date in the sample
    limit_date = last_date_sample - pd.Timedelta(days=1)

    # Aggregate data by 'payer_country'
    result = (
        df.groupby('payer_country')
        .agg(
            first_date=('day', 'min'),
            last_date=('day', 'max'),
            total_amount=('amount', 'sum'),
            total_transactions=('tx', 'sum')
        )
        .reset_index()
    )

    # Calculate age of payer
    result['age_payer'] = ((limit_date - result['first_date']).dt.days / 30).round(2)

    # Calculate active time
    result['active_time'] = ((result['last_date'] - result['first_date']).dt.days / 30).round(2)

    # Calculate inactive time
    result['inactive_time'] = ((limit_date - result['last_date']).dt.days / 30).round(2)

    # Sort the DataFrame by 'total_amount' from highest to lowest
    result = result.sort_values(by='total_amount', ascending=False)

    # Filter the DataFrame based on conditions
    aging_universe = result.loc[
        (result.age_payer >= 3) & 
        (result.inactive_time <= 3) & 
        (result.total_amount > 10000) & 
        (result.total_transactions > 50)
    ]
    
    return aging_universe

In [22]:
# Defining Universe
df_aging = aging_filter(df_filled) #Filtering 'payer_country' based on Aging notebook
df_filtered = df_filled[df_filled['payer_country'].isin(df_aging['payer_country'])] # Applying aging filters 
df_filtered['day'] = pd.to_datetime(df_filtered['day'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['day'] = pd.to_datetime(df_filtered['day'])


### VARIABLES

In [25]:
def generate_lag_and_variation(df, num_lags):
    """
    Generate lagged values and variations for a given df

    Args:
        df (pandas.DataFrame): Input df with columns 'symbol' and 'feed_price'.
        num_lags (int): Number of lagged values to generate.

    Returns:
        pandas.DataFrame: df with lagged values and variations added as new columns.
    """
    # Create columns for each day's lag up to the defined maximum
    for i in range(1, num_lags + 1):
        col_name = f'rate_lag_{i}'
        # Shift the 'feed_price' column grouped by 'symbol'
        df[col_name] = df.groupby('country')['max_feed_price'].shift(i)

    # Calculate the variation columns between consecutive lags
    for i in range(1, num_lags):
        col_name = f'var_rate_lag_{i}'
        # Calculate the difference between consecutive lag columns
        df[col_name] = df[f'rate_lag_{i}'] - df[f'rate_lag_{i + 1}']

    return df


In [26]:
rates_number = 30
rates = rates.sort_values(['country','day'])
rates = generate_lag_and_variation(rates, rates_number)

In [27]:
# First step: Rates +  df_filtered (defined by Universe)
df1 = pd.merge(df_filtered, rates, on=['day', 'country'], how='left')

In [28]:
df1['date'] = pd.to_datetime(df1['date'])

In [35]:
def fill_missing_dates(df, start_date, end_date):
    """
    Fill missing dates in the DataFrame with zero values and ensure all date ranges are covered.

    Args:
        df (pandas.DataFrame): Input DataFrame with columns 'date', 'amount', 'tx_cancelled', 'payer_country', etc.
        start_date (str or datetime.date): Start date of the desired date range.
        end_date (str or datetime.date): End date of the desired date range.

    Returns:
        pandas.DataFrame: DataFrame with missing dates filled and all date ranges covered.
    """
    # Create an empty DataFrame with the specified date range
    date_range = pd.date_range(start=start_date, end=end_date)
    df_fill = pd.DataFrame({'date': date_range, 'amount': 0, 'tx_cancelled': 0})
    df_fill['date'] = pd.to_datetime(df_fill['date']).dt.date

    # Sort the original DataFrame by 'country', 'payer', and 'date'
    df = df.sort_values(by=['country', 'payer', 'date'])

    # Create an empty DataFrame to hold the result
    result_df = pd.DataFrame()

    # Loop through each 'payer_country'
    for payer_country in df['payer_country'].unique():
        # Filter DataFrame by 'payer_country'
        df_aux = df[df['payer_country'] == payer_country]

        # Combine df_aux (payer_country) with df_fill, keeping values from df_aux and filling missing dates
        merged_df = df_aux.set_index('date').combine_first(df_fill.set_index('date')).reset_index()

        # Fill missing values in specified columns
        columns_to_fill = ['payer', 'country', 'payer_country']
        merged_df[columns_to_fill] = merged_df[columns_to_fill].ffill().bfill()

        # Concatenate the result with the final DataFrame
        result_df = pd.concat([result_df, merged_df], ignore_index=True)

    return result_df

In [36]:
# Call the function with the specified start_date and end_date
df_full = fill_missing_dates(df_canc, start_date, end_date)

In [39]:
def generate_tx_lags_and_variation(df, tx_count):
    """
    Generate lag columns for cancelled transactions and their variations.

    Args:
    - df: DataFrame containing transaction data
    - tx_count: Number of periods for lag calculation

    Returns:
    - df: DataFrame with added lag and variation columns
    """
    # Sort the dataset based on country, payer, and date
    df = df.sort_values(by=['country', 'payer', 'date'])

    # Create columns for each day's lag up to the defined maximum
    for i in range(1, tx_count + 1):
        col_name = f'tx_cancelled_lag_{i}'
        # Shift the 'tx_cancelled' column grouped by 'country' and 'payer'
        df[col_name] = df.groupby(['country', 'payer'])['tx_cancelled'].shift(i)

    # Calculate the variation columns between consecutive delays
    for i in range(1, tx_count):
        col_name = f'var_tx_cancelled_lag_{i}'
        # Calculate the difference between consecutive lag columns
        df[col_name] = df[f'tx_cancelled_lag_{i}'] - df[f'tx_cancelled_lag_{i + 1}']

    return df

In [40]:
# Call the function and assign the result back to df2
tx_cancelled_lags = 30
df2 = generate_tx_lags_and_variation(df_full, tx_cancelled_lags)
df2['day'] = pd.to_datetime(df2['day'])

In [42]:
# Coupon ratio
df1['ratio_coupon_tx']=df1.coupon_count/df1.tx

In [44]:
def generate_coupon_tx_lags(df, tx_count):
    """
    Generate lag columns for coupon_tx ratio

    Args:
    - df: DataFrame containing transaction data
    - tx_count: Number of periods for lag calculation

    Returns:
    - df: DataFrame with added lag and variation columns
    """
    # Sort the dataset based on country, payer, and date
    df = df.sort_values(by=['country', 'payer', 'date'])

    # Create columns for each day's lag up to the defined maximum
    for i in range(1, tx_count + 1):
        col_name = f'ratio_coupon_tx_lag_{i}'
        # Shift the 'ratio_coupon_tx' column grouped by 'country' and 'payer'
        df[col_name] = df.groupby(['country', 'payer'])['ratio_coupon_tx'].shift(i)

    return df

In [45]:
# Call the function and assign the result back to df1
tx_ratio_coupon_tx_lags = 30
df1 = generate_coupon_tx_lags(df1, tx_ratio_coupon_tx_lags)

In [47]:
def generate_tx_lags(df, tx_count):
    """
    Generate lags columns for txs

    Args:
    - df: DataFrame containing transaction data
    - tx_count: Number of periods for lag calculation

    Returns:
    - df: DataFrame with added lag and variation columns
    """
    # Sort the dataset based on country, payer, and date
    df = df.sort_values(by=['country', 'payer', 'date'])

    # Create columns for each day's lag up to the defined maximum
    for i in range(1, tx_count + 1):
        col_name = f'tx_lag_{i}'
        # Shift the 'tx' column grouped by 'country' and 'payer'
        df[col_name] = df.groupby(['country', 'payer'])['tx'].shift(i)

    return df

In [48]:
# Call the function and assign the result back to df1
tx_lags = 30
df1 = generate_tx_lags(df1, tx_lags)

In [50]:
def generate_margin_lags(df, margin_lags):
    """
    Generate lag columns for margin

    Args:
    - df: DataFrame containing transaction data
    - margin_lags: Number of periods for lag calculation

    Returns:
    - df: DataFrame with added lag columns for margin
    """
    # Sort the dataset based on country, payer, and date
    df = df.sort_values(by=['country', 'payer', 'date'])

    # Create columns for each day's lag up to the defined maximum
    for i in range(1, margin_lags + 1):
        col_name = f'margin_lag_{i}'
        # Shift the 'margin' column grouped by 'country' and 'payer'
        df[col_name] = df.groupby(['country', 'payer'])['margin'].shift(i)

    return df

In [51]:
# Call the function and assign the result back to df1
margin_lags = 10
df1 = generate_margin_lags(df1, margin_lags)

In [54]:
# Merging dataframes
df_final = pd.merge(df1, df2, on=['date', 'payer', 'country', 'payer_country', 'amount'], how='inner')
df_final['date'] = pd.to_datetime(df_final['date'])

### DUMMIES

In [57]:
def mark_us_holidays(df):
    """
    Mark US holidays, excluding specified holidays and those with 'Observed'.

    Args:
        df (DataFrame): DataFrame containing a 'date' column in datetime format.

    Returns:
        DataFrame: DataFrame with an additional 'is_holiday' column, where 1 indicates a US holiday and 0 otherwise.
    """
    # Get the minimum and maximum dates
    min_date = df['date'].min().year
    max_date = df['date'].max().year + 1
    print(min_date, max_date)
    
    # Load US holidays
    us_holidays = holidays.US(years=range(min_date, max_date))

    # List of holidays to exclude
    holidays_to_exclude = ["Washington's Birthday", "Columbus Day"]
    
    # Filter holidays that should be excludeds
    filtered_holidays = {date: name for date, name in us_holidays.items() if name not in holidays_to_exclude and 'observed' not in name.lower() }
#    print(filtered_holidays) # Habilitando este print puedo ver que feriados son los que estamos marcando
    
    # Create a list of holiday dates
    holidays_list = list(filtered_holidays.keys())
    
    # Mark holidays in the DataFrame
    df['is_holiday'] = df['date'].isin(holidays_list).astype(int)
    
    return df

In [58]:
#Applying holiday function 
df_final = mark_us_holidays(df_final)

2021 2025


In [59]:
def mark_fourth_july(df):
    """
    Mark the Fourth of July in the DataFrame.

    Args:
        df (DataFrame): DataFrame containing a 'date' column in datetime format.

    Returns:
        DataFrame: DataFrame with an additional 'is_fourth_of_july' column.
    """
    # Check if the date is the Fourth of July
    df['is_fourth_of_july'] = (
        (df['date'].dt.month == 7) & (df['date'].dt.day == 4)
    ).astype(int)
    
    return df

In [60]:
df_final = mark_fourth_july(df_final)

In [61]:
def calculate_var_30ds(window, row, df_final):
    """
    Calculate the variable 'var_30ds' based on the average amount in the last 30 days.

    Parameters:
    window (int): The window size in days for the calculation.
    row (pandas.Series): The row containing the data for the current observation.
    df_final (pandas.DataFrame): The DataFrame containing the final dataset.

    Returns:
    float or None: The calculated variable 'var_30ds' if applicable, else None.
    """
    # Check if the current day is a holiday
    if row['is_holiday'] == 1:
        # Filter the DataFrame to get only the last 30 days for the current 'payer_country'
        filter_condition = (df_final['payer_country'] == row['payer_country']) & \
                           (df_final['date'] >= (row['date'] - pd.Timedelta(days=window))) & \
                           (df_final['date'] < row['date'])
        filtered_df = df_final[filter_condition]
        
        # Calculate the average amount for the current 'payer_country' in the last 30 days
        avg_amount = filtered_df['amount'].mean()

        # Print filtered DataFrame for debugging
#        if (row['payer_country'] == 'ELEKTRA (MEXICO)_MEXICO') and (row['date'] == datetime.strptime('2023-09-04', '%Y-%m-%d')):
#            print(filtered_df)
        
        # Calculate var_30ds according to the specified formula
        if avg_amount != 0 and row['amount'] != 0:
            var_30ds = float(row['amount']) / float(avg_amount) - 1  # Convert avg_amount to float before division
            return var_30ds  
        else:
            return 0
    else:
        return None

In [62]:
# Applying calculate_var_30ds to each holiday date
window = 30
df_final['var_30ds'] = df_final.apply(lambda row: calculate_var_30ds(window, row, df_final), axis=1)
df_final['var_30ds'] = df_final['var_30ds'].fillna(0)

In [64]:
def mark_christmas_day(df):
    """
    Marks Christmas Day (December 25th) in the DataFrame.

    This function identifies December 25th for each year present in the DataFrame
    and marks it as Christmas Day in the DataFrame.

    Args:
    df (DataFrame): The DataFrame containing the date column.

    Returns:
    DataFrame: The DataFrame with Christmas Day marked.

    Raises:
    ValueError: If the DataFrame does not contain a 'date' column.
    """
    # Check if the 'date' column exists in the DataFrame
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column.")

    # Create a new column to mark Christmas Day
    df['christmas_day'] = 0

    # Iterate over each year present in the DataFrame
    for year in df['date'].dt.year.unique():
        # Mark December 25th for the current year
        christmas_date = datetime(year, 12, 25)
        # Mark rows corresponding to Christmas Day for the current year
        df.loc[(df['date'].dt.year == year) & (df['date'].dt.month == 12) & (df['date'].dt.day == 25), 'christmas_day'] = 1

    return df

In [65]:
df_final = mark_christmas_day(df_final)

In [66]:
def mark_new_year_day(df):
    """
    Marks New year (January 1st) in the DataFrame.

    This function identifies January 1st for each year present in the DataFrame
    and marks it as Christmas Day in the DataFrame.

    Args:
    df (DataFrame): The DataFrame containing the date column.

    Returns:
    DataFrame: The DataFrame with New Year marked.

    Raises:
    ValueError: If the DataFrame does not contain a 'date' column.
    """
    # Check if the 'date' column exists in the DataFrame
    if 'date' not in df.columns:
        raise ValueError("DataFrame must contain a 'date' column.")

    # Create a new column to mark New Year's Day
    df['new_year_day'] = 0

    # Iterate over each year present in the DataFrame
    for year in df['date'].dt.year.unique():
        # Mark January 1st for the current year
        new_year_date = datetime(year, 1, 1)
        # Mark rows corresponding to New Year's Day for the current year
        df.loc[(df['date'].dt.year == year) & (df['date'].dt.month == 1) & (df['date'].dt.day == 1), 'new_year_day'] = 1

    return df

In [67]:
df_final = mark_new_year_day(df_final)

In [69]:
def mark_post_holiday(df):
    """
    Mark days after holidays. Usually post holiday days tend to rise sales

    Args:
        df (DataFrame): DataFrame containing a 'is_holiday' column indicating holidays.

    Returns:
        DataFrame: DataFrame with an additional 'post_holiday' column, where 1 indicates a day after a holiday.
    """
    post_holiday = []
    for idx, row in df.iterrows():
        is_holiday = row['is_holiday']
        if is_holiday == 1:
            post_holiday.append(0)
        else:
            if idx > 0 and df.loc[idx - 1, 'is_holiday'] == 1:
                post_holiday.append(1)
            else:
                post_holiday.append(0)
    df['post_holiday'] = post_holiday
    
    return df

In [70]:
df_final = mark_post_holiday(df_final)

In [71]:
def correcting_holidays(df, holidays_to_exclude):
    """
    Marks specified dates in the list as non-holidays.

    Args:
        df (DataFrame): DataFrame containing an 'is_holiday' column indicating holidays.
        holidays_to_exclude (list): List of dates in month-year format to be marked as non-holidays.

    Returns:
        DataFrame: Modified DataFrame with dates marked as non-holidays in the 'is_holiday' column.
    """
    # Convertir las fechas a formato mes-día (mm-dd) para comparación
    df['month_day'] = df['date'].dt.strftime('%m-%d')

    # Marcar como no festivo (0) los días que están en la lista de fechas a excluir
    df.loc[df['month_day'].isin(holidays_to_exclude), 'is_holiday'] = 0

    # Eliminar la columna temporal 'month_day'
    df.drop(columns=['month_day'], inplace=True)

    return df

In [72]:
holidays_to_exclude = ['07-04', '12-25', '01-01']  # Formato mes-día

# Applying fuction
df_final = correcting_holidays(df_final, holidays_to_exclude)

In [78]:
# Filling NaN in exogenous and lags
df_final.fillna(0, inplace=True)

  df_final.fillna(0, inplace=True)


### Saving df_final to S3

In [80]:
# S3 Settings

bucket = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
prefix_abt = 'ABTv3_update'
file_name = 'ABTv3_update.csv'

# S3 path
s3_path = f"s3://{bucket}/{prefix_abt}/{file_name}"

#Saving
wr.s3.to_csv(df_final, path=s3_path, index=False)

{'paths': ['s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/ABTv3_update/ABTv3_update.csv'],
 'partitions_values': {}}