In [15]:
!pip install awswrangler
!pip install skforecast
!pip install pmdarima

[0m

In [16]:
import awswrangler as wr
import pandas as pd
import boto3
import pickle
from io import BytesIO
from io import StringIO
import joblib
import os
from datetime import datetime, timedelta
import warnings

# skforecast
from skforecast.Sarimax import Sarimax
from skforecast.ForecasterSarimax import ForecasterSarimax
from skforecast.model_selection_sarimax import backtesting_sarimax
from skforecast.model_selection_sarimax import grid_search_sarimax
from sklearn.metrics import mean_absolute_error


### Load ABT and .pkl

In [17]:
# Set up S3 client
client = boto3.client('s3')
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
#path = 'ABTv3_update/top_payers_abt_v3update' # Folder under analysis
path = 'ABTv3_updatetop_payers_abt_v3update/' # Folder under analysis

#ABTversion = 'v3_update'
ABTversion = 'v6'
path_ABT = f'ABT{ABTversion}/' 

In [18]:
# Specify the CSV file key
csv_key = None

# List objects in the S3 path
response = client.list_objects(Bucket=bucket_name, Prefix=path_ABT)

# Find the CSV file in the S3 path
for obj in response.get('Contents', []):
    if obj['Key'].endswith('.csv'):
        csv_key = obj['Key']
        break
        
# Check if CSV file is found
if csv_key is not None:
    # Read CSV content from S3
    csv_response = client.get_object(Bucket=bucket_name, Key=csv_key)
    csv_content = csv_response['Body'].read().decode('utf-8')

    # Transform CSV content to DataFrame
    data = pd.read_csv(StringIO(csv_content))
    data['date'] = pd.to_datetime(data['date']).dt.date
    print("CSV file loaded")
else:
    print("No CSV file found in the specified S3 path.")

CSV file loaded


In [19]:
data['date'] = pd.to_datetime(data['date'])

In [20]:
data.date.max()

Timestamp('2024-02-11 00:00:00')

In [21]:
# Mark with 1 in 'day_of_the_dead' when 'date' is Nov. 2
data.loc[data['date'].dt.month.eq(11) & data['date'].dt.day.eq(2), 'day_of_the_dead'] = 1

### PARQUET

In [22]:
#uri='s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/abt_parquet/dt=2024-05-07/'

In [23]:
#data=wr.s3.read_parquet(uri)

In [25]:
data.date.max()

Timestamp('2023-12-20 00:00:00')

In [26]:
data.payer_country.nunique()

133

## LOAD .pkl

In [27]:
elements = client.list_objects(Bucket=bucket_name, Prefix=path)

#Listing pkl files
pkl_files = [obj['Key'] for obj in elements.get('Contents', []) if obj['Key'].endswith('.pkl')
                and ('MODEL_2d' in obj['Key']) and not obj['Key'].endswith('_7d.pkl')]  # File structure 

In [28]:
len(pkl_files)

1

In [29]:
pkl_files

['ABTv3_updatetop_payers_abt_v3update/TELECOMM TELEGRAFOS (APPRIZA)_MEXICO/MODEL_2d_TELECOMM TELEGRAFOS (APPRIZA)_MEXICO_2dv2.pkl']

In [30]:
today = datetime.now().date()

### Loop to process all

In [31]:
# Initialize an empty DataFrame to store the results
df_temp = pd.DataFrame(columns=['date', 'pred', 'payer_country', 'model'])

In [32]:
# Payers to present errors in the inference process.
payer_countries_pinched = []

In [33]:
#data=data.loc[data.payer_country=='ELEKTRA (MEXICO)_MEXICO']

In [34]:
i = 1

# Iterate over pkl files
for file_key in pkl_files:
    # Extract payer_country from file_key
    payer_country = file_key.split('/')[1]
    print(payer_country)
    
    # Download pkl file from S3 and load it into memory
    response = client.get_object(Bucket=bucket_name, Key=file_key)
    buffer = BytesIO(response['Body'].read())
    forecaster = joblib.load(buffer)
    
    #### PAYER SETTING ####
    # Filter data for the specific payer_country
    datos = data[data['payer_country'] == payer_country].copy()
#    datos = data.loc[data.payer_country == payer_country]
    datos['date'] = pd.to_datetime(datos['date'])
    datos.set_index('date', inplace=True)
    datos = datos.asfreq('D')

    # Predictions settings
    last_window_date = forecaster.last_window.index[-1] + pd.Timedelta(days=1) #19/12/23 first day in last window
    test_date = forecaster.last_window.index[-1] + pd.Timedelta(days=38) # The first test day would be the day to predict 29/12/23-==11-(esto CRECE SIEMPRE UN DIA MAS)

    # Extract data for last window and test period
    data_last_window = datos.loc[last_window_date:test_date - pd.Timedelta(days=1)].copy() #From 19/12/23 to test minus one day 28/12/23
    data_last_window[forecaster.exog_col_names] = data_last_window[forecaster.exog_col_names].fillna(0)##extracting exogenous and filling missing
    data_test = datos.loc[test_date:test_date + pd.Timedelta(days=1)].copy()#29/12/2023 we have data in test (first day to predict) and their extension (2 days) (7D IN 8D MODEL)
    data_test[forecaster.exog_col_names] = data_test[forecaster.exog_col_names].fillna(0)##we fill in missings

    try:
        
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
        # Make predictions
        predictions = forecaster.predict(
                          steps            = 2, # Days to predict
                          exog             = data_test[forecaster.exog_col_names],
                          last_window      = data_last_window['amount'],
                          last_window_exog = data_last_window[forecaster.exog_col_names]
                      )
        # Store predictions in a temporary DataFrame
        df_temp = pd.DataFrame(predictions, columns=['pred']).reset_index()

    except:
        # If an exception occurs, set predictions to zero
        print("\033[1;31m" + f"Error processing {payer_country}" + "\033[0m")
        predictions = [0, 0]
        df_temp = pd.DataFrame({'index': [test_date, test_date + pd.Timedelta(days=1)], 'pred': predictions})
        payer_countries_pinched.append(payer_country)


    # Add additional columns
    df_temp['payer_country'] = payer_country
    df_temp['model'] = file_key.split('/')[-1]

    # Concatenate df_temp with the main DataFrame
    if i:
        temp_df = df_temp.copy()
        i = 0
    else:
        temp_df = pd.concat([temp_df, df_temp], ignore_index=True)

TELECOMM TELEGRAFOS (APPRIZA)_MEXICO
[1;31mError processing TELECOMM TELEGRAFOS (APPRIZA)_MEXICO[0m


In [35]:
datos.tail(18)

Unnamed: 0_level_0,payer_country,payer,country,tx,amount,coupon_count,gp,day_x,margin,max_day,...,var_tx_cancelled_lag_28,var_tx_cancelled_lag_29,is_holiday,var_30ds,post_holiday,is_fourth_of_july,christmas_day,new_year_day,thanksgiving_day,day_of_the_dead
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-03,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,1086,665224.66,21,8243.0265,2023-12-03,7.5903,2024-02-03,...,-16,7,0,0.0,0,0,0,0,0,0
2023-12-04,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,995,590756.26,23,7236.8952,2023-12-04,7.2733,2024-02-03,...,3,-16,0,0.0,0,0,0,0,0,0
2023-12-05,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,836,547966.06,18,1487.2508,2023-12-05,1.779,2024-02-03,...,0,3,0,0.0,0,0,0,0,0,0
2023-12-06,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,777,506516.58,16,5493.2843,2023-12-06,7.0699,2024-02-03,...,15,0,0,0.0,0,0,0,0,0,0
2023-12-07,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,765,521352.94,10,5244.9748,2023-12-07,6.8562,2024-02-03,...,-14,15,0,0.0,0,0,0,0,0,0
2023-12-08,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,905,537220.52,14,4357.7974,2023-12-08,4.8152,2024-02-03,...,19,-14,0,0.0,0,0,0,0,0,0
2023-12-09,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,905,547987.9,19,4044.0445,2023-12-09,4.4686,2024-02-03,...,-12,19,0,0.0,0,0,0,0,0,0
2023-12-10,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,1199,679383.31,10,5414.7753,2023-12-10,4.5161,2024-02-03,...,-12,-12,0,0.0,0,0,0,0,0,0
2023-12-11,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,1050,595237.32,16,2923.2505,2023-12-11,2.784,2024-02-03,...,15,-12,0,0.0,0,0,0,0,0,0
2023-12-12,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,TELECOMM TELEGRAFOS (APPRIZA),MEXICO,699,433461.13,10,2609.4072,2023-12-12,3.7331,2024-02-03,...,-20,15,0,0.0,0,0,0,0,0,0


In [36]:
data_last_window.index.min()

Timestamp('2023-12-19 00:00:00')

In [37]:
forecaster.exog_col_names

['tx_lag_2',
 'tx_lag_3',
 'tx_lag_5',
 'tx_lag_7',
 'tx_lag_10',
 'tx_lag_14',
 'tx_lag_21',
 'margin_lag_10',
 'is_holiday',
 'is_fourth_of_july',
 'var_30ds',
 'christmas_day',
 'new_year_day']

In [38]:
temp_df[temp_df['pred'] < 0]

Unnamed: 0,index,pred,payer_country,model


In [39]:
# Replace values in 'pred' column with 0 where 'pred' is less than 0
temp_df.loc[temp_df['pred'] < 0, 'pred'] = 0

In [40]:
# Convert 'index' column to date format
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date

In [41]:
temp_df.head(4)

Unnamed: 0,index,pred,payer_country,model
0,2024-01-25,0,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,MODEL_2d_TELECOMM TELEGRAFOS (APPRIZA)_MEXICO_...
1,2024-01-26,0,TELECOMM TELEGRAFOS (APPRIZA)_MEXICO,MODEL_2d_TELECOMM TELEGRAFOS (APPRIZA)_MEXICO_...


In [42]:
stop

NameError: name 'stop' is not defined

In [None]:
temp_df.rename(columns={'index':'pred_date'}, inplace=True)

In [None]:
temp_df

In [None]:
data_test

In [None]:
forecaster.exog_col_names

In [None]:
forecaster.last_window

In [None]:
#print(pkl_files)

In [None]:
data_last_window.index.max()
#test_date

In [None]:
payer_countries_pinched

In [None]:
temp_df[temp_df['pred'] < 0]

In [None]:
# Replace values in 'pred' column with 0 where 'pred' is less than 0
temp_df.loc[temp_df['pred'] < 0, 'pred'] = 0

In [None]:
# Convert 'index' column to date format
temp_df['index'] = pd.to_datetime(temp_df['index']).dt.date

In [None]:
temp_df.head(4)

In [None]:
temp_df.rename(columns={'index':'pred_date'}, inplace=True)

In [None]:
temp_df

In [None]:
stop

### ADD ID_COUNTRY AND ID_PAYER FROM DAILY_CHECK_GP TABLE

In [None]:
# DB Setting
bucket_name = 's3://viamericas-datalake-dev-us-east-1-283731589572-athena/'
origin_name = 'AwsDataCatalog'
database_name= 'analytics'
table_name = 'daily_check_gp'

In [None]:
df = wr.athena.read_sql_table(
    table=table_name,
    database=database_name,
)

In [None]:
df['payer_country'] = df['payer'] + '_' + df['country']

In [None]:
df_id = df[['payer_country', 'id_main_branch', 'id_country']].drop_duplicates().dropna(subset='id_main_branch')

In [None]:
len(df_id)

In [None]:
df_final = pd.merge(temp_df, df_id, on='payer_country', how='left')

In [None]:
# Insertar la columna 'processing_date' al principio del DataFrame
df_final.insert(0, 'processing_date', today)

In [None]:
df_final['processing_date'] = pd.to_datetime(df_final['processing_date'] )

In [None]:
df_final['id_country'].isnull().sum()

In [None]:
df_final['id_main_branch'].isnull().sum()

In [None]:
# Splitting 'payer' & 'country'
df_final[['payer', 'country']] = df_final['payer_country'].str.split('_', expand=True)

In [None]:
df_final.head()

In [None]:
df_final.info()

In [None]:
df_final.pred.sum()

### SAVE THE INFERENCES OF JANUARY 28TH AND 29TH
##### THIS IS THE INPUT FOR ADDING MAPES TO HISTORIC MAPES EXCEL

In [None]:
bucket_name = 'viamericas-datalake-dev-us-east-1-283731589572-analytics'
path_inference = '/ABTv3_update/' # Folder under analysis

In [None]:
#wr.s3.to_csv(
    df=df_final,
    path='s3://viamericas-datalake-dev-us-east-1-283731589572-analytics/ABTv3_update/Inferences_2d/predictions_2d.csv',
    dataset=False,
    index=False
)

In [None]:
control=df_final.loc[df_final.payer_country=="GIROSMEX_MEXICO"]

In [None]:
control