# Data Aggregation

## Import Packages

In [None]:
import pandas as pd
import datetime

## Import Datasets

### Main

In [None]:
# Load the cleaned order table from the dataset folder and preview its first rows.
Order_df = pd.read_csv(filepath_or_buffer='./dataset/Order_clean.csv')
Order_df.head(n=5)

In [None]:
# Load the cleaned order-item table from the dataset folder and preview its first rows.
OrderItem_df = pd.read_csv(filepath_or_buffer='./dataset/OrderItem_clean.csv')
OrderItem_df.head(n=5)

### Extra

In [None]:
# Load the holiday table and convert its 'Date' column to datetime64[ns] in one
# chained expression so it can be subtracted from sale dates later.
Holiday_df = (
    pd.read_csv('./dataset/Holidays.csv')
    .astype({'Date': 'datetime64[ns]'})
)
Holiday_df

## Data Aggregation

### Merged

In [None]:
# Inner-join every order item with its order row via the shared 'POSNo' column.
Merged_df = OrderItem_df.merge(Order_df, on='POSNo')
Merged_df

### Daily

In [None]:
# Collapse the merged rows to one row per (BranchID, ItemID, POSDate):
#   Qty_sum      - summed 'Qty'
#   PricePerItem - first 'Price' seen in the group
#   Discount     - summed 'Discount', turned into a flag below
daily_totals = (
    Merged_df
    .groupby(['BranchID', 'ItemID', 'POSDate'])
    .agg(
        Qty_sum=('Qty', 'sum'),
        PricePerItem=('Price', 'first'),
        Discount=('Discount', 'sum'),
    )
    .sort_values(by=['BranchID', 'ItemID'])
    .reset_index()
)

# Dates become real datetimes so min/max and the .dt accessor work later on.
daily_totals['POSDate'] = daily_totals['POSDate'].astype('datetime64[ns]')
# Any non-zero discount total on the day counts as "discounted".
daily_totals['Discount'] = daily_totals['Discount'].map(bool)

cols = ['BranchID','ItemID','POSDate','PricePerItem','Discount','Qty_sum']
Daily_df = daily_totals[cols]
Daily_df

### Transactions

In [None]:
# Count, for every (BranchID, ItemID) pair, the number of daily rows it has in
# Daily_df; the count is exposed as 'TransactionDays'.
Transactions_df = (
    Daily_df
    .groupby(['BranchID', 'ItemID'])
    .agg(TransactionDays=('POSDate', 'count'))
    .sort_values(by=['BranchID', 'ItemID'])
    .reset_index()
)
Transactions_df

In [None]:
# Keep only the (BranchID, ItemID) pairs with at least 50 transaction days.
has_enough_days = Transactions_df['TransactionDays'] >= 50
Transactions_filtered = Transactions_df[has_enough_days]
Transactions_filtered

In [None]:
# Restrict Daily_df to the (BranchID, ItemID) pairs that survived the
# transaction-day filter, by testing membership of each row's pair in the
# MultiIndex built from Transactions_filtered.
kept_pairs = Transactions_filtered.set_index(['BranchID', 'ItemID']).index
row_pairs = Daily_df.set_index(['BranchID', 'ItemID']).index
Daily_filtered_df = Daily_df[row_pairs.isin(kept_pairs)]
Daily_filtered_df

### Holidays

In [None]:
# One row per calendar day between the earliest and latest POSDate in Daily_df.
calendar_days = pd.date_range(
    start=Daily_df['POSDate'].min(),
    end=Daily_df['POSDate'].max(),
)
holidays = pd.DataFrame({'POSDate': calendar_days.tolist()})
holidays

In [None]:
def affected_by_holiday(date, holidays):
    """Return True if *date* falls inside the influence window of any holiday.

    Parameters
    ----------
    date : datetime-like
        The day to test (anything ``pd.Timestamp`` accepts).
    holidays : pandas.DataFrame
        One row per holiday with the columns:
        ``Date`` (datetime64), ``Preparation`` (number of days before the
        holiday that count as affected), ``Aftermath`` (number of days after
        the holiday that count as affected) and ``DDay`` (truthy when the
        holiday date itself counts as affected -- assumed meaning, confirm
        against the Holidays.csv documentation).

    Returns
    -------
    bool
        True when at least one holiday affects *date*, otherwise False
        (including when *holidays* is empty).
    """
    # Signed distance from each holiday to `date`: negative before the
    # holiday, zero on the day itself, positive afterwards.
    offsets = pd.Timestamp(date) - holidays['Date']
    preparation = pd.to_timedelta(holidays['Preparation'], unit='D')
    aftermath = pd.to_timedelta(holidays['Aftermath'], unit='D')
    no_offset = pd.Timedelta(0)

    in_preparation = (offsets < no_offset) & (offsets >= -preparation)
    in_aftermath = (offsets > no_offset) & (offsets <= aftermath)
    # The DDay flag only applies on the holiday date itself; applying it to
    # every date outside the windows would mark the whole calendar as affected
    # as soon as one holiday has DDay set.
    on_holiday = (offsets == no_offset) & holidays['DDay'].astype(bool)

    return bool((in_preparation | in_aftermath | on_holiday).any())

# Flag every calendar day that is touched by at least one row of Holiday_df.
# Holiday_df is forwarded to affected_by_holiday as its second positional argument.
holidays['AffectedByHoliday'] = holidays['POSDate'].apply(affected_by_holiday, args=(Holiday_df,))
holidays

## Fill in the Blanks

### Full

In [None]:
# Build `complete`: a gap-free daily calendar for every (BranchID, ItemID) pair
# in Daily_df, carrying a price for every day.
# The four accumulators are extended once per pair and become the columns of `complete`.
branches_arr, items_arr, prices_arr, dates_arr = [], [], [], []

date_format = '%Y-%m-%d'
first_date = Daily_df['POSDate'].min()
last_date = Daily_df['POSDate'].max()
grouped = Daily_df.groupby(['BranchID', 'ItemID'])
key_cols = ['BranchID', 'ItemID', 'POSDate']

for (branch, item), group in grouped:
    earliest = group['POSDate'].min()
    latest = group['POSDate'].max()

    # The window opens on the 1st of the month of the pair's first sale,
    # but never before the first date present in the data.
    month_open = datetime.datetime.strptime(f'{earliest.year}-{earliest.month}-01', date_format)
    start_date = max(month_open, first_date)

    # The 28th plus 4 days always lands in the following month; stepping back
    # by that day-of-month yields the last day of the pair's final month.
    # The window never extends past the last date present in the data.
    spill = datetime.datetime.strptime(f'{latest.year}-{latest.month}-28', date_format) + datetime.timedelta(days=4)
    month_close = spill - datetime.timedelta(days=spill.day)
    end_date = min(month_close, last_date)

    date_arr = pd.date_range(start=start_date, end=end_date).tolist()
    n_days = len(date_arr)
    branch_arr = [branch] * n_days
    item_arr = [item] * n_days

    # Left-join the observed prices onto the calendar, then carry the last
    # known price forward and back-fill the days before the first sale.
    calendar = pd.DataFrame({'BranchID': branch_arr, 'ItemID': item_arr, 'POSDate': date_arr})
    calendar = calendar.merge(group[key_cols + ['PricePerItem']], on=key_cols, how='left')
    filled_price = calendar['PricePerItem'].ffill().bfill()

    branches_arr += branch_arr
    items_arr += item_arr
    dates_arr += date_arr
    prices_arr += filled_price.astype('int64').to_list()

complete = pd.DataFrame({
    'BranchID': branches_arr,
    'ItemID': items_arr,
    'POSDate': dates_arr,
    'PricePerItem': prices_arr,
})
complete

In [None]:
# Attach the observed daily sales to the gap-free calendar in `complete`.
# The join uses the identifying keys only: `complete` stores an int64-cast
# price, so also matching on PricePerItem would silently drop the sales of any
# item whose observed price is not a whole number. The calendar's (filled)
# price is the one kept. Row order follows `complete`, as with the previous
# right join.
key_cols = ['BranchID', 'ItemID', 'POSDate']
observed = Daily_df.drop(columns='PricePerItem')
Daily_full_filled = complete.merge(observed, on=key_cols, how='left')

# Add the per-day holiday flag.
Daily_full_filled = Daily_full_filled.merge(holidays, on=['POSDate'], how='left')

# Days without a sale: zero quantity, no discount. Cast Discount explicitly so
# the column is a real bool column rather than depending on fillna downcasting.
Daily_full_filled = Daily_full_filled.fillna({'Qty_sum': 0, 'Discount': False})
Daily_full_filled = Daily_full_filled.astype({'Qty_sum': 'int64', 'Discount': 'bool'})

# Calendar parts of the date, used as model features.
Daily_full_filled['POSDay'] = Daily_full_filled['POSDate'].dt.day
Daily_full_filled['POSMonth'] = Daily_full_filled['POSDate'].dt.month
Daily_full_filled['POSYear'] = Daily_full_filled['POSDate'].dt.year

cols = ['BranchID','ItemID','POSDate','POSDay','POSMonth','POSYear','PricePerItem','Discount','AffectedByHoliday','Qty_sum']
Daily_full_filled = Daily_full_filled[cols]
Daily_full_filled

### Filtered

In [None]:
# Build `complete`: a gap-free daily calendar for every (BranchID, ItemID) pair
# in Daily_filtered_df, carrying a price for every day.
# The four accumulators are extended once per pair and become the columns of `complete`.
branches_arr, items_arr, prices_arr, dates_arr = [], [], [], []

date_format = '%Y-%m-%d'
first_date = Daily_filtered_df['POSDate'].min()
last_date = Daily_filtered_df['POSDate'].max()
grouped = Daily_filtered_df.groupby(['BranchID', 'ItemID'])
key_cols = ['BranchID', 'ItemID', 'POSDate']

for (branch, item), group in grouped:
    earliest = group['POSDate'].min()
    latest = group['POSDate'].max()

    # The window opens on the 1st of the month of the pair's first sale,
    # but never before the first date present in the filtered data.
    month_open = datetime.datetime.strptime(f'{earliest.year}-{earliest.month}-01', date_format)
    start_date = max(month_open, first_date)

    # The 28th plus 4 days always lands in the following month; stepping back
    # by that day-of-month yields the last day of the pair's final month.
    # The window never extends past the last date present in the filtered data.
    spill = datetime.datetime.strptime(f'{latest.year}-{latest.month}-28', date_format) + datetime.timedelta(days=4)
    month_close = spill - datetime.timedelta(days=spill.day)
    end_date = min(month_close, last_date)

    date_arr = pd.date_range(start=start_date, end=end_date).tolist()
    n_days = len(date_arr)
    branch_arr = [branch] * n_days
    item_arr = [item] * n_days

    # Left-join the observed prices onto the calendar, then carry the last
    # known price forward and back-fill the days before the first sale.
    calendar = pd.DataFrame({'BranchID': branch_arr, 'ItemID': item_arr, 'POSDate': date_arr})
    calendar = calendar.merge(group[key_cols + ['PricePerItem']], on=key_cols, how='left')
    filled_price = calendar['PricePerItem'].ffill().bfill()

    branches_arr += branch_arr
    items_arr += item_arr
    dates_arr += date_arr
    prices_arr += filled_price.astype('int64').to_list()

complete = pd.DataFrame({
    'BranchID': branches_arr,
    'ItemID': items_arr,
    'POSDate': dates_arr,
    'PricePerItem': prices_arr,
})
complete

In [None]:
# Attach the observed daily sales of the filtered pairs to the gap-free
# calendar in `complete`.
# The join uses the identifying keys only: `complete` stores an int64-cast
# price, so also matching on PricePerItem would silently drop the sales of any
# item whose observed price is not a whole number. The calendar's (filled)
# price is the one kept. Row order follows `complete`, as with the previous
# right join.
key_cols = ['BranchID', 'ItemID', 'POSDate']
observed = Daily_filtered_df.drop(columns='PricePerItem')
Daily_filtered_filled = complete.merge(observed, on=key_cols, how='left')

# Add the per-day holiday flag.
Daily_filtered_filled = Daily_filtered_filled.merge(holidays, on=['POSDate'], how='left')

# Days without a sale: zero quantity, no discount. Cast Discount explicitly so
# the column is a real bool column rather than depending on fillna downcasting.
Daily_filtered_filled = Daily_filtered_filled.fillna({'Qty_sum': 0, 'Discount': False})
Daily_filtered_filled = Daily_filtered_filled.astype({'Qty_sum': 'int64', 'Discount': 'bool'})

# Calendar parts of the date, used as model features.
Daily_filtered_filled['POSDay'] = Daily_filtered_filled['POSDate'].dt.day
Daily_filtered_filled['POSMonth'] = Daily_filtered_filled['POSDate'].dt.month
Daily_filtered_filled['POSYear'] = Daily_filtered_filled['POSDate'].dt.year

cols = ['BranchID','ItemID','POSDate','POSDay','POSMonth','POSYear','PricePerItem','Discount','AffectedByHoliday','Qty_sum']
Daily_filtered_filled = Daily_filtered_filled[cols]
Daily_filtered_filled

### Weekly

In [None]:
# Roll the gap-free daily table up to ISO weeks.
# `assign` returns a new frame, so Daily_full_filled keeps its calendar-year
# POSYear and does not gain a POSWeek column (a plain `=` would only alias the
# daily frame and the column writes would modify it too).
iso_calendar = Daily_full_filled['POSDate'].dt.isocalendar()
Weekly_full_filled = Daily_full_filled.assign(
    POSWeek=iso_calendar.week,
    # ISO year, not calendar year: days around New Year belong to the ISO year
    # of the week they fall in.
    POSYear=iso_calendar.year,
)

# Per (branch, item, ISO year, ISO week): last price of the week, whether any
# day was discounted / holiday-affected, and the total quantity sold.
Weekly_full_filled = (
    Weekly_full_filled
    .groupby(['BranchID','ItemID','POSYear','POSWeek'])
    .agg(
        PricePerItem=('PricePerItem','last'),
        Discount=('Discount','any'),
        AffectedByHoliday=('AffectedByHoliday','any'),
        Qty_sum=('Qty_sum','sum'),
    )
    .reset_index()
)
cols = ['BranchID','ItemID','POSWeek','POSYear','PricePerItem','Discount','AffectedByHoliday','Qty_sum']
Weekly_full_filled = Weekly_full_filled[cols]
Weekly_full_filled

# Training Model

## Import Packages

In [None]:
import numpy as np
import psutil
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from dask.distributed import Client

from sklearn.preprocessing import OneHotEncoder
import pickle
import os

# Start a local dask.distributed client with default settings.
client = Client()
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined; fall back to the logical count, then to 1, so the
# min() below never receives None.
num_cpu_cores = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1
# Use at most 4 partitions.
num_partitions = min(num_cpu_cores, 4)

## Feature Selection

In [None]:
# Split each table into its feature columns (X) and the target 'Qty_sum' (y).
# The daily model uses the filtered table; the weekly model uses the full weekly table.
daily_feature_cols = ['BranchID', 'ItemID', 'POSDay', 'POSMonth', 'POSYear', 'PricePerItem', 'Discount', 'AffectedByHoliday']
weekly_feature_cols = ['BranchID', 'ItemID', 'POSWeek', 'POSYear', 'PricePerItem', 'Discount', 'AffectedByHoliday']

X_daily = Daily_filtered_filled[daily_feature_cols]
y_daily = Daily_filtered_filled['Qty_sum']

X_weekly = Weekly_full_filled[weekly_feature_cols]
y_weekly = Weekly_full_filled['Qty_sum']

## Encoding

In [None]:
## Daily

# One-hot encode BranchID / ItemID for the daily feature table.
# The encoder is fitted on the unfiltered daily table; handle_unknown='ignore'
# makes categories unseen at fit time encode as all zeros.
X_daily_full = Daily_full_filled[['BranchID', 'ItemID', 'POSDay', 'POSMonth', 'POSYear', 'PricePerItem', 'Discount', 'AffectedByHoliday']]
encoder_D = OneHotEncoder(sparse_output=True, handle_unknown='ignore')  # Using sparse output to save memory
encoder_D.fit(X_daily_full[['BranchID', 'ItemID']])

X_daily_encoded = encoder_D.transform(X_daily[['BranchID', 'ItemID']])

# Drop the raw ID columns now that they are encoded. Rebinding (rather than
# inplace=True) avoids modifying a column slice of Daily_filtered_filled in
# place, which raises SettingWithCopyWarning.
X_daily = X_daily.drop(columns=['BranchID', 'ItemID'])

# Convert the sparse matrix to a DataFrame that shares X_daily's index, so the
# column-wise concat below pairs rows by construction instead of relying on
# both frames happening to carry the same RangeIndex.
X_daily_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    X_daily_encoded,
    index=X_daily.index,
    columns=encoder_D.get_feature_names_out(['BranchID', 'ItemID']),
)

# Concatenate the remaining features with the one-hot columns
X_daily = pd.concat([X_daily, X_daily_encoded_df], axis=1)

# Display DataFrame
X_daily

In [None]:
## Weekly

# One-hot encode BranchID / ItemID for the weekly feature table.
# Here the encoder is fitted and applied on the same table.
encoder_W = OneHotEncoder(sparse_output=True, handle_unknown='ignore')  # Using sparse output to save memory
X_weekly_encoded = encoder_W.fit_transform(X_weekly[['BranchID', 'ItemID']])

# Drop the raw ID columns now that they are encoded. Rebinding (rather than
# inplace=True) avoids modifying a column slice of Weekly_full_filled in
# place, which raises SettingWithCopyWarning.
X_weekly = X_weekly.drop(columns=['BranchID', 'ItemID'])

# Convert the sparse matrix to a DataFrame that shares X_weekly's index, so the
# column-wise concat below pairs rows by construction instead of relying on
# both frames happening to carry the same RangeIndex.
X_weekly_encoded_df = pd.DataFrame.sparse.from_spmatrix(
    X_weekly_encoded,
    index=X_weekly.index,
    columns=encoder_W.get_feature_names_out(['BranchID', 'ItemID']),
)

# Concatenate the remaining features with the one-hot columns
X_weekly = pd.concat([X_weekly, X_weekly_encoded_df], axis=1)

# Display DataFrame
X_weekly

## Train Model

In [None]:
## Daily

# 80/20 random split with a fixed seed for reproducibility.
X_train_D, X_test_D, y_train_D, y_test_D = train_test_split(X_daily, y_daily, test_size=0.2, random_state=69)

# Configure the daily XGBoost regressor.
# early_stopping_rounds is set on the estimator: passing it to fit() is
# deprecated and rejected by newer XGBoost releases.
model_D = xgb.XGBRegressor(n_estimators=500, gamma=2, tree_method='hist',
                         eta=0.15, max_depth=3, min_child_weight=2, subsample=0.6,
                         early_stopping_rounds=10)

# Train with early stopping. Both sets are evaluated every round; XGBoost uses
# the last entry of eval_set (the held-out split) to decide when to stop.
# NOTE(review): the same held-out split is used for early stopping, so it is
# not an untouched test set for any later scoring.
eval_set_D = [(X_train_D, y_train_D), (X_test_D, y_test_D)]
model_D.fit(X_train_D, y_train_D, eval_set=eval_set_D, verbose=True)

In [None]:
## Weekly

# 80/20 random split with a fixed seed for reproducibility.
X_train_W, X_test_W, y_train_W, y_test_W = train_test_split(X_weekly, y_weekly, test_size=0.2, random_state=69)

# Configure the weekly XGBoost regressor.
# early_stopping_rounds is set on the estimator: passing it to fit() is
# deprecated and rejected by newer XGBoost releases.
# NOTE(review): learning_rate and eta are aliases for the same booster
# parameter but are given different values (0.05 vs 0.07); both are kept
# unchanged here -- decide which one is intended and remove the other.
model_W = xgb.XGBRegressor(n_estimators=600, gamma=3, tree_method='hist', learning_rate=0.05,
                         eta=0.07, max_depth=6, min_child_weight=4, subsample=0.8, reg_alpha = 0.2, reg_lambda=0.2,
                         early_stopping_rounds=10)

# Train with early stopping. Both sets are evaluated every round; XGBoost uses
# the last entry of eval_set (the held-out split) to decide when to stop.
# NOTE(review): the same held-out split is used for early stopping, so it is
# not an untouched test set for any later scoring.
eval_set_W = [(X_train_W, y_train_W), (X_test_W, y_test_W)]
model_W.fit(X_train_W, y_train_W, eval_set=eval_set_W, verbose=True)

## Save Model

In [None]:
# Pickle both fitted estimators into the working directory, daily first.
# NOTE: pickle files should only ever be loaded from a trusted source.
for pickle_path, fitted_model in (
    ('xgboost_model_daily.pkl', model_D),
    ('xgboost_model_weekly.pkl', model_W),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(fitted_model, f)

In [None]:
# Also store both models through XGBoost's own save_model, daily first.
for fitted_model, model_path in (
    (model_D, 'xgboost_model_daily.bst'),
    (model_W, 'xgboost_model_weekly.bst'),
):
    fitted_model.save_model(model_path)