## Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

## Load Dataset & Preprocessing

In [3]:
df = pd.read_csv('../data/supermarket_preprocessed.csv')
df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Sales,...,Time,Payment,cogs,gross margin percentage,gross income,Rating,Hour,Hour_label,Month,Weekday
0,750-67-8428,Alex,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,...,1900-01-01 13:08:00,Ewallet,522.83,4.761905,26.1415,9.1,13,01 PM,1,Saturday
1,226-31-3081,Giza,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,...,1900-01-01 10:29:00,Cash,76.4,4.761905,3.82,9.6,10,10 AM,3,Friday
2,631-41-3108,Alex,Yangon,Normal,Female,Home and lifestyle,46.33,7,16.2155,340.5255,...,1900-01-01 13:23:00,Credit card,324.31,4.761905,16.2155,7.4,13,01 PM,3,Sunday
3,123-19-1176,Alex,Yangon,Member,Female,Health and beauty,58.22,8,23.288,489.048,...,1900-01-01 20:33:00,Ewallet,465.76,4.761905,23.288,8.4,20,08 PM,1,Sunday
4,373-73-7910,Alex,Yangon,Member,Female,Sports and travel,86.31,7,30.2085,634.3785,...,1900-01-01 10:37:00,Ewallet,604.17,4.761905,30.2085,5.3,10,10 AM,2,Friday


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Sales                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

In [5]:
df['Date'] = pd.to_datetime(df['Date'])

In [6]:
daily = (
    df.groupby(df['Date'].dt.date)
    .agg({'Sales': 'sum'})
    .reset_index()
    .rename(columns={'Date': 'ds', 'Sales': 'y'})
    )

In [8]:
daily.dtypes

ds     object
y     float64
dtype: object

In [None]:
daily['ds'] = pd.to_datetime(daily['ds'])
daily = daily.sort_values('ds').reset_index(drop=True)

## Feature Engineering

In [10]:
def add_time_features(df, target_col='y'):
    df = df.copy()
    df['dow'] = df['ds'].dt.dayofweek
    df['dom'] = df['ds'].dt.day
    df['month'] = df['ds'].dt.month
    df['weekofyear'] = df['ds'].dt.isocalendar().week.astype(int)

    for lag in  [1,7,14,30]:
        df[f'lag_{lag}'] = df[target_col].shift(lag)

    df['roll7_mean'] = df[target_col].shift(1).rolling(window=7, min_periods=1).mean()
    df['roll7_std'] = df[target_col].shift(1).rolling(window=7, min_periods=1).std()
    df['roll30_mean'] = df[target_col].shift(1).rolling(window=30, min_periods=1).mean()
    df['roll30_std'] = df[target_col].shift(1).rolling(window=30, min_periods=1).std()

    df = df.dropna().reset_index(drop=True)
    return df

data = add_time_features(daily, target_col='y')


## Train and Test Split Data

In [11]:
split_idx = int(len(data) * 0.8)
train_df = data.iloc[:split_idx].copy()
test_df = data.iloc[split_idx:].copy()

In [12]:
features_cols = [c for c in data.columns if c not in ['ds', 'y']]
X_train = train_df[features_cols]
y_train = train_df['y']
X_test = test_df[features_cols]
y_test = test_df['y']

## Pipelines & Preprocessor

In [15]:
num_features = features_cols

preprocess = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), num_features)
    ],
    remainder='drop'
)

In [16]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

pipe = Pipeline([
    ('preprocess', preprocess),
    ('model', rf)
])

In [17]:
tscv = TimeSeriesSplit(n_splits=5)

param_dist = {
    'model__n_estimators': [200, 400, 600, 800, 1000],
    'model__max_depth': [None, 6, 8, 10, 12, 16, 20],
    'model__min_samples_split': [2, 5, 10, 20],
    'model__min_samples_leaf': [1, 2, 4, 8],
    'model__max_features': ['auto', 'sqrt', 0.3, 0.5, 0.7],
    'model__bootstrap': [True, False]
}

In [18]:
random_search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=40,                
    scoring='neg_mean_absolute_error',
    cv=tscv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

In [19]:
random_search.fit(X_train, y_train)


Fitting 5 folds for each of 40 candidates, totalling 200 fits


45 fits failed out of a total of 200.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Asus\OneDrive\Documents\Final _Project_Dibimbing\myvenv\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Asus\OneDrive\Documents\Final _Project_Dibimbing\myvenv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Asus\OneDrive\Documents\Final _Project_Dibimbing\myvenv\Lib\site-packages\sklearn\pipeline.py", line 663, in fit
    self._fina

0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_distributions,"{'model__bootstrap': [True, False], 'model__max_depth': [None, 6, ...], 'model__max_features': ['auto', 'sqrt', ...], 'model__min_samples_leaf': [1, 2, ...], ...}"
,n_iter,40
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,TimeSeriesSpl...est_size=None)
,verbose,1
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,transformers,"[('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,400
,criterion,'squared_error'
,max_depth,16
,min_samples_split,10
,min_samples_leaf,8
,min_weight_fraction_leaf,0.0
,max_features,0.7
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True
