In [6]:
# for basic data manipulations
import pandas as pd
import numpy as np

# for handling file paths
import os

# for getting clustered data
from cluster import get_daily_sales

# for time series preprocessing
from scipy.signal import savgol_filter

# for suppressing warnings
# NOTE(review): called without a category or message, this filter silences
# every warning raised after this cell, including deprecation notices from
# pandas/scipy -- consider narrowing it to the specific warnings that are noisy.
import warnings 
warnings.filterwarnings('ignore')


In [7]:
def add_cluster_idx(df, idx):
    """Tag every row of ``df`` with the label of the cluster it belongs to.

    The string ``cluster_<idx>`` is written to a ``cluster_id`` column, which
    is created if missing and overwritten otherwise.

    Note: ``df`` is modified in place; the very same object is returned so
    the call can be used inside ``map``/comprehensions.
    """
    cluster_label = f"cluster_{idx}"
    df["cluster_id"] = cluster_label
    return df

In [8]:
# Clusters to load. Kept in a single place so the loading step and the
# labelling step below can never drift out of sync.
CLUSTER_IDS = [1, 5, 7]

# One frame per cluster; reset_index() turns the index returned by
# get_daily_sales into ordinary column(s).
sales = [get_daily_sales(cluster).reset_index() for cluster in CLUSTER_IDS]

# Label each frame with its cluster (add_cluster_idx returns the frame it was given).
sales_ = [add_cluster_idx(frame, cluster) for frame, cluster in zip(sales, CLUSTER_IDS)]

# Stack all clusters into one long frame with a fresh 0..n-1 row index.
data = pd.concat(sales_, ignore_index=True)

In [9]:
# Savitzky-Golay smoothing parameters (window length is counted in rows).
SMOOTH_WINDOW = 9
SMOOTH_POLYORDER = 2


def _smooth_series(values):
    """Smooth one cluster's sales with a Savitzky-Golay filter.

    Series shorter than the filter window cannot be filtered (scipy raises in
    its default ``mode='interp'``), so they are returned unsmoothed as floats.
    """
    if len(values) < SMOOTH_WINDOW:
        return np.asarray(values, dtype=float)
    return savgol_filter(values, window_length=SMOOTH_WINDOW, polyorder=SMOOTH_POLYORDER)


data["date"] = pd.to_datetime(data["date"])

# Smooth each cluster separately and in chronological order. `data` holds the
# clusters stacked one after another, so filtering the whole column at once
# would let the window blend the end of one cluster into the start of the next.
# The result is aligned back to `data` on the (unique) row index.
# NOTE: this overwrites TotalSales, so re-running the cell smooths the already
# smoothed values again -- re-run from the loading cell instead.
data["TotalSales"] = (
    data.sort_values(["cluster_id", "date"])
    .groupby("cluster_id")["TotalSales"]
    .transform(_smooth_series)
)

# Day offset from the earliest date across all clusters.
data["time_idx"] = (data["date"] - data["date"].min()).dt.days

# Calendar features, stored as string-valued categoricals.
data["month"] = data["date"].dt.month.astype(str).astype("category")
data["year"] = data["date"].dt.year.astype(str).astype("category")
data["day_of_week"] = data["date"].dt.dayofweek.astype(str).astype("category")
data["day_of_month"] = data["date"].dt.day.astype(str).astype("category")
# dayofweek is 0=Monday .. 6=Sunday, so >= 5 marks Saturday and Sunday.
data["is_weekend"] = (data["date"].dt.dayofweek >= 5).astype(int)

In [10]:
# Ensure `date` is a datetime column, then order the rows chronologically
# inside each cluster so that shift/rolling operations follow time order.
data["date"] = pd.to_datetime(data["date"])
data = data.sort_values(by=["cluster_id", "date"])

# per-cluster time-series features
def create_features(group, windows=(3, 7, 14, 30)):
    """Return a copy of ``group`` with lag/rolling/EMA/difference features.

    Meant to be called on the rows of a single series (e.g. one cluster),
    already sorted in chronological order.

    Parameters
    ----------
    group : pandas.DataFrame
        Must contain a ``TotalSales`` column.
    windows : iterable of int, optional
        Rolling window lengths, in rows. For each ``w`` the columns
        ``rolling_mean_<w>`` and ``rolling_std_<w>`` are added.

    Returns
    -------
    pandas.DataFrame
        Copy of ``group`` with the added columns ``lag_1``, ``lag_7``,
        ``rolling_mean_<w>``/``rolling_std_<w>`` for every window, ``ema_7``
        and ``daily_diff``. The input frame is not modified.

    Notes
    -----
    ``lag_*``, ``daily_diff`` and the first ``rolling_std_*`` value are NaN at
    the start of the series; callers are expected to drop or fill them.
    """
    # Work on a copy: pandas does not support functions that mutate the
    # object handed to them by groupby.apply.
    group = group.copy()
    sales = group["TotalSales"]

    # 1st and 7th lag
    group["lag_1"] = sales.shift(1)
    group["lag_7"] = sales.shift(7)

    # rolling mean and sd
    for window in windows:
        # min_periods=1 gives early rows a statistic over the observations
        # seen so far. Back-filling instead would copy a value computed from
        # later rows into earlier ones, i.e. leak future information.
        rolling = sales.rolling(window, min_periods=1)
        group[f"rolling_mean_{window}"] = rolling.mean()
        # Sample std needs two points, so the very first row stays NaN.
        group[f"rolling_std_{window}"] = rolling.std()

    # exponential moving average (recursive form, alpha = 2 / (span + 1))
    group["ema_7"] = sales.ewm(span=7, adjust=False).mean()

    # first order difference
    group["daily_diff"] = sales.diff(1)

    return group

# Build the features one cluster at a time. Iterating over the groups instead
# of calling groupby(...).apply keeps the `cluster_id` column in the result on
# every pandas version: apply operating on the grouping column is deprecated
# since pandas 2.2. Row labels are preserved; rows come out cluster by cluster
# in sorted key order.
feature_frames = [
    create_features(cluster_rows) for _, cluster_rows in data.groupby("cluster_id")
]
data = pd.concat(feature_frames)

# Remove every row that has a NaN in any column (the lag and difference
# features are undefined for the first rows of each cluster).
data = data.dropna()

In [13]:
# Order all rows by their day offset, then renumber them from 0; the previous
# row labels are kept as a regular `index` column rather than discarded.
data = data.sort_values("time_idx").reset_index(drop=False)

In [15]:
# Print a summary of the feature table: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1791 entries, 0 to 1790
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            1791 non-null   int64         
 1   date             1791 non-null   datetime64[ns]
 2   TotalSales       1791 non-null   float64       
 3   cluster_id       1791 non-null   object        
 4   time_idx         1791 non-null   int64         
 5   month            1791 non-null   category      
 6   year             1791 non-null   category      
 7   day_of_week      1791 non-null   category      
 8   day_of_month     1791 non-null   category      
 9   is_weekend       1791 non-null   int64         
 10  lag_1            1791 non-null   float64       
 11  lag_7            1791 non-null   float64       
 12  rolling_mean_3   1791 non-null   float64       
 13  rolling_std_3    1791 non-null   float64       
 14  rolling_mean_7   1791 non-null   float64