In [1]:
# Libraries
import os
import pandas as pd
from scipy.stats import zscore
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
# Resolve the project data folder underneath the user's home directory.
# USERPROFILE is normally only defined on Windows; fall back to the generic
# home directory so os.path.join() is never handed None.
user = os.getenv('USERPROFILE') or os.path.expanduser('~')
# Pass each folder as its own component instead of embedding backslashes in a
# single literal: '\E', '\Y', '\P' and '\D' are invalid escape sequences that
# newer Python versions warn about. On Windows the resulting path is unchanged.
data_path = os.path.join(
    user,
    'OneDrive - National University of Singapore',
    'EBAC',
    'Year 1 Semester 1',
    'Project',
    'Data',
)
# The CSV is opened by bare file name later on, so make this the working
# directory.
os.chdir(data_path)

In [2]:
# Load the weekly retail extract from the working directory; the column at
# position 0 is parsed as datetimes.
retail_df = pd.read_csv(
    'Retail_Week_NUS.csv',
    parse_dates=[0],
)
# Preview the first rows.
retail_df.head()

Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B


# Data Cleaning

**We only keep active SKUs (`MSTAE == 'AC'`) and exclude the `General` and `Service` categories in `H1`.**

In [3]:
# Keep only active SKUs (MSTAE == 'AC') whose H1 category is neither
# 'General' nor 'Service'.
is_active = retail_df['MSTAE'] == 'AC'
is_excluded_h1 = retail_df['H1'].isin(['General', 'Service'])
# .copy() makes the filtered result an independent frame, so adding columns to
# it afterwards does not raise pandas' SettingWithCopyWarning.
retail_df = retail_df.loc[is_active & ~is_excluded_h1].copy()

**Create Year and Week**

In [4]:
# Derive the calendar year and the week number from the sale date.
# .assign() builds a new frame instead of writing columns into a frame that
# was produced by filtering, which avoids pandas' SettingWithCopyWarning.
retail_df = retail_df.assign(
    YEAR=retail_df['DATE'].dt.year,
    # '%W' gives a zero-padded string: week of the year, Monday as first day.
    WEEK=retail_df['DATE'].dt.strftime('%W'),
)
print(retail_df.shape)
retail_df.head()

(1391820, 12)


Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER,YEAR,WEEK
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B,2018,26
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B,2018,26
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B,2018,26
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B,2018,26
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B,2018,26


**Insert weeks with no sales (fill with 0)<br>Not applied at present, because the zero-filled weeks affect the standard deviation.**

In [5]:
# Total quantity sold per (YEAR, WEEK, MATERIAL), summed over every row that
# shares those keys. The keys are kept as ordinary columns.
weekly_keys = ['YEAR', 'WEEK', 'MATERIAL']
retail_group = retail_df.groupby(weekly_keys, as_index=False)['QTY_SOLD'].sum()

In [6]:
# Preview the first rows of the weekly totals per material.
retail_group.head()

Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD
0,2018,26,101,2.0
1,2018,26,112,1.0
2,2018,26,116,8.0
3,2018,26,213,7.0
4,2018,26,219,1.0


In [7]:
# Disabled alternative (zero-fill weeks with no sales): pivot to one column per
# MATERIAL so that missing (YEAR, WEEK, MATERIAL) combinations become 0.
# NOTE(review): pivot_table aggregates with the mean by default; if this is
# re-enabled, pass aggfunc='sum' to match the groupby-sum in the earlier cell.
# retail_group = retail_df.pivot_table(index=['YEAR', 'WEEK'], columns=[
#     'MATERIAL'], values='QTY_SOLD', fill_value=0).reset_index()

In [8]:
# Disabled companion to the pivot: melt the wide table back to long form,
# keeping YEAR and WEEK as identifiers and storing the values in QTY_SOLD.
# retail_group = retail_group.melt(
#     id_vars=['YEAR', 'WEEK'], value_vars=retail_group.columns[2:], value_name='QTY_SOLD')

In [9]:
# print(retail_group.shape)
# retail_group.head()

# Seasonal Products

**Sales of items are affected by:**
- Economy at times
- Nature of Item (Seasonal or Non-seasonal)
- Promotions or Discounts
- Competition from a rival
- Competition from a substitute product
- Special events such as the Super Bowl, Thanksgiving, New Year, etc., which vary from place to place

In [10]:
# Average weekly quantity for each material within each year, taken over the
# weeks that are present in retail_group.
avg_df = (
    retail_group
    .groupby(['MATERIAL', 'YEAR'])
    .agg(AVG_QTY=('QTY_SOLD', 'mean'))
    .reset_index()
)
avg_df.head()

Unnamed: 0,MATERIAL,YEAR,AVG_QTY
0,101,2018,1.454545
1,101,2019,1.783784
2,101,2020,2.275862
3,101,2021,1.714286
4,102,2018,1.0


In [11]:
# Attach each material's yearly average (AVG_QTY) to every weekly row.
retail_merged = pd.merge(
    retail_group,
    avg_df,
    on=['YEAR', 'MATERIAL'],
    how='left',
)
retail_merged.head()

Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD,AVG_QTY
0,2018,26,101,2.0,1.454545
1,2018,26,112,1.0,17.214286
2,2018,26,116,8.0,35.821429
3,2018,26,213,7.0,30.964286
4,2018,26,219,1.0,52.607143


**Create seasonal index**

In [12]:
# Seasonal index: the week's quantity relative to the material's average
# weekly quantity for that year.
retail_merged['SEASONAL_INDEX'] = retail_merged['QTY_SOLD'].div(
    retail_merged['AVG_QTY'])

In [13]:
# Spread of the seasonal index for each material within each year, stored in
# a column named STD.
std_df = (
    retail_merged
    .groupby(['MATERIAL', 'YEAR'])['SEASONAL_INDEX']
    .std()
    .rename('STD')
    .reset_index()
)

In [14]:
# Attach the per-material, per-year STD to every weekly row.
retail_merged = pd.merge(
    retail_merged,
    std_df,
    on=['MATERIAL', 'YEAR'],
    how='left',
)

**Remove material-years whose `STD` is missing (for example, the material was sold in only one week of that year); this may be caused by a lack of inventory.**

In [15]:
# Discard weekly rows for which no STD could be computed.
rows_with_std = retail_merged.dropna(subset=['STD'])

# Collapse to a single STD per (YEAR, MATERIAL), then order by year and STD,
# both descending. From here on retail_merged is a one-column frame indexed by
# (YEAR, MATERIAL) rather than the weekly table.
std_per_year = rows_with_std.groupby(['YEAR', 'MATERIAL'])['STD'].mean()
retail_merged = std_per_year.to_frame().sort_values(
    by=['YEAR', 'STD'], ascending=False)

In [28]:
# Cut-offs applied to the per-year STD of the seasonal index. They are named
# here so they can be tuned in one place instead of being buried in the filters.
HIGH_STD_THRESHOLD = 2  # STD at or above this goes into high_std
LOW_STD_THRESHOLD = 1   # STD strictly below this goes into low_std

high_std = retail_merged.loc[retail_merged['STD'] >= HIGH_STD_THRESHOLD]
low_std = retail_merged.loc[retail_merged['STD'] < LOW_STD_THRESHOLD]

**Find the items that are seasonal (and those that are non-seasonal) in both 2019 and 2020**

In [29]:
def _present_in_both_years(std_frame):
    """Inner-join the 2019 and 2020 slices of ``std_frame`` on their index.

    ``std_frame`` is expected to be indexed by (YEAR, MATERIAL). Only entries
    that appear under both years are kept; the 2019 value ends up in ``STD_x``
    and the 2020 value in ``STD_y``.
    """
    from_2019 = std_frame.loc[2019]
    from_2020 = std_frame.loc[2020]
    return pd.merge(from_2019, from_2020, left_index=True, right_index=True)


s_products = _present_in_both_years(high_std)
non_s_products = _present_in_both_years(low_std)

In [32]:
# Display the materials whose STD is below 1 in both 2019 and 2020.
non_s_products

Unnamed: 0_level_0,STD_x,STD_y
MATERIAL,Unnamed: 1_level_1,Unnamed: 2_level_1
102577,0.998800,0.628141
103455,0.998710,0.844054
61148,0.998692,0.817783
61669,0.998108,0.837293
304,0.995885,0.547677
...,...,...
103779,0.000000,0.300000
103901,0.000000,0.655871
103955,0.000000,0.433013
200302,0.000000,0.000000
