In [1]:
# Libraries
import os
import pandas as pd
from scipy.stats import zscore
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
# Locate the project data folder under the user's OneDrive directory and make
# it the working directory, so later cells can open data files by bare name.
# USERPROFILE is normally only set on Windows; fall back to the home directory
# so that os.path.join never receives None.
user = os.getenv('USERPROFILE') or os.path.expanduser('~')
# Each folder is passed as its own component: this avoids backslash escape
# sequences inside a string literal and lets os.path.join pick the separator.
data_path = os.path.join(
    user,
    'OneDrive - National University of Singapore',
    'EBAC',
    'Year 1 Semester 1',
    'Project',
    'Data',
)
os.chdir(data_path)

In [2]:
# Read the weekly retail extract from the working directory; the first CSV
# column is parsed as dates.
retail_df = pd.read_csv(
    'Retail_Week_NUS.csv',
    parse_dates=[0],
)
# Preview the first rows.
retail_df.head()

Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B


# Data Cleaning

**We only look at active SKUs (MSTAE = AC) and remove the General and Service categories in H1**

In [3]:
# Keep only active SKUs (MSTAE == 'AC') and drop rows whose H1 category is
# 'General' or 'Service'.
# .copy() makes the filtered result an independent frame, so assigning new
# columns to it afterwards does not raise SettingWithCopyWarning.
retail_df = retail_df.loc[
    (retail_df['MSTAE'] == 'AC')
    & ~retail_df['H1'].isin(['General', 'Service'])
].copy()

**Create Year and Week**

In [4]:
# Derive calendar year and week-of-year from DATE.
# '%W' yields a zero-padded string ('00'-'53') with Monday as the first day of
# the week, so WEEK is a string column, not an integer.
date_accessor = retail_df['DATE'].dt
retail_df['YEAR'] = date_accessor.year
retail_df['WEEK'] = date_accessor.strftime('%W')
print(retail_df.shape)
retail_df.head()

(1391820, 12)


Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER,YEAR,WEEK
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B,2018,26
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B,2018,26
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B,2018,26
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B,2018,26
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B,2018,26


**Insert weeks with no sales (fill with 0)<br>Currently not done, as it affects the standard deviation**

In [33]:
# Total quantity sold per (YEAR, WEEK, MATERIAL), summed over all rows in that
# group; the result is a flat frame with the keys as ordinary columns.
retail_group = (
    retail_df
    .groupby(['YEAR', 'WEEK', 'MATERIAL'], as_index=False)['QTY_SOLD']
    .sum()
)

In [54]:
# Preview the first rows of the weekly per-material totals.
retail_group.head()

Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD
0,2018,26,101,2.0
1,2018,26,112,1.0
2,2018,26,116,8.0
3,2018,26,213,7.0
4,2018,26,219,1.0


In [5]:
# Disabled alternative: pivot to one column per MATERIAL with fill_value=0, so
# that (YEAR, WEEK, MATERIAL) combinations with no rows become 0.
# NOTE(review): pivot_table defaults to aggfunc='mean', whereas the active
# groupby cell sums QTY_SOLD -- confirm the intended aggregation (likely
# aggfunc='sum') before re-enabling.
# retail_group = retail_df.pivot_table(index=['YEAR', 'WEEK'], columns=[
#     'MATERIAL'], values='QTY_SOLD', fill_value=0).reset_index()

In [6]:
# Disabled alternative (continuation of the pivot step): melt the wide table
# back to long format, one row per YEAR / WEEK / MATERIAL with QTY_SOLD.
# retail_group = retail_group.melt(
#     id_vars=['YEAR', 'WEEK'], value_vars=retail_group.columns[2:], value_name='QTY_SOLD')

In [7]:
# Disabled: shape and preview of the pivot/melt result.
# NOTE(review): the stored output under this cell presumably dates from a run
# made while this code was still enabled -- clear it or re-run to confirm.
# print(retail_group.shape)
# retail_group.head()

(1246402, 4)


Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD
0,2018,26,101,2.0
1,2018,27,101,1.0
2,2018,28,101,1.0
3,2018,29,101,1.0
4,2018,30,101,1.0


# Seasonal Products

**Sales of items are affected by:**
- The state of the economy at the time
- Nature of the item (seasonal or non-seasonal)
- Promotions or discounts
- Competition from a rival
- Competition from a substitute product
- Special events such as the Super Bowl, Thanksgiving, New Year, etc., which vary from place to place

In [36]:
# Mean weekly quantity per (MATERIAL, YEAR), taken over the weeks present in
# retail_group, exposed as AVG_QTY.
avg_df = (
    retail_group
    .groupby(['MATERIAL', 'YEAR'], as_index=False)['QTY_SOLD']
    .mean()
    .rename(columns={'QTY_SOLD': 'AVG_QTY'})
)
avg_df.head()

Unnamed: 0,MATERIAL,YEAR,AVG_QTY
0,101,2018,1.454545
1,101,2019,1.783784
2,101,2020,2.275862
3,101,2021,1.714286
4,102,2018,1.0


In [37]:
# Attach each material's yearly average to its weekly rows (left join keeps
# every weekly row).
retail_merged = pd.merge(
    retail_group,
    avg_df,
    how='left',
    on=['YEAR', 'MATERIAL'],
)
retail_merged.head()

Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD,AVG_QTY
0,2018,26,101,2.0,1.454545
1,2018,26,112,1.0,17.214286
2,2018,26,116,8.0,35.821429
3,2018,26,213,7.0,30.964286
4,2018,26,219,1.0,52.607143


**Create seasonal index**

In [38]:
# Seasonal index = weekly quantity relative to the material's yearly average.
retail_merged['SEASONAL_INDEX'] = retail_merged['QTY_SOLD'].div(
    retail_merged['AVG_QTY'])

In [46]:
# Standard deviation of the seasonal index per (MATERIAL, YEAR), exposed as STD.
std_df = (
    retail_merged
    .groupby(['MATERIAL', 'YEAR'], as_index=False)['SEASONAL_INDEX']
    .std()
    .rename(columns={'SEASONAL_INDEX': 'STD'})
)

In [48]:
# Attach the per-(MATERIAL, YEAR) STD to every weekly row (left join).
retail_merged = pd.merge(
    retail_merged,
    std_df,
    how='left',
    on=['MATERIAL', 'YEAR'],
)

**Remove materials whose standard deviation is undefined (NaN), e.g. those sold in only one week of the year; this may be caused by a lack of inventory**

In [49]:
# Drop rows without an STD, then collapse to one row per (YEAR, MATERIAL) and
# order by year and STD, both descending.
# Note: from here on retail_merged is this summary table (indexed by YEAR and
# MATERIAL), no longer the weekly frame.
retail_merged = (
    retail_merged
    .dropna(subset=['STD'])
    .groupby(['YEAR', 'MATERIAL'])[['STD']]
    .mean()
    .sort_values(by=['YEAR', 'STD'], ascending=False)
)

In [50]:
# Display the STD summary indexed by YEAR and MATERIAL.
retail_merged

Unnamed: 0_level_0,Unnamed: 1_level_0,STD
YEAR,MATERIAL,Unnamed: 2_level_1
2021,56732,4.242641
2021,104945,3.464102
2021,430,3.311135
2021,200989,2.510824
2021,53036,2.010835
...,...,...
2018,100377,0.000000
2018,100495,0.000000
2018,100499,0.000000
2018,101301,0.000000


In [None]:
# Number of non-null DATE entries per (YEAR, MATERIAL), as a one-column frame.
retail_df.groupby(['YEAR', 'MATERIAL'])[['DATE']].count()

In [51]:
# Select the rows for YEAR 2020 from the YEAR/MATERIAL-indexed STD summary.
retail_merged.loc[2020]

Unnamed: 0_level_0,STD
MATERIAL,Unnamed: 1_level_1
65237,4.327266
52289,4.264110
50061,4.000208
102365,3.853268
102452,3.515416
...,...
200302,0.000000
200303,0.000000
200417,0.000000
200951,0.000000


In [53]:
# Inspect the raw 2020 rows for material 65237.
retail_df[
    retail_df['YEAR'].eq(2020)
    & retail_df['MATERIAL'].eq(65237)
]

Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER,YEAR,WEEK
796087,2020-01-06,Store 2,65237,PAC,AC,Non Food,Consumables,Baking Cases,2.0,A,2020,01
800689,2020-01-13,Store 1,65237,PAC,AC,Non Food,Consumables,Baking Cases,2.0,A,2020,02
813904,2020-01-20,Store 2,65237,PAC,AC,Non Food,Consumables,Baking Cases,1.0,A,2020,03
816488,2020-01-27,Store 3,65237,PAC,AC,Non Food,Consumables,Baking Cases,1.0,B,2020,04
826670,2020-02-03,Store 1,65237,PAC,AC,Non Food,Consumables,Baking Cases,1.0,A,2020,05
...,...,...,...,...,...,...,...,...,...,...,...,...
1314418,2020-12-14,Store 2,65237,PAC,AC,Non Food,Consumables,Baking Cases,3.0,A,2020,50
1325308,2020-12-21,Store 4,65237,PAC,AC,Non Food,Consumables,Baking Cases,9.0,B,2020,51
1326750,2020-12-21,Store 1,65237,PAC,AC,Non Food,Consumables,Baking Cases,2.0,A,2020,51
1330770,2020-12-21,Store 2,65237,PAC,AC,Non Food,Consumables,Baking Cases,3.0,A,2020,51


In [None]:
# Display the weekly per-material quantity totals.
retail_group