In [2]:
# Libraries
import os
# Keep this order: the plain `import datetime` below rebinds the name
# `datetime` to the module, exactly as the original import sequence did.
from datetime import datetime, timedelta
import datetime

import matplotlib.pyplot as plt
import pandas as pd
import researchpy as rp
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm
from arch import arch_model
from dateutil.relativedelta import relativedelta
from scipy.stats import zscore
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
#user = os.getenv('USERPROFILE')
#data_path = os.path.join(
#    user, 'OneDrive - National University of Singapore\EBAC\Year 1 Semester 1\Project\Data')
#os.chdir(data_path)


In [5]:
# Load the weekly retail workbook; column 0 is parsed as datetimes.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
retail_df = pd.read_excel(
    io='C:/Users/fanjn/Desktop/ISS Master/Term 1 Project/Retail_Week_NUS.xlsx',
    parse_dates=[0],
)
# Preview the first rows to sanity-check the load.
retail_df.head()

Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B


# Data Cleaning

**Keep only active SKUs (`MSTAE == 'AC'`) and drop the `General` and `Service` categories in `H1`.**

In [6]:
# Keep only active SKUs (MSTAE == 'AC') whose H1 category is neither
# 'General' nor 'Service'.
is_active = retail_df['MSTAE'] == 'AC'
is_excluded_h1 = retail_df['H1'].isin(['General', 'Service'])
# .copy() detaches the filtered rows from the loaded frame, so the column
# assignments made in later cells write to an independent DataFrame instead
# of a slice (which would raise SettingWithCopyWarning).
retail_df = retail_df.loc[is_active & ~is_excluded_h1].copy()

**Create Year and Week**

In [64]:
# Derive calendar features from the DATE column.
sale_dates = retail_df['DATE'].dt
retail_df['YEAR'] = sale_dates.year
# '%W' is the week of the year with Monday as the first day of the week;
# days before the first Monday of a year fall in week 0.
retail_df['WEEK'] = sale_dates.strftime('%W').astype("int")
retail_df['MONTH'] = sale_dates.month
print(retail_df.shape)
retail_df.head()

(964491, 13)


Unnamed: 0,DATE,CUSTNAME,MATERIAL,BASEUOM,MSTAE,H1,H2,H3,QTY_SOLD,CLUSTER,YEAR,WEEK,MONTH
0,2018-06-25,Store 4,10091,PAC,AC,Dairy,Cream Compound,Whipping,5.0,B,2018,26,6
1,2018-06-25,Store 4,10550,PAC,AC,"Nuts, Seeds & Beans",Nuts,Pumpkin Kernel,3.0,B,2018,26,6
2,2018-06-25,Store 4,11485,PAC,AC,"Flour, Grain & Flakes",Premix,Others,1.0,B,2018,26,6
3,2018-06-25,Store 4,1201,PAC,AC,Bakery,Functional,Starches,1.0,B,2018,26,6
4,2018-06-25,Store 4,12085,PAC,AC,Grocery,Seasoning,Herbs & Spices,1.0,B,2018,26,6


In [40]:
# List the column names of the cleaned frame (now including YEAR, WEEK, MONTH).
retail_df.columns

Index(['DATE', 'CUSTNAME', 'MATERIAL', 'BASEUOM', 'MSTAE', 'H1', 'H2', 'H3',
       'QTY_SOLD', 'CLUSTER', 'YEAR', 'WEEK', 'MONTH'],
      dtype='object')

**Insert weeks with no sales (fill with 0)<br>Not applied for now, because the zero-filled weeks affect the standard deviation**

In [18]:
# Weekly quantity sold per material, without zero-filling missing weeks.
# `retail_group` was otherwise first assigned only in a later cell, so on a
# fresh kernel (Restart & Run All) this cell raised NameError. Building the
# aggregate here keeps the preview runnable in order.
# NOTE(review): key columns are inferred from the recorded output of this
# cell (YEAR, WEEK, MATERIAL, QTY_SOLD) - confirm this was the intended grouping.
retail_group = retail_df.groupby(
    ['YEAR', 'WEEK', 'MATERIAL'], as_index=False
)['QTY_SOLD'].sum()
retail_group.head()

Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD
0,2018,26,101,2.0
1,2018,26,112,1.0
2,2018,26,116,8.0
3,2018,26,213,7.0
4,2018,26,219,1.0


In [5]:
# retail_group = retail_df.pivot_table(index=['YEAR', 'WEEK'], columns=[
#     'MATERIAL'], values='QTY_SOLD', fill_value=0).reset_index()

In [6]:
# retail_group = retail_group.melt(
#     id_vars=['YEAR', 'WEEK'], value_vars=retail_group.columns[2:], value_name='QTY_SOLD')

In [7]:
# print(retail_group.shape)
# retail_group.head()

(1246402, 4)


Unnamed: 0,YEAR,WEEK,MATERIAL,QTY_SOLD
0,2018,26,101,2.0
1,2018,27,101,1.0
2,2018,28,101,1.0
3,2018,29,101,1.0
4,2018,30,101,1.0


# Seasonal Products

**Sales of items are affected by:**
- The state of the economy at the time
- Nature of the item (seasonal or non-seasonal)
- Promotions or discounts
- Competition from a rival
- Competition from a substitute product
- Special events such as the Super Bowl, Thanksgiving, New Year, etc., which vary from place to place

In [41]:
# Total quantity sold per material for every (YEAR, MONTH, WEEK) bucket,
# returned as a flat frame with the grouping keys as ordinary columns.
retail_group = retail_df.groupby(
    ['YEAR', 'MONTH', 'WEEK', 'MATERIAL'], as_index=False
)['QTY_SOLD'].sum()

In [42]:
# Standardise each material's quantities against that material's own rows.
qty_per_material = retail_group.groupby(['MATERIAL'])["QTY_SOLD"]
z_score = qty_per_material.transform(zscore)
retail_group["z_score"] = z_score

In [43]:
# Report the largest and the smallest z-score, one per line.
z_values = retail_group["z_score"]
print(z_values.max(), z_values.min(), sep="\n")

10.006106654043196
-7.051126411312967


In [44]:
# Keep rows whose z-score is at least 3 in absolute value (either direction).
maxVar = retail_group.loc[retail_group["z_score"].abs() >= 3]

In [45]:
# Number of distinct materials with at least one flagged row, then a preview.
flagged_material_count = maxVar["MATERIAL"].nunique()
print(flagged_material_count)
maxVar.head()

3823


Unnamed: 0,YEAR,MONTH,WEEK,MATERIAL,QTY_SOLD,z_score
254,2018,6,26,12088,13.0,-3.795744
390,2018,6,26,50313,3.0,-3.512229
1004,2018,6,26,64298,40.0,-3.501186
1005,2018,6,26,64300,34.0,-3.924497
1040,2018,6,26,65046,2.0,-3.002467


In [None]:
# Mean quantity per material per year, stored in a column named AVG_QTY.
avg_df = retail_group.groupby(['MATERIAL', 'YEAR'], as_index=False).agg(
    AVG_QTY=('QTY_SOLD', 'mean')
)
avg_df.head()

In [None]:
# Attach each material's yearly average to its grouped rows.
retail_merged = pd.merge(
    retail_group,
    avg_df,
    how='left',
    on=['YEAR', 'MATERIAL'],
)
retail_merged.head()

**Create seasonal index**

In [38]:
# Seasonal index: a row's quantity relative to the material's yearly average.
retail_merged['SEASONAL_INDEX'] = retail_merged['QTY_SOLD'].div(
    retail_merged['AVG_QTY']
)

In [46]:
# Standard deviation of the seasonal index per material per year, named STD.
std_df = retail_merged.groupby(['MATERIAL', 'YEAR'], as_index=False).agg(
    STD=('SEASONAL_INDEX', 'std')
)

In [48]:
# Attach the per-material, per-year STD to every row.
retail_merged = pd.merge(
    retail_merged,
    std_df,
    how='left',
    on=['MATERIAL', 'YEAR'],
)

**Remove materials that were not sold; this may be because of a lack of inventory**

In [49]:
# Discard rows whose STD is missing.
rows_with_std = retail_merged.dropna(subset=['STD'])

# Collapse to one STD value per (YEAR, MATERIAL), then order by YEAR and STD,
# both descending. Note that `retail_merged` is rebound to this summary table.
std_per_year_material = rows_with_std.groupby(
    ['YEAR', 'MATERIAL'])['STD'].mean().to_frame()
retail_merged = std_per_year_material.sort_values(
    by=['YEAR', 'STD'], ascending=False)

In [50]:
# Display the (YEAR, MATERIAL)-indexed STD table.
retail_merged

Unnamed: 0_level_0,Unnamed: 1_level_0,STD
YEAR,MATERIAL,Unnamed: 2_level_1
2021,56732,4.242641
2021,104945,3.464102
2021,430,3.311135
2021,200989,2.510824
2021,53036,2.010835
...,...,...
2018,100377,0.000000
2018,100495,0.000000
2018,100499,0.000000
2018,101301,0.000000


In [None]:
# Number of non-null DATE entries per (YEAR, MATERIAL), shown as a frame.
date_counts = retail_df.groupby(['YEAR', 'MATERIAL'])['DATE'].count()
date_counts.to_frame()

In [51]:
# Select the rows for YEAR 2020 from the (YEAR, MATERIAL)-indexed STD table.
retail_merged.loc[2020]

Unnamed: 0_level_0,STD
MATERIAL,Unnamed: 1_level_1
65237,4.327266
52289,4.264110
50061,4.000208
102365,3.853268
102452,3.515416
...,...
200302,0.000000
200303,0.000000
200417,0.000000
200951,0.000000


In [47]:
# Inspect the raw 2020 rows of material 65237.
in_2020 = retail_df['YEAR'] == 2020
is_material_65237 = retail_df['MATERIAL'] == 65237
retail_df.loc[in_2020 & is_material_65237]

<h2> Regression Model

**Variables**
1. `QTY_SOLD` on the same day last year
2. `QTY_SOLD` on the same day last month
3. `QTY_SOLD` on the same day last week
4. Store number

In [None]:
## Last Year
# For every row, look up QTY_SOLD of the same CUSTNAME and MATERIAL in the
# same MONTH and WEEK of the previous YEAR; rows without a match get "N".
#
# The previous implementation filtered the whole frame twice for every row
# (quadratic in the number of rows). A single left merge yields the same values.
lag_keys = ["YEAR", "MONTH", "WEEK", "MATERIAL", "CUSTNAME"]

# One candidate per key combination: the first occurrence in frame order,
# i.e. the row the per-row `.values[0, 0]` lookup selected.
# Rows with a missing key are removed because NaN never compared equal in the
# boolean filter, whereas a merge would pair NaN keys with each other.
# YEAR is shifted forward by one so a candidate lines up with the rows of the
# following year (candidate YEAR == row YEAR - 1).
prior_year = (
    retail_df[lag_keys + ["QTY_SOLD"]]
    .dropna(subset=lag_keys)
    .drop_duplicates(subset=lag_keys, keep="first")
    .rename(columns={"QTY_SOLD": "_LAG_QTY"})
    .assign(YEAR=lambda frame: frame["YEAR"] + 1)
)

# Left merge keeps one output row per input row, in the input order.
lag_lookup = retail_df[lag_keys].merge(
    prior_year, how="left", on=lag_keys, validate="many_to_one", indicator=True
)
has_match = (lag_lookup["_merge"] == "both").to_numpy()

# NOTE(review): the "N" marker makes this an object-dtype column that mixes
# numbers and strings; a numeric missing value may suit modelling better.
lyear = lag_lookup["_LAG_QTY"].astype(object).where(has_match, "N").tolist()

# Assigned as a plain list so values are placed by position, not by index label.
retail_df["LAST_YEAR"] = lyear


In [None]:
## Last Month
# For every row, look up QTY_SOLD of the same CUSTNAME and MATERIAL in the
# same YEAR and WEEK but the previous MONTH; rows without a match get "N".
#
# The previous implementation filtered the whole frame twice for every row
# (quadratic in the number of rows). A single left merge yields the same values.
#
# NOTE(review): WEEK is a week-of-year number, so a row with the same YEAR and
# WEEK but MONTH - 1 presumably exists only when one week number spans two
# months; this lookup may almost never match. Confirm the intended lag.
lag_keys = ["YEAR", "MONTH", "WEEK", "MATERIAL", "CUSTNAME"]

# One candidate per key combination: the first occurrence in frame order,
# i.e. the row the per-row `.values[0, 0]` lookup selected.
# Rows with a missing key are removed because NaN never compared equal in the
# boolean filter, whereas a merge would pair NaN keys with each other.
# MONTH is shifted forward by one so a candidate lines up with the rows of the
# following month number (candidate MONTH == row MONTH - 1).
prior_month = (
    retail_df[lag_keys + ["QTY_SOLD"]]
    .dropna(subset=lag_keys)
    .drop_duplicates(subset=lag_keys, keep="first")
    .rename(columns={"QTY_SOLD": "_LAG_QTY"})
    .assign(MONTH=lambda frame: frame["MONTH"] + 1)
)

# Left merge keeps one output row per input row, in the input order.
lag_lookup = retail_df[lag_keys].merge(
    prior_month, how="left", on=lag_keys, validate="many_to_one", indicator=True
)
has_match = (lag_lookup["_merge"] == "both").to_numpy()

# NOTE(review): the "N" marker makes this an object-dtype column that mixes
# numbers and strings; a numeric missing value may suit modelling better.
lmonth = lag_lookup["_LAG_QTY"].astype(object).where(has_match, "N").tolist()

# Assigned as a plain list so values are placed by position, not by index label.
retail_df["LAST_MONTH"] = lmonth

In [None]:
## Last Day
# For every row, look up QTY_SOLD of the same CUSTNAME and MATERIAL in the
# same YEAR and MONTH but the previous WEEK number; rows without a match get
# "N". The column is named LAST_DAY although the lag applied is one WEEK.
#
# The previous implementation filtered the whole frame twice for every row
# (quadratic in the number of rows). A single left merge yields the same values.
#
# NOTE(review): requiring the same MONTH and YEAR means a row whose previous
# week lies in another month or year never matches - confirm this is intended.
lag_keys = ["YEAR", "MONTH", "WEEK", "MATERIAL", "CUSTNAME"]

# One candidate per key combination: the first occurrence in frame order,
# i.e. the row the per-row `.values[0, 0]` lookup selected.
# Rows with a missing key are removed because NaN never compared equal in the
# boolean filter, whereas a merge would pair NaN keys with each other.
# WEEK is shifted forward by one so a candidate lines up with the rows of the
# following week number (candidate WEEK == row WEEK - 1).
prior_week = (
    retail_df[lag_keys + ["QTY_SOLD"]]
    .dropna(subset=lag_keys)
    .drop_duplicates(subset=lag_keys, keep="first")
    .rename(columns={"QTY_SOLD": "_LAG_QTY"})
    .assign(WEEK=lambda frame: frame["WEEK"] + 1)
)

# Left merge keeps one output row per input row, in the input order.
lag_lookup = retail_df[lag_keys].merge(
    prior_week, how="left", on=lag_keys, validate="many_to_one", indicator=True
)
has_match = (lag_lookup["_merge"] == "both").to_numpy()

# NOTE(review): the "N" marker makes this an object-dtype column that mixes
# numbers and strings; a numeric missing value may suit modelling better.
lday = lag_lookup["_LAG_QTY"].astype(object).where(has_match, "N").tolist()

# Assigned as a plain list so values are placed by position, not by index label.
retail_df["LAST_DAY"] = lday

In [None]:
# Preview the first three rows, now including the LAST_YEAR / LAST_MONTH / LAST_DAY columns.
retail_df.head(3)

**Train/Test Split**

In [None]:
# Chronological train/test split on (YEAR, MONTH).
#
# Defects fixed in this cell:
#   * test_x was built from train_x, whose index columns had already been
#     consumed by set_index, so the cell raised KeyError;
#   * test_y reused the training mask;
#   * QTY_SOLD was moved into the index of train_y / test_y, leaving frames
#     with no target column;
#   * `YEAR <= 2020 & MONTH <= 6` and `YEAR > 2020 & MONTH > 6` are not a
#     time-ordered split (e.g. Jul-Dec of 2018-2020 landed in neither set).
# NOTE(review): the cutoff (train through June 2020, test afterwards) is
# inferred from the original constants - confirm it is the intended split.
SPLIT_YEAR, SPLIT_MONTH = 2020, 6  # last (YEAR, MONTH) that belongs to training
index_cols = ["MATERIAL", "YEAR", "MONTH", "WEEK"]

# YYYYMM as an integer turns "on or before the cutoff month" into one comparison.
period = retail_df["YEAR"] * 100 + retail_df["MONTH"]
is_train = period <= SPLIT_YEAR * 100 + SPLIT_MONTH

train_rows = retail_df[is_train]
test_rows = retail_df[~is_train]

# Features: every column except the target. x and y share the same index
# columns so their rows stay aligned.
# NOTE(review): the features still include non-numeric columns (text columns
# and the "N" markers); they must be encoded or dropped before fitting.
train_x = train_rows.drop("QTY_SOLD", axis=1).set_index(index_cols)
train_y = train_rows[index_cols + ["QTY_SOLD"]].set_index(index_cols)

test_x = test_rows.drop("QTY_SOLD", axis=1).set_index(index_cols)
test_y = test_rows[index_cols + ["QTY_SOLD"]].set_index(index_cols)

In [None]:
# Fit an ordinary least-squares regression on the training split and predict
# the held-out test split.
# `linear_model` is provided by `from sklearn import linear_model` in the
# import cell; without that import this cell raised NameError.
lm = linear_model.LinearRegression()
# Fit on the TRAINING data. The model was previously fitted on test_x/test_y,
# so the score computed on the test split was not an out-of-sample measure.
model = lm.fit(train_x, train_y)
predictions = lm.predict(test_x)

In [None]:
# R^2 (coefficient of determination) of the fitted model on the test split.
lm.score(test_x,test_y)

In [None]:
# Add the model's predictions to test_y as a new "predicted_y" column.
test_y["predicted_y"]=predictions