In [1]:
# Turn off warning output for the rest of the session
# (a negative `warn` value makes R ignore all warnings).
options(warn = -1)

# Install any required package that cannot be loaded yet.
# require() returns FALSE instead of raising an error when a package is missing.
for (pkg in c('lubridate', 'forecast')) {
    if (!require(pkg, character.only = TRUE)) {
        install.packages(pkg)
    }
}

# Attach the packages used by the cells below.
library(lubridate)  # date helpers: year(), month()
library(forecast)   # forecasting tools

Loading required package: lubridate

Attaching package: ‘lubridate’

The following object is masked from ‘package:base’:

    date

Loading required package: forecast


# Data Processing

In [2]:
# Load the training and test sets from CSV files in the current working directory.
# Column types are whatever read.csv infers; the Date column is converted in a later cell.
train = read.csv('./train.csv')
test = read.csv('./test.csv')

In [3]:
# Convert the Date column of both data sets into Date objects.
# The text representation is year-month-day, e.g. 2011-03-04.
date.format = '%Y-%m-%d'
train$Date = as.Date(train$Date, format = date.format)
test$Date = as.Date(test$Date, format = date.format)

In [4]:
# Add calendar year and month columns, derived from Date with lubridate,
# first for the training set ...
train$year = year(train$Date)
train$month = month(train$Date)

# ... then for the test set.
test$year = year(test$Date)
test$month = month(test$Date)

In [5]:
# Week index (0-51) used to line up the same week across years.
#
# The index is anchored to one fixed calendar date instead of the first row of
# each data frame. This keeps the numbering independent of row order and
# guarantees that train and test share the same scale. With the anchor
# 2010-02-05 -> 5, the first test week 2011-03-04 lies 56 weeks later and
# therefore maps to (56 + 5) %% 52 = 9, exactly as before.
week.anchor.date = as.Date('2010-02-05')  # first week of the training data
week.anchor.index = 5                     # index assigned to the anchor week
weeks.per.cycle = 52                      # the index wraps around after 52 weeks

# Map a vector of Dates to its week index.
#   dates: Date vector (weekly observations, 7 days apart)
#   returns: numeric vector in [0, 52)
week.index = function(dates) {
    # Date - Date yields a difference in days
    elapsed.weeks = as.numeric(dates - week.anchor.date) / 7
    (elapsed.weeks + week.anchor.index) %% weeks.per.cycle
}

train.week = week.index(train$Date)
train$week = train.week

test.week = week.index(test$Date)
test$week = test.week

In [6]:
# Preview the first rows of test to check the derived year, month and week columns.
head(test)

Store,Dept,Date,Weekly_Sales,IsHoliday,Weekly_Pred1,Weekly_Pred2,Weekly_Pred3,year,month,week
1,1,2011-03-04,20327.61,False,0,0,0,2011,3,9
1,1,2011-03-11,21280.4,False,0,0,0,2011,3,10
1,1,2011-03-18,20334.23,False,0,0,0,2011,3,11
1,1,2011-03-25,20881.1,False,0,0,0,2011,3,12
1,1,2011-04-01,20398.09,False,0,0,0,2011,4,13
1,1,2011-04-08,23873.79,False,0,0,0,2011,4,14


# Model 1

* use the information from the previous year
* the same department, same store
* week - 1, week, week + 1

In [8]:
# Distinct store and department identifiers that occur in test, in ascending
# order, together with how many of each there are.
dept = sort(unique(test$Dept))
store = sort(unique(test$Store))

n.dept = length(dept)
n.store = length(store)

In [9]:
# Model 1: predict each test week from the same (store, department) series one
# year earlier, as the median of the matching week and the rows directly
# before and after it.
#
# Reads : train, test, store, dept, n.store, n.dept
# Writes: test$Weekly_Pred1 (0 when the previous year has no matching week)
for (s in seq_len(n.store)) {
    for (d in seq_len(n.dept)) {
        # rows of test / train that belong to this (store, department) pair
        test.id = which(test$Store == store[s] & test$Dept == dept[d])
        if (length(test.id) == 0) {
            next  # this combination does not occur in test
        }
        test.temp = test[test.id, ]
        train.id = which(train$Store == store[s] & train$Dept == dept[d])
        train.temp = train[train.id, ]
        n.train = nrow(train.temp)

        # collect the predictions locally and write them back in one assignment;
        # entries stay 0 when no history is found
        pred = numeric(length(test.id))
        for (i in seq_along(test.id)) {
            id = which(train.temp$week == test.temp$week[i] &
                       train.temp$year == test.temp$year[i] - 1)
            # neighbours are taken by row position, which assumes train.temp is
            # ordered by date without missing weeks
            threeWeeksId = c(id - 1, id, id + 1)
            # drop positions outside the series: indexing past the last row
            # would return NA and turn the median into NA
            threeWeeksId = threeWeeksId[threeWeeksId >= 1 & threeWeeksId <= n.train]
            tempSales = train.temp[threeWeeksId, 'Weekly_Sales']
            if (length(tempSales) > 0) {
                pred[i] = median(tempSales)
            }
        }
        test$Weekly_Pred1[test.id] = pred
    }
}

Store:  1 
Store:  2 
Store:  3 
Store:  4 
Store:  5 
Store:  6 
Store:  7 
Store:  8 
Store:  9 
Store:  10 
Store:  11 
Store:  12 
Store:  13 
Store:  14 
Store:  15 
Store:  16 
Store:  17 
Store:  18 
Store:  19 
Store:  20 
Store:  21 
Store:  22 
Store:  23 
Store:  24 
Store:  25 
Store:  26 
Store:  27 
Store:  28 
Store:  29 
Store:  30 
Store:  31 
Store:  32 
Store:  33 
Store:  34 
Store:  35 
Store:  36 
Store:  37 
Store:  38 
Store:  39 
Store:  40 
Store:  41 
Store:  42 
Store:  43 
Store:  44 
Store:  45 


# Model 2

* use the information from the previous years
* the same department, pooled across all stores
* use the median of the same week

In [10]:
# Distinct department identifiers that occur in test, in ascending order, and their count.
# NOTE(review): this repeats the computation from the Model 1 section; presumably kept
# so the Model 2 cells can be run without the Model 1 cells -- confirm.
dept = sort(unique(test$Dept))
n.dept = length(dept)

In [11]:
# Model 2: predict each test week as the median sales of the same department
# and the same week index, pooled over all stores and all earlier years.
#
# Reads : train, test, dept, n.dept
# Writes: test$Weekly_Pred2 (0 when no earlier year contains that week)
for (d in seq_len(n.dept)) {
    # rows of test / train that belong to this department (any store)
    test.id = which(test$Dept == dept[d])
    if (length(test.id) == 0) {
        next
    }
    test.temp = test[test.id, ]
    train.id = which(train$Dept == dept[d])
    train.temp = train[train.id, ]

    # collect the predictions locally and write them back in one assignment;
    # entries stay 0 when no history is found
    pred = numeric(length(test.id))

    # the pooled median only depends on (year, week), so compute it once per
    # distinct pair instead of once per test row
    combos = unique(test.temp[, c('year', 'week')])
    for (k in seq_len(nrow(combos))) {
        yr = combos$year[k]
        wk = combos$week[k]
        id = which(train.temp$week == wk & train.temp$year < yr)
        tempSales = train.temp[id, 'Weekly_Sales']
        if (length(tempSales) > 0) {
            rows = which(test.temp$year == yr & test.temp$week == wk)
            pred[rows] = median(tempSales)
        }
    }
    test$Weekly_Pred2[test.id] = pred
}

Department:  1 
Department:  2 
Department:  3 
Department:  4 
Department:  5 
Department:  6 
Department:  7 
Department:  8 
Department:  9 
Department:  10 
Department:  11 
Department:  12 
Department:  13 
Department:  14 
Department:  16 
Department:  17 
Department:  18 
Department:  19 
Department:  20 
Department:  21 
Department:  22 
Department:  23 
Department:  24 
Department:  25 
Department:  26 
Department:  27 
Department:  28 
Department:  29 
Department:  30 
Department:  31 
Department:  32 
Department:  33 
Department:  34 
Department:  35 
Department:  36 
Department:  37 
Department:  38 
Department:  39 
Department:  40 
Department:  41 
Department:  42 
Department:  43 
Department:  44 
Department:  45 
Department:  46 
Department:  47 
Department:  48 
Department:  49 
Department:  50 
Department:  51 
Department:  52 
Department:  54 
Department:  55 
Department:  56 
Department:  58 
Department:  59 
Department:  60 
Department:  65 
Department:  67 
Depart

In [12]:
# Preview the first rows of test to inspect the Weekly_Pred1 and Weekly_Pred2 columns filled in above.
head(test)

Store,Dept,Date,Weekly_Sales,IsHoliday,Weekly_Pred1,Weekly_Pred2,Weekly_Pred3,year,month,week
1,1,2011-03-04,20327.61,False,21043.39,17098.49,0,2011,3,9
1,1,2011-03-11,21280.4,False,21827.9,17034.09,0,2011,3,10
1,1,2011-03-18,20334.23,False,22136.64,17976.97,0,2011,3,11
1,1,2011-03-25,20881.1,False,26229.21,22356.19,0,2011,3,12
1,1,2011-04-01,20398.09,False,42960.91,49917.32,0,2011,4,13
1,1,2011-04-08,23873.79,False,42960.91,31548.24,0,2011,4,14


# Model 3

* use the information from the previous years
* the same department, same store
* use all weeks' information to make forecast

# Model 4

* not decided yet

# Performance Testing

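* Models are compared using the weighted mean absolute error (WMAE): $\mathrm{WMAE} = \frac{\sum_i w_i \, |y_i - \hat{y}_i|}{\sum_i w_i}$, where $w_i = 5$ for holiday weeks and $w_i = 1$ otherwise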
* Link to Kaggle: [Walmart Recruiting - Store Sales Forecasting](https://www.kaggle.com/c/walmart-recruiting-store-sales-forecasting#evaluation)

In [16]:
# Per-row weights for the WMAE metric: holiday weeks count five times as much
# as regular weeks (assumes IsHoliday is logical or 0/1).
holiday.weight = 5
regular.weight = 1
weight = regular.weight + (holiday.weight - regular.weight) * test$IsHoliday

In [17]:
# Weighted mean absolute error of each model's predictions against the actual
# weekly sales in test, using the per-row weights in `weight`.
wmae = function(pred) {
    abs.err = abs(pred - test$Weekly_Sales)
    sum(weight * abs.err) / sum(weight)
}

WMAE1 = wmae(test$Weekly_Pred1)
WMAE2 = wmae(test$Weekly_Pred2)
WMAE3 = wmae(test$Weekly_Pred3)

In [18]:
# Print the three WMAE scores on one line, separated by tabs, in model order (1, 2, 3).
cat(WMAE1, '\t', WMAE2, '\t', WMAE3, '\n')

16168.23 	 16168.23 	 16168.23 
