In [57]:
library(data.table)
library(zoo)
library(forecast)
library(dplyr)  
library(readr)
library(ggplot2)

In [58]:
# Read the training data with an explicit type for every column, so nothing is
# left to readr's type guessing. StateHoliday stays character: the rows printed
# further down show both "0" and "a" in that column.
train <- read_csv(
  "./dataset/train.csv",
  col_types = cols(
    Store = col_integer(),
    DayOfWeek = col_integer(),
    Date = col_date(),
    Sales = col_integer(),
    Customers = col_integer(),
    Open = col_integer(),
    Promo = col_integer(),
    StateHoliday = col_character(),
    SchoolHoliday = col_integer()
  )
)


# Per-date row counts restricted to the window 2014-06-30 .. 2015-01-01
# (inclusive). `by_Date` is the Date / NumOfStores summary of `train`.
gap_window <- seq(as.Date("2014-06-30"), as.Date("2015-01-01"), by = "day")
by_Date_Gap <- by_Date[by_Date$Date %in% gap_window, ]

# Show both edges of the window.
head(by_Date_Gap, n = 3)
tail(by_Date_Gap, n = 3)

# Stores that have no row in `train` on 2014-07-01.
all_stores <- unique(train$Store)
stores_reporting <- train$Store[train$Date == as.Date("2014-7-1")]
# all_stores is already de-duplicated, so a set difference keeps the same
# elements in the same order as a negated %in% filter would.
missing_stores <- base::setdiff(all_stores, stores_reporting)
missing_stores

# Check that, on every day from 2014-07-02 to 2014-12-31, the stores absent
# from `train` are the same ones already recorded in `missing_stores`.
# Any store that is missing on a day but not in `missing_stores` is reported.
#
# The loop runs over indices rather than over the Date vector itself:
# `for (d in <Date vector>)` drops the Date class, so the message would show a
# raw day count (e.g. 16253) instead of a calendar date.
gap_days <- seq(as.Date("2014-7-2"), as.Date("2014-12-31"), by = "day")
for (i in seq_along(gap_days)) {
  date <- gap_days[i]
  stores_reporting <- train$Store[train$Date == date]
  missing_on_date <- all_stores[!(all_stores %in% stores_reporting)]
  newly_missing <- setdiff(missing_on_date, missing_stores)
  if (length(newly_missing) > 0) {
    # One line per offending date; the trailing "\n" keeps reports separate.
    cat("Date:", format(date),
        " Difference in missing stores", newly_missing, "\n")
  }
}

# Stores with no row on 2013-01-01.
stores_reporting <- train$Store[train$Date == as.Date("2013-1-1")]
# all_stores holds unique ids, so the set difference equals the %in% filter.
additional_missing_store <- base::setdiff(all_stores, stores_reporting)
additional_missing_store

Date,NumOfStores
2014-06-30,1115
2014-07-01,935
2014-07-02,935


Date,NumOfStores
2014-12-30,935
2014-12-31,935
2015-01-01,1115


In [59]:
# Build a row for each store that has no record on 2013-01-01 and append it to
# `train`. Each field is filled with the value most frequently observed across
# the stores that did report on that date.

# Most frequent value of `x`, returned as its character label
# (ties resolve to the first level in table() order).
most_common <- function(x) {
  names(which.max(table(x)))
}

date <- as.Date("2013-1-1")
on_date <- train$Date == date

day_of_week <- unique(train$DayOfWeek[on_date])
# as.integer() rather than as.numeric(): these columns are integer in `train`,
# and rbind() would otherwise silently turn the whole column into double.
sales <- as.integer(most_common(train$Sales[on_date]))
customers <- as.integer(most_common(train$Customers[on_date]))
open <- as.integer(most_common(train$Open[on_date]))
promo <- as.integer(most_common(train$Promo[on_date]))
state_holiday <- most_common(train$StateHoliday[on_date])
school_holiday <- as.integer(most_common(train$SchoolHoliday[on_date]))

missing_row <- data.frame(Store = additional_missing_store,
                          DayOfWeek = day_of_week,
                          Date = date,
                          Sales = sales,
                          Customers = customers,
                          Open = open,
                          Promo = promo,
                          StateHoliday = state_holiday,
                          SchoolHoliday = school_holiday,
                          # Keep StateHoliday character on R < 4.0 too,
                          # matching how missing_df is built.
                          stringsAsFactors = FALSE)
missing_row

train <- rbind(train, missing_row)

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
988,2,2013-01-01,0,0,0,0,a,1


In [60]:
# Skeleton for the rows to be added: one row per (missing store, day) for
# 2014-07-01 .. 2014-12-31. Every column except Date starts as a zero / empty
# placeholder; the dates are laid out as the full `gap` sequence repeated once
# per missing store.
gap <- seq(from = as.Date("2014-7-1"), to = as.Date("2014-12-31"), by = "day")
n_missing <- length(gap) * length(missing_stores)
missing_df <- data.frame(
  Store = vector("integer", n_missing),
  DayOfWeek = vector("integer", n_missing),
  Date = rep(gap, times = length(missing_stores)),
  Sales = vector("integer", n_missing),
  Customers = vector("integer", n_missing),
  Open = vector("integer", n_missing),
  Promo = vector("integer", n_missing),
  StateHoliday = vector("character", n_missing),
  SchoolHoliday = vector("integer", n_missing),
  logSales = vector("numeric", n_missing),
  stringsAsFactors = FALSE
)

In [61]:
# Display the gap dates, the number of placeholder rows, and the still-empty
# placeholder frame (bare names are auto-printed by the notebook).
gap
n_missing
missing_df

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,logSales
0,0,2014-07-01,0,0,0,0,,0,0
0,0,2014-07-02,0,0,0,0,,0,0
0,0,2014-07-03,0,0,0,0,,0,0
0,0,2014-07-04,0,0,0,0,,0,0
0,0,2014-07-05,0,0,0,0,,0,0
0,0,2014-07-06,0,0,0,0,,0,0
0,0,2014-07-07,0,0,0,0,,0,0
0,0,2014-07-08,0,0,0,0,,0,0
0,0,2014-07-09,0,0,0,0,,0,0
0,0,2014-07-10,0,0,0,0,,0,0


In [62]:
# Fill the placeholder rows day by day. For each day in the gap, the rows of
# missing_df with that date receive the missing store ids, the weekday taken
# from `train`, NA for Sales and Customers, and the most frequent value seen in
# `train` on that day for Open, Promo, StateHoliday and SchoolHoliday.

# Label of the most frequent value (first one wins on ties).
modal_label <- function(values) {
  names(which.max(table(values)))
}

n_stores <- length(missing_stores)

for (i in seq_along(gap)) {
  day <- gap[i]
  gap_rows <- missing_df$Date == day
  train_rows <- train$Date == day

  missing_df$Store[gap_rows] <- missing_stores

  weekday <- unique(train$DayOfWeek[train_rows])
  missing_df$DayOfWeek[gap_rows] <- rep(weekday, n_stores)

  # Unknown for now; imputed in a later step.
  missing_df$Sales[gap_rows] <- rep(NA, n_stores)
  missing_df$Customers[gap_rows] <- rep(NA, n_stores)

  usual_open <- as.numeric(modal_label(train$Open[train_rows]))
  missing_df$Open[gap_rows] <- rep(usual_open, n_stores)

  usual_promo <- as.numeric(modal_label(train$Promo[train_rows]))
  missing_df$Promo[gap_rows] <- rep(usual_promo, n_stores)

  usual_state_holiday <- modal_label(train$StateHoliday[train_rows])
  missing_df$StateHoliday[gap_rows] <- rep(usual_state_holiday, n_stores)

  usual_school_holiday <- as.numeric(modal_label(train$SchoolHoliday[train_rows]))
  missing_df$SchoolHoliday[gap_rows] <- rep(usual_school_holiday, n_stores)

  # logSales keeps its placeholder value; the NA assignment stays disabled:
  #   missing_df$logSales[missing_df$Date == date] <- rep(NA, length(missing_stores))
}

head(missing_df)

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,logSales
13,2,2014-07-01,,,1,1,0,0,0
13,3,2014-07-02,,,1,1,0,0,0
13,4,2014-07-03,,,1,1,0,0,0
13,5,2014-07-04,,,1,1,0,0,0
13,6,2014-07-05,,,1,0,0,0,0
13,7,2014-07-06,,,0,0,0,0,0


In [63]:
# Append the placeholder rows to the observed data and impute the unknown
# values per (Store, DayOfWeek, Open, Promo) group:
#   * closed days (Open == 0) get 0 for Sales, Customers and logSales;
#   * open days get the group median of Sales / Customers (truncated to
#     integer) and the group mean of logSales.
train$logSales <- log(train$Sales + 1)
train_filled_gap <- rbind(train, missing_df)
train_filled_gap <- train_filled_gap[order(train_filled_gap$Date), ]
train_filled_gap

train_filled_gap <- train_filled_gap %>%
  group_by(Store, DayOfWeek, Open, Promo) %>%
  mutate(
    # Flag the rows to impute BEFORE Sales is overwritten. The placeholder rows
    # carry logSales = 0 rather than NA, so testing is.na(logSales) alone would
    # never select them and they would keep a logSales of 0 next to a non-zero
    # imputed Sales.
    was_missing = is.na(Sales),
    Sales = as.integer(ifelse(is.na(Sales),
                              ifelse(Open == 0,
                                     0,
                                     median(Sales, na.rm = TRUE)),
                              Sales)),
    Customers = as.integer(ifelse(is.na(Customers),
                                  ifelse(Open == 0,
                                         0,
                                         median(Customers, na.rm = TRUE)),
                                  Customers)),
    # The mean uses observed rows only, so the placeholder zeros do not
    # drag it down.
    logSales = ifelse(was_missing | is.na(logSales),
                      ifelse(Open == 0,
                             0,
                             mean(logSales[!was_missing], na.rm = TRUE)),
                      logSales)
  ) %>%
  # Drop the grouping so later steps do not silently operate per group.
  ungroup() %>%
  dplyr::select(-was_missing)
train_filled_gap

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,logSales
1,2,2013-01-01,0,0,0,0,a,1,0
2,2,2013-01-01,0,0,0,0,a,1,0
3,2,2013-01-01,0,0,0,0,a,1,0
4,2,2013-01-01,0,0,0,0,a,1,0
5,2,2013-01-01,0,0,0,0,a,1,0
6,2,2013-01-01,0,0,0,0,a,1,0
7,2,2013-01-01,0,0,0,0,a,1,0
8,2,2013-01-01,0,0,0,0,a,1,0
9,2,2013-01-01,0,0,0,0,a,1,0
10,2,2013-01-01,0,0,0,0,a,1,0


Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,logSales
1,2,2013-01-01,0,0,0,0,a,1,0
2,2,2013-01-01,0,0,0,0,a,1,0
3,2,2013-01-01,0,0,0,0,a,1,0
4,2,2013-01-01,0,0,0,0,a,1,0
5,2,2013-01-01,0,0,0,0,a,1,0
6,2,2013-01-01,0,0,0,0,a,1,0
7,2,2013-01-01,0,0,0,0,a,1,0
8,2,2013-01-01,0,0,0,0,a,1,0
9,2,2013-01-01,0,0,0,0,a,1,0
10,2,2013-01-01,0,0,0,0,a,1,0


In [64]:
# Sanity check: rows that still have an NA in any of the imputed columns.
# An empty result means the imputation covered every row.
still_na <- is.na(train_filled_gap$Sales) |
  is.na(train_filled_gap$Customers) |
  is.na(train_filled_gap$logSales)
anything_missed <- train_filled_gap[still_na, ]
anything_missed

Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,logSales


In [65]:
# Sales over time for store 13 (the first store id in the filled-in rows shown
# above), drawn as a line with a smoothed trend on top.
# which() drops NA comparisons, the same way subset() does.
store13 <- train_filled_gap[which(train_filled_gap$Store == 13), ]
ggplot(data = store13, mapping = aes(x = Date, y = Sales)) +
  geom_line() +
  geom_smooth() +
  ggtitle("Revenue for Store 13 over time")

`geom_smooth()` using method = 'loess'


ERROR: Error in png(tf, width, height, "in", pointsize, bg, res, antialias = antialias): 无法启动png()装置


plot without title

In [None]:
# Number of rows (one per reporting store) recorded on each date.
by_Date <- summarise(group_by(train, Date), NumOfStores = n())
# ggplot(by_Date, aes(Date,NumOfStores)) + geom_line()

In [None]:
library(ggplot2)
# Load the three raw CSVs with data.table::fread(); the cells below use
# data.table syntax on them (e.g. train[Sales != 0], `:=`).
# NOTE: this assigns `train` again, so when the file is run top to bottom it
# replaces the object that was read with read_csv() and extended earlier.
test <- fread("./dataset/test.csv")
train <- fread("./dataset/train.csv")
store <- fread("./dataset/store.csv")

In [None]:
# Distribution of Sales across all rows, using 50 as the break suggestion.
hist(train$Sales, breaks = 50, main = "Histogram of Sales")

In [None]:
# Rows with non-zero sales (data.table row filter; all columns kept).
train[Sales != 0, ]

In [None]:
# Distribution of each store's mean Sales, computed over rows with non-zero
# sales only (used here as the proxy for "store was not closed").
# The filter is evaluated once, and the x axis gets an explicit label; by
# default hist() would label it with the entire deparsed expression.
open_sales <- train[Sales != 0]
mean_sales_per_store <- open_sales[, .(MeanSales = mean(Sales)), by = Store]$MeanSales
hist(mean_sales_per_store,
     breaks = 100,
     main = "Mean sales per store when store was not closed",
     xlab = "Mean sales per store")

In [None]:
# Column types of the freshly loaded training table.
str(train)

# Rows per date, drawn as a single line.
# NOTE(review): `group = 1` is presumably needed because Date has not been
# converted with as.Date() yet at this point (that happens in a later cell),
# so each date would otherwise form its own group - confirm.
by_Date <- summarise(group_by(train, Date), NumStores = n())
ggplot(data = by_Date, mapping = aes(x = Date, y = NumStores, group = 1)) +
  geom_line()

In [None]:
# Column names and types of the test table.
str(test)

In [None]:
# Column names and types of the store table.
str(store)

In [None]:
# First rows of the test table.
head(test)

In [None]:
# Convert train's Date column to class Date. `:=` updates the data.table by
# reference, so no re-assignment of `train` is needed.
train[, `:=`(Date = as.Date(Date))]
head(train)

In [None]:
# First rows of the store table.
head(store)

In [None]:
# Sort both tables by Date, then print per-column summaries of all three
# tables.
train <- train[order(Date), ]
test <- test[order(Date), ]

summary(train)
summary(test)
summary(store)
# head(train)

In [None]:
# Test rows whose Open flag is missing.
test[is.na(Open)]

In [None]:
# Open flags of store 622 in the test set, before the NAs are filled.
with(test, Open[Store == 622])

In [None]:
# Treat test rows with an unknown Open flag as open (Open = 1).
# Only the Open column is touched. A blanket `test[is.na(test)] <- 1` would
# also overwrite NAs in every other column (e.g. Date or StateHoliday) with a
# meaningless 1.
test[is.na(Open), Open := 1L]

In [None]:
# Open flags of store 622 again, to confirm that the NAs were replaced.
with(test, Open[Store == 622])

In [None]:
# Rows per date, plotted over time. Date is a real Date column by now, so no
# explicit grouping is required for the line.
by_Date <- summarise(group_by(train, Date), NumStores = n())
ggplot(data = by_Date, mapping = aes(x = Date, y = NumStores)) +
  geom_line()