In [1]:
required_packages <- c(
  "forecast",       # ARIMAX modeling
  "dplyr",          # Data manipulation
  "tidyr",          # pivot_wider/pivot_longer
  "prophet",        # xreg forecasting
  "thief",          # h
  "TSrepr",         # Errors
  "progress",
  "doParallel",
  "foreach"
)

# Install missing packages
new_packages <- required_packages[!required_packages %in% installed.packages()[,"Package"]]
if(length(new_packages)) install.packages(new_packages)

# Load all packages
invisible(lapply(required_packages, library, character.only = TRUE))

also installing the dependencies ‘hms’, ‘prettyunits’, ‘iterators’, ‘codetools’


Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done

Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Rcpp

Loading required package: rlang

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
num_workers <- min(8, parallel::detectCores() - 2)  # Conservative number
cl <- makeCluster(num_workers)  # Instead of detectCores()-1
registerDoParallel(cl)

In [3]:
sales <- read.csv("sales_train_validation.csv", stringsAsFactors = FALSE)
calendar <- read.csv("calendar.csv", stringsAsFactors = FALSE)

In [4]:
calendar <- calendar %>%
  mutate(
    date = as.Date(date),
    is_holiday = 0  # Initialize all as 0 (not holidays)
  )

# Loop through each row (day) and check if it has any holiday-related events
for (i in 1:nrow(calendar)) {
  if (calendar$event_name_1[i] != "" | calendar$event_type_1[i] != "" |
      calendar$event_name_2[i] != "" | calendar$event_type_2[i] != "") {
    calendar$is_holiday[i] <- 1  # Set is_holiday to 1 if there's any event
  }
}

In [5]:
sales <- sales %>% mutate(row_id = row_number())
item_metadata <- sales %>% select(row_id, dept_id)
sales_long <- sales %>%
  select(row_id, starts_with("d_")) %>%
  pivot_longer(cols = starts_with("d_"), names_to = "day", values_to = "value")

In [6]:
dept_sales <- sales_long %>%
  left_join(calendar %>% select(day = d, date), by = "day") %>%
  left_join(item_metadata, by = "row_id") %>%
  group_by(date, dept_id) %>%
  summarise(dept_sales = sum(value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = dept_id,
    values_from = dept_sales,
    values_fill = 0
  ) %>%
  arrange(date)

In [7]:
sales_with_categories <- sales %>%
  mutate(
    row_id = row_number(),
    category = case_when(
      grepl("^FOODS", item_id) ~ "FOODS",
      grepl("^HOBBIES", item_id) ~ "HOBBIES",
      grepl("^HOUSEHOLD", item_id) ~ "HOUSEHOLD",
      TRUE ~ "OTHER"
    )
  ) %>%
  filter(category != "OTHER")

In [8]:
category_sales <- sales_with_categories %>%
  select(row_id, category, starts_with("d_")) %>%
  pivot_longer(cols = starts_with("d_"), names_to = "day", values_to = "value")

In [9]:
category_sales_aggregated <- category_sales %>%
  left_join(calendar %>% select(day = d, date), by = "day") %>%
  group_by(date, category) %>%
  summarise(category_sales = sum(value, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(
    names_from = category,
    values_from = category_sales,
    values_fill = 0
  ) %>%
  arrange(date)

In [10]:
historical_regressors <- calendar %>%
  select(date, is_holiday) %>%
  left_join(
    sales %>% 
      select(starts_with("d_"), dept_id) %>%
      pivot_longer(starts_with("d_"), names_to = "day", values_to = "sales") %>%
      left_join(calendar %>% select(day = d, date), by = "day") %>%
      group_by(date, dept_id) %>%
      summarise(dept_sales = sum(sales), .groups = "drop") %>%
      pivot_wider(names_from = dept_id, values_from = dept_sales),
    by = "date"
  ) %>%
  left_join(
    sales %>% 
      select(starts_with("d_"), cat_id) %>%
      pivot_longer(starts_with("d_"), names_to = "day", values_to = "sales") %>%
      left_join(calendar %>% select(day = d, date), by = "day") %>%
      group_by(date, cat_id) %>%
      summarise(cat_sales = sum(sales), .groups = "drop") %>%
      pivot_wider(names_from = cat_id, values_from = cat_sales),
    by = "date"
  )

In [11]:
# Future regressors (from external file)
future_regressors <- read.csv("prognozes_platus_formatas.csv")

In [12]:
# Safe ARIMAX forecasting function
forecast_item <- function(item_data, item_id, store_id, dept_id, cat_id) {
  # Prepare time series (weekly seasonality)
  ts_data <- ts(item_data$sales, frequency = 7)
  
  # Historical regressors for this item
  xreg_hist <- historical_regressors %>%
    filter(date %in% item_data$date) %>%
    select(is_holiday, all_of(dept_id), all_of(cat_id)) %>%
    as.matrix()
  
  # Future regressors for this item
  xreg_future <- future_regressors %>%
    select(is_holiday, all_of(dept_id), all_of(cat_id)) %>%
    as.matrix()
  
  # Fit model with error handling
  fit <- tryCatch({
    auto.arima(ts_data, xreg = xreg_hist, lambda=0, biasadj=TRUE, seasonal = TRUE, stepwise = TRUE)
  }, error = function(e) {
    tryCatch({
      # Fallback 1: ARIMA with fixed order
      Arima(ts_data, order = c(1,1,1), seasonal = c(0,1,1), xreg = xreg_hist)
    }, error = function(e) {
      # Fallback 2: Pure ARIMA
      auto.arima(ts_data)
    })
  })
  
  # Generate forecast
  if ("xreg" %in% names(fit$call)) {
    fc <- forecast(fit, h = 28, xreg = xreg_future)
  } else {
    fc <- forecast(fit, h = 28)
  }
  
  # Return results
  data.frame(
    item_id = item_id,
    store_id = store_id,
    date = seq(max(item_data$date) + 1, length.out = 28, by = "day"),
    forecast = as.numeric(fc$mean),
    stringsAsFactors = FALSE
  )
}

In [13]:
# Prepare cluster
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)

# Export required functions to workers
clusterExport(cl, c("forecast_item", "historical_regressors", "future_regressors", "calendar"))

# Load packages in workers
clusterEvalQ(cl, {
  library(forecast)
  library(dplyr)
  library(tidyr)  # <-- Critical fix
})

# Process all items
all_forecasts <- foreach(
  i = 1:nrow(sales), 
  .combine = rbind,
  .packages = c("dplyr", "forecast", "tidyr")
) %dopar% {
  item <- sales[i, ]
  
  # Prepare item sales data
  item_data <- item %>%
    select(starts_with("d_")) %>%
    pivot_longer(starts_with("d_"), names_to = "day", values_to = "sales") %>%
    left_join(calendar %>% select(day = d, date), by = "day") %>%
    arrange(date)
  
  # Run forecasting
  forecast_item(
    item_data,
    item_id = item$item_id,
    store_id = item$store_id,
    dept_id = item$dept_id,
    cat_id = item$cat_id
  )
}

# Stop cluster
stopCluster(cl)

In [14]:
# Convert to submission format
submission <- all_forecasts %>%
  mutate(day = paste0("F", rep(1:28, length.out = nrow(all_forecasts)))) %>%
  select(item_id, store_id, day, forecast) %>%
  pivot_wider(names_from = day, values_from = forecast)

# Save to CSV
write.csv(submission, "submission_arimax.csv", row.names = FALSE)

In [15]:
sales_out <- read.csv("sales_test_validation.csv", stringsAsFactors = FALSE)
stat_total <- read.csv("stat_total.csv", stringsAsFactors = FALSE)
forecasts <- read.csv("submission_arimax.csv", stringsAsFactors = FALSE)  # Your forecast file

In [16]:
# Convert sales_out to long format
actuals_long <- sales_out %>%
  pivot_longer(
    cols = starts_with("d_"),
    names_to = "day",
    values_to = "actual"
  ) %>%
  mutate(
    day_num = as.numeric(gsub("d_", "", day)),
    series_id = paste(item_id, store_id, sep = "_")
  )

# Convert forecasts to long format
forecasts_long <- forecasts %>%
  pivot_longer(
    cols = starts_with("F"),
    names_to = "day",
    values_to = "forecast"
  ) %>%
  mutate(
    day_num = as.numeric(gsub("F", "", day)) + 1913,
    series_id = paste(item_id, store_id, sep = "_")
  )

In [17]:
weights <- stat_total %>%
  mutate(
    series_id = paste(item_id, store_id, sep = "_"),
    weight = dollar_sales/sum(dollar_sales)  # Normalize to create weights
  ) %>%
  select(series_id, weight)

In [18]:
results <- actuals_long %>%
  # Clean column names first
  select(item_id, store_id, day_num, actual, series_id) %>%
  inner_join(
    forecasts_long %>% 
      select(item_id, store_id, day_num, forecast, series_id),
    by = c("series_id", "day_num"),
    suffix = c("", ".y")
  ) %>%
  # Calculate scaling factors
  group_by(series_id) %>%
  mutate(
    scale = mean(abs(diff(actual))),  # Scaling factor per series
    scaled_error = (forecast - actual)/scale
  ) %>%
  ungroup()

In [19]:
avg_scale <- mean(results$scale, na.rm = TRUE)
results <- results %>%
  mutate(scale = ifelse(scale == 0 | is.na(scale), avg_scale * 0.01, scale))

In [20]:
results <- results %>%
  mutate(scaled_error = ifelse(is.infinite(scaled_error), 0, scaled_error))

In [21]:
wrmsse <- results %>%
  left_join(weights, by = "series_id") %>%
  group_by(series_id) %>%
  summarise(
    rmse = sqrt(mean(scaled_error^2, na.rm = TRUE)),
    weighted_rmse = rmse * first(weight)
  ) %>%
  summarise(
    WRMSSE = sum(weighted_rmse, na.rm = TRUE)
  ) %>%
  pull(WRMSSE)

print(paste("Final WRMSSE:", round(wrmsse, 4)))

[1] "Final WRMSSE: 1.1387"
