In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file
# found, so the available datasets are visible in the cell output.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Store Sales Forecasting: ETS and ARIMA Models

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Load Data
# Files that carry a 'date' column are parsed to datetime on read so they can
# be used directly as a time index later; stores.csv is read without parsing.
train = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/train.csv",
    parse_dates=['date'],
)
test = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/test.csv",
    parse_dates=['date'],
)
store_info = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/stores.csv"
)
oil = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/oil.csv",
    parse_dates=['date'],
)
holidays = pd.read_csv(
    "/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv",
    parse_dates=['date'],
)

In [None]:
# Focus on one store and product family: Store 1, "GROCERY I"
is_store_1 = train['store_nbr'] == 1
is_grocery_1 = train['family'] == 'GROCERY I'
df = train[is_store_1 & is_grocery_1].copy()

# Index by date on a regular daily grid; days absent from the data are
# inserted by asfreq and their sales are set to 0.
df = (
    df[['date', 'sales']]
    .set_index('date')
    .asfreq('D')
    .fillna(0)
)

# Plot the sales data
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df.index, df['sales'], marker='o', linestyle='-', label='Sales')
ax.set_title('Daily Sales - Store 1, GROCERY I')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

In [None]:
# ## 2. Stationarity Check Using ADF Test
#
# The Augmented Dickey-Fuller (ADF) test checks whether the time series is
# stationary. A p-value larger than 0.05 suggests that the series is
# non-stationary.

# Perform ADF Test
result = adfuller(df['sales'])

# The first two entries of the returned tuple are the test statistic and
# its p-value.
adf_statistic, adf_p_value = result[0], result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", adf_p_value)

In [None]:
# ## 3. ETS Model: Exponential Smoothing
#
# The ETS model uses an additive trend and an additive seasonal component
# with a weekly seasonality (period of 7 days).

# Build the ETS Model
ets_spec = ExponentialSmoothing(
    df['sales'],
    trend='add',
    seasonal='add',
    seasonal_periods=7,
)
ets_model = ets_spec.fit()

# Keep the in-sample fitted values next to the actual sales for plotting.
df['ETS_Forecast'] = ets_model.fittedvalues

# Forecast the next 15 days
forecast_ets = ets_model.forecast(15)

# Plot ETS fit on training data
fig_fit, ax_fit = plt.subplots(figsize=(12, 4))
ax_fit.plot(df.index, df['sales'], label='Actual Sales')
ax_fit.plot(df.index, df['ETS_Forecast'], label='ETS Fitted', linestyle='--')
ax_fit.set_title('ETS Model Fit')
ax_fit.set_xlabel('Date')
ax_fit.set_ylabel('Sales')
ax_fit.legend()
plt.show()

# Plot ETS Forecast
# The x-axis is the 15 calendar days that follow the last training date.
first_forecast_day = df.index[-1] + pd.Timedelta(days=1)
ets_forecast_days = pd.date_range(first_forecast_day, periods=15, freq='D')

fig_fc, ax_fc = plt.subplots(figsize=(10, 4))
ax_fc.plot(ets_forecast_days, forecast_ets, marker='o', linestyle='-')
ax_fc.set_title('ETS Forecast - Next 15 Days')
ax_fc.set_xlabel('Date')
ax_fc.set_ylabel('Sales')
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation and partial autocorrelation of the daily sales series,
# used to judge which AR/MA orders are plausible for the ARIMA model.
plot_acf(df['sales'])
plot_pacf(df['sales'])

# plot_acf / plot_pacf return the Figure they draw on. If the cell ends with
# that return value, the notebook shows it as the cell output in addition to
# the inline rendering, so the PACF plot appears twice. Ending with
# plt.show() renders each figure exactly once.
plt.show()

In [None]:
# ## 4. ARIMA Model using SARIMAX
# Model orders used here:
# - Non-seasonal order: (1, 1, 1)
# - Seasonal order: (1, 1, 1, 7)

# Build ARIMA Model using SARIMAX
arima_spec = SARIMAX(
    df['sales'],
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 7),
)
arima_model = arima_spec.fit(disp=False)  # disp=False silences optimizer output
print(arima_model.summary())

# Forecast the next 15 days
forecast_arima = arima_model.get_forecast(steps=15).predicted_mean

# Plot ARIMA Forecast
# Show only the last 60 observed days so the 15-day forecast stays readable.
recent_sales = df['sales'][-60:]
arima_forecast_days = pd.date_range(
    df.index[-1] + pd.Timedelta(days=1), periods=15, freq='D'
)

fig_arima, ax_arima = plt.subplots(figsize=(10, 4))
ax_arima.plot(df.index[-60:], recent_sales, label='Actual Sales')
ax_arima.plot(arima_forecast_days, forecast_arima,
              marker='o', linestyle='-', label='ARIMA Forecast')
ax_arima.set_title('ARIMA Forecast - Next 15 Days')
ax_arima.set_xlabel('Date')
ax_arima.set_ylabel('Sales')
ax_arima.legend()
plt.show()

In [None]:
# Residual diagnostics for the fitted SARIMAX model (arima_model).
import statsmodels.api as sm
from statsmodels.stats.diagnostic import acorr_ljungbox

# Extract the residuals from the ARIMA model.
# The model differences the series (d=1, D=1 with period 7), and the states
# that carry the differencing are initialized diffusely. The residuals from
# that start-up period reflect the initialization rather than model fit, and
# would distort the ACF/PACF and the Ljung-Box statistics. statsmodels' own
# plot_diagnostics / test_serial_correlation drop the same leading
# observations, so we do the same here.
burn_in = int(max(arima_model.loglikelihood_burn,
                  getattr(arima_model, 'nobs_diffuse', 0)))
residuals = arima_model.resid.iloc[burn_in:]

# Plot the Residuals, ACF, and PACF for multiple lags
plt.figure(figsize=(14, 8))

# Residual Plot
plt.subplot(311)
plt.plot(residuals)
plt.title('Residuals of the SARIMAX Model')
plt.xlabel('Time')
plt.ylabel('Residuals')

# ACF Plot
plt.subplot(312)
sm.graphics.tsa.plot_acf(residuals, lags=30, ax=plt.gca())
plt.title('ACF of Residuals')

# PACF Plot
plt.subplot(313)
sm.graphics.tsa.plot_pacf(residuals, lags=30, ax=plt.gca(), method='ywm')
plt.title('PACF of Residuals')

plt.tight_layout()
plt.show()

# Ljung-Box test for autocorrelation at multiple lags
ljung_box_results = acorr_ljungbox(residuals, lags=[1, 2, 3, 4, 5, 10, 15, 20], return_df=True)
print("Ljung-Box test results:")
print(ljung_box_results)

# Interpreting results:
# The null hypothesis is that the residuals have no autocorrelation up to the
# given lag. A high p-value means we fail to reject it (residuals look like
# white noise); a low p-value indicates remaining autocorrelation.


In [None]:
# ## 5. Preparing Submission Files (ETS)
# Step 1. Prepare the training data for store 1 and "GROCERY I"
df = train[(train['store_nbr'] == 1) & (train['family'] == 'GROCERY I')].copy()
df = df[['date', 'sales']].set_index('date').sort_index().asfreq('D').fillna(0)

# Fit the ETS model (example configuration)
ets_model = ExponentialSmoothing(df['sales'], trend='add', seasonal='add', seasonal_periods=7).fit()

# Step 2. Prepare the test file
test['date'] = pd.to_datetime(test['date'])

# Every store/family combination other than store 1 / "GROCERY I" is
# submitted as 0.0, so start from an all-zero float column.
test['sales'] = 0.0

# Rows of the test set that this model actually forecasts
mask = (test['store_nbr'] == 1) & (test['family'] == 'GROCERY I')
target_dates = test.loc[mask, 'date']

# Step 3. Forecast exactly as far as the test set requires.
# The horizon is derived from the last test date instead of being hard-coded:
# with a fixed horizon, any test day beyond it would find no forecast and be
# silently filled with 0.0 below.
last_train_date = df.index[-1]
if target_dates.empty:
    horizon = 15  # no matching test rows; keep the original default horizon
else:
    horizon = max(int((target_dates.max() - last_train_date).days), 1)

forecast_ets = ets_model.forecast(horizon)

# Prepare a forecast DataFrame for the dates and predicted sales
forecast_dates = pd.date_range(start=last_train_date + pd.Timedelta(days=1), periods=horizon, freq='D')
forecast_df = pd.DataFrame({'date': forecast_dates, 'predicted_sales': forecast_ets.values})
print("Forecast DataFrame:")
print(forecast_df)

# Step 4. Map the forecast onto the matching test rows by date
forecast_series = forecast_df.set_index('date')['predicted_sales']
mapped_sales = target_dates.map(forecast_series)

# Dates without a forecast (e.g. on or before the last training date) still
# fall back to 0.0, but the fallback is reported rather than hidden.
n_unmatched = int(mapped_sales.isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} test rows have no forecast; filling with 0.0")
test.loc[mask, 'sales'] = mapped_sales.fillna(0.0)

# Step 5. Create the submission DataFrame.
# 'sales' is already a float column with no missing values at this point, so
# no further casting or null handling is needed.
submission = test[['id', 'sales']].copy()

# The ETS submission is intentionally not written to disk (the ARIMA cell
# writes submission.csv), so the message must not claim a file was saved.
print("ETS submission prepared in `submission` (not written to disk)")


In [None]:
# ARIMA submission
# Step 1. Filter training data for store 1 and GROCERY I
df = train[(train['store_nbr'] == 1) & (train['family'] == 'GROCERY I')].copy()
df = df[['date', 'sales']].set_index('date').sort_index().asfreq('D').fillna(0)

# Step 2. Fit SARIMAX model.
# Same orders as the exploratory fit, (1, 1, 1) x (1, 1, 1, 7), but NOTE that
# this fit additionally relaxes the stationarity and invertibility
# constraints, so its parameters can differ from the model diagnosed earlier.
model = SARIMAX(df['sales'], order=(1, 1, 1), seasonal_order=(1, 1, 1, 7), enforce_stationarity=False, enforce_invertibility=False)
results = model.fit(disp=False)

# Step 3. Prepare test set
test['date'] = pd.to_datetime(test['date'])
test['sales'] = 0.0  # every other store/family is submitted as 0.0

# Rows of the test set that this model actually forecasts
mask = (test['store_nbr'] == 1) & (test['family'] == 'GROCERY I')
target_dates = test.loc[mask, 'date']

# Step 4. Forecast exactly as far as the test set requires.
# The horizon is derived from the last test date instead of being hard-coded:
# with a fixed horizon, any test day beyond it would find no forecast and be
# silently filled with 0.0 below.
last_train_date = df.index[-1]
if target_dates.empty:
    horizon = 15  # no matching test rows; keep the original default horizon
else:
    horizon = max(int((target_dates.max() - last_train_date).days), 1)

forecast_arima = results.forecast(steps=horizon)

# Create forecast DataFrame
forecast_dates = pd.date_range(start=last_train_date + pd.Timedelta(days=1), periods=horizon, freq='D')
forecast_df = pd.DataFrame({'date': forecast_dates, 'predicted_sales': forecast_arima.values})

# Step 5. Map forecast values to matching test rows by date
forecast_series = forecast_df.set_index('date')['predicted_sales']
mapped_sales = target_dates.map(forecast_series)

# Dates without a forecast (e.g. on or before the last training date) still
# fall back to 0.0, but the fallback is reported rather than hidden.
n_unmatched = int(mapped_sales.isna().sum())
if n_unmatched:
    print(f"Warning: {n_unmatched} test rows have no forecast; filling with 0.0")
test.loc[mask, 'sales'] = mapped_sales.fillna(0.0)

# Step 6. Write submission.
# 'sales' is already a float column with no missing values, so it is written
# as-is.
submission = test[['id', 'sales']].copy()
submission.to_csv('submission.csv', index=False)
print("ARIMA submission saved as submission.csv")