# Store Sales Forecasting

### Modules Needed

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import warnings 
import matplotlib.dates as mdates


from darts import TimeSeries
from darts.models import LightGBMModel
from darts.dataprocessing.transformers import Scaler
from darts.metrics import rmse

warnings.filterwarnings('ignore')

### Data Import and Merging

In [None]:
#"C:\Users\hp\Downloads\playground-series-s5e2\store-sales-time-series-forecasting\submission.csv"
# Load the raw CSV inputs. Paths are relative, so they resolve against the
# notebook's current working directory.
train_x = pd.read_csv("train.csv")
test_x = pd.read_csv("test.csv")
oil = pd.read_csv("oil.csv")
store = pd.read_csv("stores.csv")
transactions = pd.read_csv("transactions.csv")
holiday = pd.read_csv("holidays_events.csv")
train_x.head()  # preview the first training rows

In [None]:
# Prep for merger: flag where each row came from (0 = train, 1 = test) so the
# two frames can be separated again after the shared feature engineering.
train_x["dataset"] = 0
test_x["dataset"] = 1

# Stack train and test row-wise. No ignore_index is passed, so each frame's
# original row labels are carried over as-is.
train = pd.concat([train_x,test_x], axis=0).copy()

In [None]:
# Enrich a combined train/test frame with the auxiliary tables.
def data_merge(data):
    """Left-join the auxiliary tables onto ``data`` and index the result.

    Joins, in order: ``oil`` on ``date``; ``store`` on ``store_nbr``;
    ``transactions`` on ``date`` and ``store_nbr``; ``holiday`` where the
    holiday's ``date``/``locale_name`` equal the row's ``date``/``city``.
    All joins are left joins, so rows of ``data`` without a match are kept.

    Returns the merged frame indexed by ``(store_nbr, date, family)``.
    """
    merged = (
        data.merge(oil, on="date", how="left")
        .merge(store, on="store_nbr", how="left")
        .merge(transactions, on=["date", "store_nbr"], how="left")
        .merge(
            holiday,
            left_on=["date", "city"],
            right_on=["date", "locale_name"],
            how="left",
        )
    )
    return merged.set_index(["store_nbr", "date", "family"])

train = data_merge(train)
# Remove every row whose `date` index level (level 1) is '2013-01-01', then
# turn the three index levels back into ordinary columns.
train = train.drop(index='2013-01-01', level=1).reset_index()
train.head()

#### Sort dataset based on key variables

In [None]:
# Order rows by store, then family, then date, so that each (store, family)
# group is in date order for the shift/rolling features computed next.
train = train.sort_values(["store_nbr","family","date"])

#### Generate Lag features on Sales

In [None]:
# Prepare features
# Lagged and rolling features are computed inside each (store_nbr, family)
# group. `train` must already be sorted by store_nbr, family, date so that
# shifting by rows moves backwards in time.
group_keys = ["store_nbr", "family"]

lag_features = [1, 7]   # lag sizes in rows, chosen based on length of prediction
for i in lag_features:
    # fillna() returns a new Series rather than modifying in place, so its
    # result has to be assigned; otherwise the leading NaNs stay in the column.
    train[f'lag_{i}'] = train.groupby(group_keys)["sales"].shift(i).fillna(0)
    train[f'transaction_lag_{i}'] = (
        train.groupby(group_keys)["transactions"].shift(i).fillna(0)
    )

lag_features1 = [15, 30, 90]   # rolling-window sizes in rows
for i in lag_features1:
    # shift(1) keeps the current row's own sales out of its rolling mean.
    train[f'rolling_mean_{i}'] = (
        train.groupby(group_keys)["sales"]
        .transform(lambda x: x.shift(1).rolling(i).mean())
        .fillna(0)
    )





#### Casting Columns types

In [None]:
def object_cat(df):
    """Cast column types in place and return the same DataFrame.

    The ``date`` column is parsed with ``pd.to_datetime`` (values that cannot
    be parsed become ``NaT``). Every other column whose dtype is ``object`` is
    converted to ``category``. All remaining columns are left untouched.
    """
    # df.dtypes is read once up front, so conversions made inside the loop do
    # not affect the decision for later columns.
    for name, dtype in df.dtypes.items():
        if name == "date":
            df["date"] = pd.to_datetime(df["date"], errors="coerce")
            continue
        if dtype == "object":
            df[name] = df[name].astype("category")
    return df

# Parse `date` to datetime and turn the object-typed columns into categoricals.
train = object_cat(train)

#### Create Time Step Feature

In [None]:
# Dense rank of the date: rows sharing a date get the same integer, and the
# integers increase with the date starting at 1 with no gaps between them.
train['time_step'] = train['date'].rank(method="dense", ascending=True).astype(int)

#### Generate List for categorical and numeric columns

In [None]:
# cat_col: columns whose dtype compares equal to object, category or bool
# (the `date` column is excluded by name).
cat_col = [x for x, y in zip(train.columns, train.dtypes) if y in ["object", "category","bool"] and x != "date"]
# num_col: every remaining column except the identifier/bookkeeping columns
# id, date, family and dataset.
num_col = [x for x, y in zip(train.columns, train.dtypes) if y not in ["object", "category","bool"] and x not in ["id","date","family","dataset"]]


#### Creation of Train X and Y

In [None]:
# Use `id` as the row index so it travels with every row through the split.
train = train.set_index("id")
# dataset == 0 marks rows that came from train.csv.
train_x = train[train["dataset"]==0].copy()

# dataset == 1 marks rows that came from test.csv.
test_x = train[train["dataset"]==1].copy()
# Independent copy of the training rows used for the exploratory plots.
train_explore = train_x.copy()



### Dealing with Missing Values and Column Types

In [None]:
# Oil-price gaps: 3-row rolling mean, then back-fill; other numeric gaps: zero.
def fill_na_groups(train_xx):
    """Impute missing values in ``train_xx`` in place and return it.

    Within each ``store_nbr`` group, a missing ``dcoilwtico`` value is replaced
    by the rolling mean of the 3-row window ending at that row (at least one
    non-missing value required); whatever is still missing is back-filled from
    the next available value in the group. Afterwards every remaining NaN in
    the columns listed in ``num_col`` is set to 0.
    """
    def _impute_oil(prices):
        window_mean = prices.rolling(3, min_periods=1).mean()
        return prices.fillna(window_mean).bfill()

    oil_by_store = train_xx.groupby(["store_nbr"])["dcoilwtico"]
    train_xx["dcoilwtico"] = oil_by_store.transform(_impute_oil)
    train_xx[num_col] = train_xx[num_col].fillna(0)
    return train_xx

# Apply the same imputation to the modelling, test and exploration frames.
# fill_na_groups modifies its argument and returns it, so the reassignment
# keeps each name bound to the same (now imputed) frame.
train_x = fill_na_groups(train_x)
test_x = fill_na_groups(test_x)
train_explore = fill_na_groups(train_explore)

train_x.head()


### Exploratory Analysis

 #### Numeric Columns

##### Histogram and Scatterplot

In [None]:
# Rows dated on or before 2024-01-01.
# NOTE(review): train_explore1 is not referenced again in the visible notebook.
train_explore1 = train_explore[train_explore["date"] <="2024-01-01"]

#### Line Plot and Regression Plot

In [None]:
plt.style.use("seaborn-whitegrid")
plt.rc(
    "figure",
    autolayout=True,
    figsize=(16, 5),
    titlesize=18,
    titleweight='bold',
)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)
%config InlineBackend.figure_format = 'retina'

# First figure: sales against the integer time step, with a regression line.
fig, ax = plt.subplots()

ax.plot('time_step', 'sales',data=train_explore[num_col], color='0.75')
# No ax= is passed, so regplot draws on the current axes and returns it.
ax =sns.regplot(x='time_step', y='sales', data=train_explore[num_col], ci=None, scatter_kws=dict(color='0.25'),)
ax.set_title(f'Time Plot of Sales ');

# Plot
# Second figure: the same relationship, but with the date converted to
# Matplotlib's numeric date representation so the axis can be date-formatted.
train_explore["date_ordinal"] = train_explore["date"].map(mdates.date2num)
fig, ax = plt.subplots(figsize=(30, 10))
sns.regplot(x="date_ordinal", y="sales", data=train_explore, ci=None, scatter_kws={"color": "0.25"}, ax=ax)

# Format x-axis
ax.xaxis.set_major_locator(mdates.YearLocator(1))  # Outer level: Every year
ax.xaxis.set_minor_locator(mdates.MonthLocator())  # Inner level: Every month
ax.xaxis.set_major_formatter(mdates.DateFormatter("\n%Y"))  # Year with newline
ax.xaxis.set_minor_formatter(mdates.DateFormatter("%b"))  # Month (Jan, Feb, etc.)

# Convert back to datetime scale
#ax.set_xticks(train_explore["date_ordinal"][::3])  # Adjust tick density
#ax.set_xticklabels(train_explore["date"].dt.strftime("%b\n%Y")[::3])

# Rotate month labels
plt.setp(ax.get_xticklabels(minor=True), rotation=45, ha="right")
# Improve readability
# The pyplot calls below act on the current axes, i.e. the second figure's.
plt.xticks(rotation=0)  # Rotate if needed
plt.xlabel("Date")
plt.ylabel("Sales")
plt.title("Sales Trend Over Time")

plt.show()


In [None]:
# One panel per store: sales against time_step plus a regression line.
# zip() stops at the shorter input, so at most 9 * 6 = 54 stores are drawn.
fig, ax = plt.subplots(9, 6, figsize=(60, 40))
ax = ax.flatten()
# The loop variable is not called `store` on purpose: that name holds the
# stores.csv DataFrame read by data_merge, and rebinding it to a store number
# would break any re-run of the merge cell.
for axs, store_id in zip(ax, train_explore["store_nbr"].unique()):
    train_explore2 = train_explore[train_explore["store_nbr"] == store_id]
    axs.plot('time_step', 'sales', data=train_explore2[num_col], color='0.5')
    sns.regplot(x='time_step', y='sales', data=train_explore2[num_col], ci=None, scatter_kws=dict(color='0.25'), ax=axs)
    axs.set_title(f'Time Plot of Sales store_nbr {store_id}');

###### The time plot highlights that, for most stores, maximum sales increase over the years, which shows the importance of time as a factor. Further analysis based on monthly turnover follows.

##### Lag Plot on Sales

In [None]:
# Scatter of current sales against the 7-row lag of sales, with a regression
# line, over all stores and families at once.
fig, ax = plt.subplots()
ax = sns.regplot(x='lag_7', y='sales', data=train_explore, ci=None, scatter_kws=dict(color='0.25'))
# Equal scaling on both axes so a 1:1 relationship appears as a 45-degree line.
ax.set_aspect('equal')
ax.set_title('Lag Plot of Sales');

###### Looking at the overall lag plot across all sales, no clear relationship can be established between lagged sales (here, the 7-day lag) and current-day sales. As with the time-step plot, we explore this further for individual store numbers.

In [None]:
# One panel per store: current sales against the 7-row lag of sales.
# zip() stops at the shorter input, so at most 9 * 6 = 54 stores are drawn.
fig, ax = plt.subplots(9, 6, figsize=(60, 40))
ax = ax.flatten()
# The loop variable is not called `store` on purpose: that name holds the
# stores.csv DataFrame read by data_merge.
for axs, store_id in zip(ax, train_explore["store_nbr"].unique()):
    train_explore2 = train_explore[train_explore["store_nbr"] == store_id]
    sns.regplot(x='lag_7', y='sales', data=train_explore2[num_col], ci=None, scatter_kws=dict(color='0.25'), ax=axs)
    # These panels are lag plots, so the title says so.
    axs.set_title(f'Lag Plot of Sales store_nbr {store_id}');

###### From the individual plots, we can see a clearer serial dependence: earlier sales (here, the 7-day lag) are related to current-day sales. Further analysis could be done at a yearly, monthly or weekly level based on averages; however, this is not available in this version of the code.

In [None]:
# Store 5 only, indexed by date. The frame still has several rows per date
# (one per product family) and is ordered by family first, then date.
train_explore2 = train_explore[train_explore["store_nbr"] == 5].set_index("date")#.to_period()

# Collapse to one total per calendar day before smoothing. Rolling directly
# over the per-family rows would make a 365-row window span rows of mixed
# families rather than 365 consecutive days. groupby sorts the dates.
daily_sales = train_explore2["sales"].groupby(level="date").sum()

moving_average = daily_sales.rolling(
    window=365,       # 365 daily observations
    center=True,      # puts the average at the center of the window
    min_periods=183,  # choose about half the window size
).mean()              # compute the mean (could also do median, std, min, max, ...)

ax = daily_sales.plot(style=".", color="0.5")
moving_average.plot(
    ax=ax, linewidth=3, title="Sales Prediction - 365-Day Moving Average", legend=False,
);

##### Line plot of Sales Aggregates

In [None]:
# Median of each numeric column per calendar day ("MM-DD") and store, one
# panel per column. Zero values are excluded before taking the median.
train_explore["month_day"] = train_explore["date"].dt.strftime("%m-%d")
fig, axes = plt.subplots(6, 1, figsize=(20, 5 * 6), constrained_layout=True)

plot_columns = [col for col in num_col if col != "time_step"]
for ax, x in zip(axes, train_explore[plot_columns].columns):
    if x == "cluster":
        # Nothing is drawn for `cluster`; its panel stays empty.
        continue

    if x == "store_nbr":
        # The store panel shows sales, since store_nbr is already the hue.
        target = "sales"
        panel_title = f"Lineplot of Median {x} based on month and store "
    else:
        target = x
        panel_title = f"Lineplot of (Median) {x} based on month and store"

    non_zero = train_explore.loc[train_explore[target] != 0]
    medians = non_zero.groupby(["month_day", "store_nbr"])[target].median().reset_index()

    sns.lineplot(
        x="month_day",
        y=target,
        ax=ax,
        alpha=0.8,
        palette="coolwarm",
        hue="store_nbr",
        data=medians,
    )
    ax.set_title(panel_title)
    ax.xaxis.set_major_locator(mdates.MonthLocator())  # Show one tick per month
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
    ax.tick_params(axis='both', labelsize=20)


##### Heatmap

In [None]:
# Zero-padded month number ("01".."12") as a string, used for the monthly plots.
train_explore["month"] = train_explore["date"].dt.strftime("%m") 
# Rows dated on or before 2013-01-15; only the disabled pairplot below uses it.
train_explore2 = train_explore[train_explore["date"] <= "2013-01-15"]
#sns.pairplot(train_explore2[["sales"] + num_col], kind = 'reg', diag_kind="kde", hue ="store_nbr",
 #x_vars=["sales","transactions","dcoilwtico"] ,y_vars=["sales","transactions","dcoilwtico"],)

##### Analysis of Top 10 Performing Stores in Terms of Total Sales

In [None]:
# Total sales per store (zero-sales rows excluded), highest first.
nonzero_sales = train_explore.loc[train_explore["sales"] != 0]
train_sales = nonzero_sales.groupby(["store_nbr"])["sales"].sum().reset_index()

# Keep the ten best stores and every exploration row that belongs to them.
top10 = train_sales.sort_values("sales", ascending=False).head(10)
top10stores = top10["store_nbr"].to_numpy()
train_top10 = train_explore[train_explore["store_nbr"].isin(top10stores)]
sorted(top10stores)

In [None]:
# Work on an independent copy so that columns added to train_explore2 later
# are not written into train_top10 (itself a filtered slice of train_explore).
train_explore2 = train_top10.copy()  # [train_top10["date"] <= "2020-01-31"]

# NOTE: despite its name, top5_stores holds the distinct month labels produced
# by strftime("%m"). The name is kept because later cells reuse it.
top5_stores = train_top10["month"].unique()

# One figure with a 4 x 3 grid of panels, one panel per month.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(18, 25), sharex=True, sharey=True, constrained_layout=True)

# Flatten axes for easier iteration
axes = axes.flatten()

# The loop variable is named `month` rather than `store`, because `store` is
# the stores.csv DataFrame read by data_merge and must not be overwritten.
for i, month in enumerate(top5_stores):
    sns.kdeplot(data=train_explore2[train_explore2["month"] == month],
                x="sales", hue="store_nbr", ax=axes[i],)
    axes[i].set_title(f"Month {month}")

plt.tight_layout()
plt.show()

###### The plot shows sales distributions by store across all 12 months. Sales are highly right-skewed, with most values clustered near zero and occasional high outliers. The distribution shape is consistent, suggesting stable seasonality with some store-specific spikes. This suggests applying a log transformation to sales (`np.log1p`) to reduce skewness, and converting back (`np.expm1`) after the analysis.

In [None]:
# One figure with a 4 x 3 grid of panels, one panel per month.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(18, 25), sharex=True, sharey=True, constrained_layout=True)

# Flatten axes for easier iteration
axes = axes.flatten()

# assign() returns a new frame, so the log column is never written into the
# frame that train_explore2 was taken from (avoids chained-assignment issues).
train_explore2 = train_explore2.assign(sales_log=np.log1p(train_explore2["sales"]))

# top5_stores holds month labels; the loop variable is named accordingly and
# does not overwrite the `store` DataFrame read by data_merge.
for i, month in enumerate(top5_stores):
    sns.kdeplot(data=train_explore2[train_explore2["month"] == month],
                x="sales_log", hue="store_nbr", ax=axes[i],)
    axes[i].set_title(f"Month {month}")

plt.tight_layout()
plt.show()

###### After log transformation (`sales_log`), the sales distributions across stores and months appear more symmetric and multi-modal, with reduced skewness. Peaks are better aligned across stores, indicating improved comparability and stabilized variance ideal for modeling.

In [None]:
# Top-10-store rows up to 2014-01-31 with sales in the range (1, 1000].
train_explore3 = train_top10[(train_top10["date"] <= "2014-01-31") & (train_top10["sales"]>1) & (train_top10["sales"]<=1000)]
# One figure with a 4 x 3 grid of panels, one panel per month.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(12, 15), sharex=True, sharey=True, constrained_layout=True)

# Flatten axes for easier iteration
axes = axes.flatten()

# top5_stores holds month labels; the loop variable is named accordingly and
# does not overwrite the `store` DataFrame read by data_merge.
for i, month in enumerate(top5_stores):
    sns.boxplot(data=train_explore3[train_explore3["month"] == month],
                x="month", y="sales", hue="store_nbr", ax=axes[i])
    axes[i].set_title(f"Month {month}")
    # Hide the per-panel legend if seaborn created one; a single shared legend
    # is added to the figure below.
    panel_legend = axes[i].get_legend()
    if panel_legend is not None:
        panel_legend.remove()

handles, labels = axes[1].get_legend_handles_labels()  # Get legend entries from one subplot
fig.legend(handles, labels, title="Store", loc="upper left", ncol=4, fontsize=6)
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()


###### This boxplot shows monthly sales distributions across multiple stores. Key points:

- **Sales values are fairly consistent** across months and stores, with median sales mostly between 200–400.
- All months show **significant outliers**, especially beyond 800, indicating sporadic high sales days.
- Some stores (e.g. Store 8 and Store 50) consistently show **higher medians** than others, suggesting stronger performance.
- Variance is generally stable across months, showing no clear seasonal spike.

### Darts Analysis

In [None]:
# Forecast every (store_nbr, family) series on its own with a Darts LightGBM
# model. To debug on a subset, filter both frames first, e.g.:
#train_x = train_x[train_x["store_nbr"] <= 3]
#test_x = test_x[test_x["store_nbr"] <= 3]

train_x['date'] = pd.to_datetime(train_x['date'])
test_x['date'] = pd.to_datetime(test_x['date'])

all_preds = []

# The loop variable is `store_nbr`, not `store`: `store` is the stores.csv
# DataFrame read by data_merge and must not be overwritten.
for (store_nbr, family), group_train in train_x.groupby(["store_nbr", "family"]):
    group_test = test_x[(test_x["store_nbr"] == store_nbr) & (test_x["family"] == family)]

    # Number of steps to forecast; nothing to do for a series with no test rows.
    horizon = len(group_test)
    if horizon == 0:
        continue

    # Known sales followed by the test dates with sales left missing.
    full_df = pd.concat([
        group_train[['date', 'sales']],
        group_test[['date']].assign(sales=np.nan)
    ])

    # Remove duplicates by date (if any)
    full_df = full_df.drop_duplicates(subset='date', keep='first')

    try:
        series = TimeSeries.from_dataframe(
            full_df,
            time_col='date',
            value_cols='sales',
            fill_missing_dates=True,
            freq="D"
        )
    except Exception as e:
        # Best effort: report the failing series and carry on with the rest.
        print(f"Skipping store {store_nbr}, family {family} due to error: {e}")
        continue

    # log1p then scale; both steps are inverted after forecasting.
    scaler = Scaler()
    series_log = series.map(np.log1p)
    series_scaled = scaler.fit_transform(series_log)

    # Train on everything strictly before the first forecast date.
    # drop_after() removes the split point itself as well, so cutting at the
    # last training date would discard that day and start the forecast one
    # day early relative to the test dates.
    forecast_start = group_test['date'].min()
    train_series = series_scaled.drop_after(forecast_start)
    # Dates inserted by fill_missing_dates carry NaN; treat them as zero.
    train_series = train_series.with_values(np.nan_to_num(train_series.values(), nan=0.0))

    # NOTE(review): the GPU device/platform ids are hard-coded; presumably they
    # match the author's machine - confirm before running elsewhere.
    model = LightGBMModel(lags=14, output_chunk_length=horizon,
                          device="gpu",
                          gpu_device_id=0,
                          gpu_platform_id=1,
        )
    model.fit(train_series)

    forecast = model.predict(horizon)
    forecast_final = scaler.inverse_transform(forecast).map(np.expm1)

    # Store with identifying columns
    df_preds = group_test[['store_nbr', 'family', 'date']].copy()
    # Sales cannot be negative; clip any small negative model output to zero.
    df_preds['predicted_sales'] = np.clip(forecast_final.values().flatten(), 0, None)
    all_preds.append(df_preds)

# Final predictions
predictions_df = pd.concat(all_preds).sort_values(['store_nbr', 'family', 'date'])



In [None]:
# Display the combined per-series predictions.
predictions_df

In [None]:
# Build the submission table: the predictions become the `sales` column, the
# row index is turned back into a column, and rows are ordered by `id`.
res = (
    predictions_df["predicted_sales"]
    .rename("sales")
    .reset_index()
    .sort_values("id")
)
res.to_csv('submission.csv', index=False)
res.head(10)