In [3]:
import pandas as pd
from prophet import Prophet

# Read the data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
holidays_events = pd.read_csv("data/holidays_events.csv")
oil = pd.read_csv("data/oil.csv")
stores = pd.read_csv("data/stores.csv")



  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [4]:
# Preprocessing
# Convert date columns to datetime
train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])
holidays_events["date"] = pd.to_datetime(holidays_events["date"])
oil["date"] = pd.to_datetime(oil["date"])

# Create a holidays dataframe for Prophet
holidays = holidays_events[["date", "type"]]
holidays.rename(columns={"date": "ds", "type": "holiday"}, inplace=True)

# Merge oil prices into train and test data
train = train.merge(oil, on="date", how="left")
test = test.merge(oil, on="date", how="left")

# Feature Engineering
# Fill missing oil prices with the nearest available values
train["dcoilwtico"].fillna(method="ffill", inplace=True)
test["dcoilwtico"].fillna(method="ffill", inplace=True)

# Create a binary feature for promotions
train["promotion"] = train["onpromotion"].apply(lambda x: 1 if x > 0 else 0)
test["promotion"] = test["onpromotion"].apply(lambda x: 1 if x > 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  holidays.rename(columns={"date": "ds", "type": "holiday"}, inplace=True)
  train["dcoilwtico"].fillna(method="ffill", inplace=True)
  test["dcoilwtico"].fillna(method="ffill", inplace=True)


In [5]:
from tqdm import tqdm
import itertools
import logging

# Train a Prophet model for each store and product family
submission = pd.DataFrame(columns=["id", "sales"])

logging.getLogger("cmdstanpy").setLevel(logging.WARNING)

stores = train["store_nbr"].unique()
families = train["family"].unique()
# stores = [1]
# families = ["AUTOMOTIVE"]
for store, family in tqdm(list(itertools.product(stores, families))):
    # Filter the data for the current store and product family
    train_subset = train[(train["store_nbr"] == store) & (train["family"] == family)]
    test_subset = test[(test["store_nbr"] == store) & (test["family"] == family)]

    # Prepare the data for Prophet
    prophet_data = train_subset[["date", "sales"]].copy()
    prophet_data.rename(columns={"date": "ds", "sales": "y"}, inplace=True)

    # Initialize and fit the model
    model = Prophet(holidays=holidays.copy())
    model.fit(prophet_data)

    # Make predictions
    future = model.make_future_dataframe(
        periods=len(test_subset), include_history=False
    )
    forecast = model.predict(future)

    # Store the predictions
    test_subset = test_subset.copy()
    test_subset["sales"] = forecast["yhat"].values

    cleaned_test_subset = test_subset[["id", "sales"]].dropna(axis=1, how="all")
    submission = pd.concat([submission, cleaned_test_subset])

# Save the submission file
submission.to_csv("submission.csv", index=False)

  0%|          | 0/1782 [00:00<?, ?it/s]15:23:58 - cmdstanpy - INFO - Chain [1] start processing
15:23:58 - cmdstanpy - INFO - Chain [1] done processing
  submission = pd.concat([submission, cleaned_test_subset])
  0%|          | 2/1782 [00:00<08:54,  3.33it/s]15:23:58 - cmdstanpy - INFO - Chain [1] start processing
15:23:58 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 3/1782 [00:00<09:24,  3.15it/s]15:23:59 - cmdstanpy - INFO - Chain [1] start processing
15:23:59 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 4/1782 [00:01<11:07,  2.66it/s]15:23:59 - cmdstanpy - INFO - Chain [1] start processing
15:23:59 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 5/1782 [00:01<11:05,  2.67it/s]15:24:00 - cmdstanpy - INFO - Chain [1] start processing
15:24:00 - cmdstanpy - INFO - Chain [1] done processing
  0%|          | 6/1782 [00:02<11:41,  2.53it/s]15:24:00 - cmdstanpy - INFO - Chain [1] start processing
15:24:00 - cmdstanpy - INFO - Chain [1

KeyboardInterrupt: 