Import the libraries

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

检查分类列并转换为 category（如 stores.csv 中的 `type`、train.csv 中的 `family`）

In [None]:
# Load the data.
# DATA_DIR is "<parent of the current working directory>/data", made absolute.
DATA_DIR = (Path.cwd().parent / "data").resolve()   # …/Store-Sales---Time-Series-Forecasting/data
print("DATA_DIR =", DATA_DIR)  # show which directory the CSVs are read from
train_df = pd.read_csv(DATA_DIR / "train.csv")
stores_df = pd.read_csv(DATA_DIR / "stores.csv")

# Show the available column names.
print("Train columns:", train_df.columns.tolist())
print("Stores columns:", stores_df.columns.tolist())

# Columns to convert to the pandas "category" dtype.
cat_cols_train = ['family']                             # item family
cat_cols_stores = ['type', 'city', 'state', 'cluster']  # store attributes

# Convert each requested column in place; names missing from a frame are skipped.
for frame, wanted in ((train_df, cat_cols_train), (stores_df, cat_cols_stores)):
    for col in wanted:
        if col in frame.columns:
            frame[col] = frame[col].astype('category')

# Verify the resulting dtypes.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("\ntrain_df info:")
train_df.info()

print("\nstores_df info:")
stores_df.info()

生成交叉特征 store_type × item_family

In [None]:
# The store type lives in stores.csv, the item family in train.csv.

# Bring the store type onto every training row via the shared store_nbr key.
# A left join keeps every row of train_df, matched or not.
train_merged = pd.merge(
    train_df,
    stores_df.loc[:, ['store_nbr', 'type']],
    how='left',
    on='store_nbr',
)

# Cross feature: "<type>_<family>", built from the string form of both columns.
train_merged['store_type_item_family'] = train_merged['type'].astype(str) + (
    "_" + train_merged['family'].astype(str)
)

# Preview the source columns next to the new feature.
preview_cols = ['type', 'family', 'store_type_item_family']
print(train_merged[preview_cols].head())

标记连续促销天数（基于 `onpromotion` 列）

In [None]:
def mark_promo_streak(group: pd.DataFrame) -> pd.Series:
    """Return the running length of consecutive promotion rows in ``group``.

    Rows are processed in their existing order, so the caller is responsible
    for sorting ``group`` chronologically first. A row counts as "promoted"
    when its ``onpromotion`` value is truthy. The counter restarts at 0 on
    every non-promoted row.

    Args:
        group: Frame containing an ``onpromotion`` column (typically the rows
            of a single (store_nbr, family) combination).

    Returns:
        An integer Series with the same index as ``group``. Each value is the
        number of consecutive promoted rows ending at that row (0 when the
        row itself is not promoted).
    """
    # Work positionally on a NumPy array: this avoids per-row label lookups
    # and stays correct even when the index contains duplicate labels.
    promoted = group["onpromotion"].to_numpy().astype(bool)

    # 1-based position of every row.
    row_number = np.arange(1, len(promoted) + 1, dtype=np.int64)

    # For each row, the position of the most recent non-promoted row
    # (0 if there has been none yet). Promoted rows contribute 0 so the
    # running maximum just carries the last reset position forward.
    last_reset = np.maximum.accumulate(np.where(promoted, 0, row_number))

    # Distance from the last reset == current streak length.
    return pd.Series(row_number - last_reset, index=group.index)

# === Apply to every store/family combination ===
# mark_promo_streak walks each group top to bottom, so rows must be in date
# order inside each (store_nbr, family) group before it is applied.
# NOTE(review): if "date" is still the raw CSV string at this point, this relies
# on the strings sorting chronologically (true for ISO YYYY-MM-DD) — confirm.
train_df = train_df.sort_values(["store_nbr", "family", "date"])
train_df["promo_streak"] = (
    # observed=True: when a grouping key is categorical, only combinations
    # that actually occur are visited (and the implicit-default warning in
    # recent pandas is avoided).
    # [["onpromotion"]]: hand the helper only the column it reads, so apply()
    # no longer operates on the grouping columns (deprecated in pandas 2.2+).
    train_df.groupby(["store_nbr", "family"], group_keys=False, observed=True)[["onpromotion"]]
            .apply(mark_promo_streak)
)

# === Preview the result ===
print("\n=== 连续促销天数示例 ===")
print(train_df[["store_nbr", "family", "date", "onpromotion", "promo_streak"]].head(10))

# === Summary statistics ===
print("\n=== 促销统计 ===")
print(f"最长连续促销天数: {train_df['promo_streak'].max()}")
print("\n促销持续天数分布:")
# Count how many rows sit at each streak length (promoted rows only),
# ordered by streak length, and show the first few lengths.
print(train_df.loc[train_df["promo_streak"] > 0, "promo_streak"]
      .value_counts()
      .sort_index()
      .head())

各商品类别（family）销售量随时间变化折线图

In [None]:
# Parse the date column so the x-axis is a real time axis.
train_df["date"] = pd.to_datetime(train_df["date"])

# Total sales for every (date, family) pair, flattened back into columns.
df_family = train_df.groupby(["date", "family"])["sales"].sum().reset_index()

plt.figure(figsize=(15, 8))

# One line per item family, in the order the families first appear.
for fam in df_family["family"].unique():
    sub = df_family.loc[df_family["family"] == fam]
    plt.plot(sub["date"], sub["sales"], label=fam)

# Axis labels, title, and a legend placed outside the plot on the right.
plt.xlabel("Date")
plt.ylabel("Sales")
plt.title("Sales Trend by Item Family (Line Plot)", fontsize=15)
plt.legend(loc="upper left", bbox_to_anchor=(1.05, 1))
plt.tight_layout()
plt.show()