In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

检查分类列并转换为 category 类型（train.csv 中的 `family`；stores.csv 中的 `type`、`city`、`state`、`cluster`）

In [None]:
# Load the data
DATA_DIR = (Path.cwd().parent / "data").resolve()   # "data" folder one level above the current working directory
print("DATA_DIR =", DATA_DIR)  # show the resolved data directory
train_df  = pd.read_csv(DATA_DIR / "train.csv")
stores_df = pd.read_csv(DATA_DIR / "stores.csv")

# Inspect the column names
print("Train columns:", train_df.columns.tolist())
print("Stores columns:", stores_df.columns.tolist())

# Columns that should be stored as category dtype
cat_cols_train = ['family']                # product family
cat_cols_stores = ['type', 'city', 'state', 'cluster']  # store attributes

# `family` in train; columns missing from the frame are skipped silently
for col in cat_cols_train:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype('category')

# Categorical columns in stores; columns missing from the frame are skipped silently
for col in cat_cols_stores:
    if col in stores_df.columns:
        stores_df[col] = stores_df[col].astype('category')

# Verify the dtypes.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
print("\ntrain_df info:")
train_df.info()

print("\nstores_df info:")
stores_df.info()

DATA_DIR = E:\la liste de programe\Py\Store\Store-Sales---Time-Series-Forecasting\data
Train columns: ['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion']
Stores columns: ['store_nbr', 'city', 'state', 'type', 'cluster']

train_df info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           int64   
 1   date         object  
 2   store_nbr    int64   
 3   family       category
 4   sales        float64 
 5   onpromotion  int64   
dtypes: category(1), float64(1), int64(3), object(1)
memory usage: 117.3+ MB
None

stores_df info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   store_nbr  54 non-null     int64   
 1   city       54 non-null     category
 2   state      54 non-null     category
 3   type       54 n

生成交叉特征 `store_type_item_family`（stores.csv 的 `type` × train.csv 的 `family`）

In [15]:
# The store `type` column comes from stores.csv and `family` from train.csv,
# so the two frames are joined on their shared key `store_nbr` (left join:
# every training row is kept).
store_type_lookup = stores_df[['store_nbr', 'type']]
train_merged = pd.merge(train_df, store_type_lookup, how='left', on='store_nbr')

# Cross feature: "<type>_<family>", built from the string form of both columns
type_text = train_merged['type'].astype(str)
family_text = train_merged['family'].astype(str)
train_merged['store_type_item_family'] = type_text + "_" + family_text

# Preview the two source columns next to the new feature
preview_columns = ['type', 'family', 'store_type_item_family']
print(train_merged[preview_columns].head())

  type      family store_type_item_family
0    D  AUTOMOTIVE           D_AUTOMOTIVE
1    D   BABY CARE            D_BABY CARE
2    D      BEAUTY               D_BEAUTY
3    D   BEVERAGES            D_BEVERAGES
4    D       BOOKS                D_BOOKS


标记连续促销天数（基于 `onpromotion` 列）

In [16]:
def mark_promo_streak(group: pd.DataFrame) -> pd.Series:
    """Count consecutive promoted rows within one group.

    Rows are taken in the order they appear in ``group``; the caller is
    responsible for sorting them first. The computation is positional, so it
    also works when ``group.index`` contains duplicate labels.

    Args:
        group: Frame with an ``onpromotion`` column. A row counts as promoted
            when its ``onpromotion`` value is truthy (non-zero).

    Returns:
        An ``int64`` Series on ``group.index``: 0 for non-promoted rows,
        otherwise the 1-based position of the row inside its current run of
        promoted rows.
    """
    promoted = group["onpromotion"].to_numpy().astype(bool)
    if promoted.size == 0:
        return pd.Series(0, index=group.index, dtype="int64")

    # 1-based row positions; for every row, find the position of the most
    # recent non-promoted row (0 if there has been none yet). The streak is
    # the distance to that row, which is 0 on the non-promoted rows themselves.
    position = np.arange(1, promoted.size + 1, dtype=np.int64)
    last_break = np.maximum.accumulate(np.where(promoted, 0, position))
    return pd.Series(position - last_break, index=group.index, dtype="int64")

# === Apply to every store / product-family combination ===
# The streak is counted in row order, so rows are sorted by date inside each
# (store_nbr, family) group first.
train_df = train_df.sort_values(["store_nbr", "family", "date"])
train_df["promo_streak"] = (
    # observed=True is passed explicitly because `family` is categorical;
    # selecting [["onpromotion"]] hands the function only the column it reads
    # instead of the whole frame including the grouping columns.
    train_df.groupby(["store_nbr", "family"], group_keys=False, observed=True)[["onpromotion"]]
            .apply(mark_promo_streak)
)

# === Preview of the result ===
print("\n=== 连续促销天数示例 ===")
print(train_df[["store_nbr", "family", "date", "onpromotion", "promo_streak"]].head(10))

# === Summary statistics ===
print("\n=== 促销统计 ===")
print(f"最长连续促销天数: {train_df['promo_streak'].max()}")
print("\n促销持续天数分布:")
# Count how many rows sit at each streak length (promoted rows only), shortest first
print(train_df.loc[train_df["promo_streak"] > 0, "promo_streak"]
      .value_counts()
      .sort_index()
      .head())

  train_df.groupby(["store_nbr", "family"], group_keys=False)



=== 连续促销天数示例 ===
       store_nbr      family        date  onpromotion  promo_streak
0              1  AUTOMOTIVE  2013-01-01            0             0
1782           1  AUTOMOTIVE  2013-01-02            0             0
3564           1  AUTOMOTIVE  2013-01-03            0             0
5346           1  AUTOMOTIVE  2013-01-04            0             0
7128           1  AUTOMOTIVE  2013-01-05            0             0
8910           1  AUTOMOTIVE  2013-01-06            0             0
10692          1  AUTOMOTIVE  2013-01-07            0             0
12474          1  AUTOMOTIVE  2013-01-08            0             0
14256          1  AUTOMOTIVE  2013-01-09            0             0
16038          1  AUTOMOTIVE  2013-01-10            0             0

=== 促销统计 ===
最长连续促销天数: 363

促销持续天数分布:
promo_streak
1    127936
2     47470
3     29790
4     21968
5     17511
Name: count, dtype: int64


  train_df.groupby(["store_nbr", "family"], group_keys=False)
