In [1]:
# preprocess.py
import pandas as pd
from pathlib import Path

# Work out the directory this code is running from. When executed as a script
# (VS Code / terminal) `__file__` is defined and points at the file itself; in
# a Jupyter kernel the name does not exist, so the NameError branch falls back
# to the current working directory.
try:
    SCRIPT_DIR = Path(__file__).resolve().parent
except NameError:
    SCRIPT_DIR = Path.cwd()

# Prefer a `data` folder located one level above SCRIPT_DIR; if that folder
# does not exist, assume `data` sits directly inside SCRIPT_DIR instead.
DATA_DIR = SCRIPT_DIR.parent / 'data'
if not DATA_DIR.exists():
    DATA_DIR = SCRIPT_DIR / 'data'

print("脚本目录：", SCRIPT_DIR)
print("数据目录：", DATA_DIR)

# Load the raw CSV files from the data directory.
stores_df = pd.read_csv(DATA_DIR / 'stores.csv')
train_df = pd.read_csv(DATA_DIR / 'train.csv')

# Columns to hold as pandas `category` dtype, listed per table.
# Trim or extend these lists as needed.
stores_cat_cols = ['type', 'city', 'state', 'cluster']
train_cat_cols = ['family']  # column name commonly seen in the Kaggle data

# Convert only the columns that are actually present in each frame; a missing
# column is reported with a warning instead of raising KeyError.
for col in stores_cat_cols:
    if col not in stores_df.columns:
        print(f"⚠️ stores.csv 未找到列：{col}")
        continue
    stores_df[col] = stores_df[col].astype('category')

for col in train_cat_cols:
    if col not in train_df.columns:
        print(f"⚠️ train.csv 未找到列：{col}")
        continue
    train_df[col] = train_df[col].astype('category')

# Show a summary of both frames. `DataFrame.info()` writes its report to
# stdout itself and returns None, so it is called directly: wrapping it in
# print() would emit a stray "None" line after each report.
print("\n=== stores_df.info() ===")
stores_df.info()
print("\n=== train_df.info() ===")
train_df.info()

# Optional: save the processed tables. Passing the target path to `to_csv`
# streams the rows to disk instead of first building the whole CSV as one
# in-memory string (train.csv has ~3 million rows).
# NOTE: CSV stores plain text, so the `category` dtype is not kept in the
# saved file; the columns must be converted again after re-loading.
# stores_df.to_csv(DATA_DIR / 'stores_processed.csv', index=False, encoding='utf-8')
# train_df.to_csv(DATA_DIR / 'train_processed.csv', index=False, encoding='utf-8')

脚本目录： e:\la liste de programe\Py\Store\Store-Sales---Time-Series-Forecasting\notebooks
数据目录： e:\la liste de programe\Py\Store\Store-Sales---Time-Series-Forecasting\data

=== stores_df.info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   store_nbr  54 non-null     int64   
 1   city       54 non-null     category
 2   state      54 non-null     category
 3   type       54 non-null     category
 4   cluster    54 non-null     category
dtypes: category(4), int64(1)
memory usage: 3.0 KB
None

=== train_df.info() ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype   
---  ------       -----   
 0   id           int64   
 1   date         object  
 2   store_nbr    int64   
 3   family       category
 4   sales        float64 
 5   onpromotion  int64   
dtypes: