In [4]:
import pandas as pd
import numpy as np

In [12]:
# Load the raw event log; the '日期' column is parsed as dates written as YYYY/MM/DD.
event_data = pd.read_csv(
    'data/event/event.csv',
    parse_dates=['日期'],
    date_format='%Y/%m/%d',
)

In [18]:
# Positional rename of the four raw columns: the date column becomes 'Date',
# the other three take the names used by the feature code below.
event_data.columns = ['Date', '描述', '影响', '类别']

# Daily calendar covering the fixed study window, both endpoints included.
min_date, max_date = pd.to_datetime('2023-01-01'), pd.to_datetime('2025-4-21')
full_dates = pd.date_range(min_date, max_date, freq='D')

# Empty frame keyed by the calendar; the feature columns are attached to it.
events = pd.DataFrame(index=full_dates)

# Impact label -> signed score ('正面' -> 1, '负面' -> -1, missing label -> 0).
impact_mapping = {'正面': 1, '负面': -1, np.nan: 0}


def _max_impact_score(labels):
    """Return the largest mapped score among one day's impact labels."""
    return labels.map(impact_mapping).max()


# One score per event date, then stretched over the whole calendar; dates
# without an event (or without a mappable label) end up as 0.
impact_by_event_day = event_data.groupby('Date')['影响'].apply(_max_impact_score)
event_impact = impact_by_event_day.reindex(full_dates).fillna(0)

# 0/1 indicator per event category and calendar day: count the events per
# (date, category), spread onto the calendar, and cap every count at 1.
category_counts = event_data.groupby(['Date', '类别']).size().unstack(fill_value=0)
event_type_dummies = (
    category_counts
    .reindex(full_dates, fill_value=0)
    .clip(upper=1)
    .add_prefix('event_')
)

# Final table: a 'Date' column, the category indicators, and the integer
# impact score (assigned positionally; both pieces share the same calendar).
events_df = (
    pd.concat([events, event_type_dummies], axis=1)
    .reset_index(names='Date')
    .assign(event_impact=event_impact.values.astype(int))
)
events_df.tail()

Unnamed: 0,Date,event_体育赛事,event_国际会议,event_国际体育赛事,event_国际大型演唱会,event_大型演唱会,event_极端天气,event_impact
837,2025-04-17,0,1,0,0,0,0,1
838,2025-04-18,0,0,0,0,0,0,0
839,2025-04-19,0,0,0,0,0,0,0
840,2025-04-20,0,0,0,0,0,0,0
841,2025-04-21,0,0,0,0,0,0,0


In [20]:
# Persist the engineered event feature table; the row index is not written.
events_df.to_csv(
    path_or_buf='data/event/events.csv',
    index=False,
)