In [4]:
import pandas as pd

# 1. Load the CSV file
file_path = "SAIL2025_LVMA_data_3min_20August-25August2025_flow.csv"   # ← change this to your CSV file path
df = pd.read_csv(file_path)

# 2. Show the first few rows (optional)
print("Preview of data:", df.head(), sep="\n")

# 3. Show the column names
print("\nColumn names:", list(df.columns), sep="\n")


Preview of data:
                   timestamp  CMSA-GAKH-01_0  CMSA-GAKH-01_180  \
0  2025-08-20 00:00:00+02:00              15                 4   
1  2025-08-20 00:03:00+02:00               1                 3   
2  2025-08-20 00:06:00+02:00               5                 4   
3  2025-08-20 00:09:00+02:00               4                 4   
4  2025-08-20 00:12:00+02:00               4                11   

   CMSA-GAWW-11_120  CMSA-GAWW-11_300  CMSA-GAWW-12_115  CMSA-GAWW-12_295  \
0                29                33                44                28   
1                21                29                34                39   
2                35                22                29                34   
3                40                47                42                40   
4                54                59                58                33   

   CMSA-GAWW-13_120  CMSA-GAWW-13_300  CMSA-GAWW-14_40  ...  GVCV-13_10  \
0                42                37           

In [5]:
import re
import pandas as pd
from pathlib import Path

# Input CSV, given relative to the notebook's working directory (the same file name
# the In [4] cell reads) instead of an absolute path into one user's home folder,
# so the notebook also runs on other machines and checkouts.
sensor_csv = "SAIL2025_LVMA_data_3min_20August-25August2025_flow.csv"
# Directory that receives the exported CSVs; created if missing, reused if present.
OUT_DIR = Path("outputs_fast")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Load the sensor CSV and normalise the timestamps to timezone-naive UTC
#    (i.e. drop the +02:00 offset carried by the raw values).
df = pd.read_csv(sensor_csv, low_memory=False)
if "timestamp" not in set(df.columns):
    raise ValueError("没找到 'timestamp' 列，请确认列名。")

# Parse as timezone-aware UTC so the +02:00 offset is honoured; with
# errors="coerce" anything unparseable becomes NaT instead of raising.
ts = pd.to_datetime(df.loc[:, "timestamp"], errors="coerce", utc=True)
# Strip the timezone (values stay in UTC) so later merges compare naive datetimes.
df["timestamp"] = ts.dt.tz_convert(None)
# Derived calendar columns that may exist in the file; they are excluded from the
# sensor readings (naming a column that is absent is harmless).
derived_cols = {"hour", "minute", "day", "month", "weekday", "is_weekend"}

# 2) Select the "<ID>_<deg>" columns; they are later paired by (deg + 180) % 360.
excluded = derived_cols | {"timestamp"}
value_cols = [name for name in df.columns if name not in excluded]

pat = re.compile(r"^(?P<sid>.+)_(?P<deg>\d+)$")
# Mapping: (sid, deg_int) -> column name.
# Columns whose name does not follow the "<ID>_<deg>" pattern are skipped.
parsed = ((name, pat.match(name)) for name in value_cols)
col_map = {
    (hit.group("sid"), int(hit.group("deg"))): name
    for name, hit in parsed
    if hit
}

# 3) Build one "total people" Series per sensor: within each sid the column for
#    deg is added to the column for (deg + 180) % 360.
seen_pairs = set()   # (sid, deg) keys whose column has already been counted
total_series = {}    # sid -> Series


def sum_pair(col_a, col_b):
    """Return the element-wise sum of two columns of the module-level ``df``.

    Both columns are coerced with ``pd.to_numeric(..., errors="coerce")``, so
    non-numeric entries become NaN. Because of ``fill_value=0`` a value missing
    on only one side counts as 0; a row is NaN only when both sides are missing.
    Use plain ``a + b`` instead if a one-sided gap should produce NaN.

    Args:
        col_a: Name of the first column in ``df``.
        col_b: Name of the second column in ``df``.

    Returns:
        pandas.Series with the per-row sum.
    """
    a = pd.to_numeric(df[col_a], errors="coerce")
    b = pd.to_numeric(df[col_b], errors="coerce")
    return a.add(b, fill_value=0)


def _add_to_total(sid, series):
    """Add ``series`` to the running total of ``sid`` instead of replacing it."""
    previous = total_series.get(sid)
    total_series[sid] = series if previous is None else previous.add(series, fill_value=0)


for (sid, deg), cname in col_map.items():
    if (sid, deg) in seen_pairs:
        continue
    opp = (deg + 180) % 360
    # Never reuse a column that was already counted (only possible when a degree
    # label is >= 360); otherwise it would be added to the total twice.
    cname_opp = None if (sid, opp) in seen_pairs else col_map.get((sid, opp))
    if cname_opp is None:
        # No opposite direction found: the lone column still contributes to the
        # sensor total. Accumulating (rather than setdefault) keeps a second
        # unpaired column of the same sensor from being silently dropped.
        _add_to_total(sid, pd.to_numeric(df[cname], errors="coerce"))
        seen_pairs.add((sid, deg))
        continue
    # Sum the two opposite directions. Accumulating (rather than assigning) keeps
    # an earlier pair of the same sensor from being overwritten.
    _add_to_total(sid, sum_pair(cname, cname_opp))
    seen_pairs.add((sid, deg))
    seen_pairs.add((sid, opp))

# 4) Assemble the wide table: timestamp plus one column per sensor (sorted by id).
sensor_columns = [total_series[sid].rename(sid) for sid in sorted(total_series)]
wide = pd.concat([df[["timestamp"]], *sensor_columns], axis=1)

# 5) Reshape to a long table (handier for modelling): timestamp, sensor_id, human_flow.
long = pd.melt(wide, id_vars="timestamp", var_name="sensor_id", value_name="human_flow")
long = long.sort_values(["sensor_id", "timestamp"])

# 6) Optional: drop rows that have no reading at all.
# long = long.dropna(subset=["human_flow"])

# 7) Export both tables, then report where they went and their shapes.
wide_path = OUT_DIR / "sensor_total_wide.csv"
long_path = OUT_DIR / "sensor_total_long.csv"
exports = [(wide, wide_path), (long, long_path)]
for out_frame, out_path in exports:
    out_frame.to_csv(out_path, index=False)
for out_frame, out_path in exports:
    print(f"保存：{out_path} 形状={out_frame.shape}")

# (Optional) peek at the first rows of each table.
display(wide.head(), long.head())


保存：outputs_fast\sensor_total_wide.csv 形状=(2400, 38)
保存：outputs_fast\sensor_total_long.csv 形状=(88800, 3)


Unnamed: 0,timestamp,CMSA-GAKH-01,CMSA-GAWW-11,CMSA-GAWW-12,CMSA-GAWW-13,CMSA-GAWW-14,CMSA-GAWW-15,CMSA-GAWW-16,CMSA-GAWW-17,CMSA-GAWW-19,...,GVCV-04,GVCV-05-A,GVCV-05-B,GVCV-06,GVCV-07,GVCV-08,GVCV-09,GVCV-11,GVCV-13,GVCV-14
0,2025-08-19 22:00:00,19,62,72,79,14,47,114,47,58,...,0,0,0,0,14,2,0,114,81,0
1,2025-08-19 22:03:00,4,50,73,23,12,44,27,20,18,...,0,0,0,0,0,0,0,11,0,0
2,2025-08-19 22:06:00,9,57,63,75,17,89,101,32,94,...,0,0,0,0,0,0,0,0,0,0
3,2025-08-19 22:09:00,8,87,82,53,17,85,63,35,37,...,0,0,0,0,0,0,0,0,0,0
4,2025-08-19 22:12:00,15,113,91,50,35,52,52,14,36,...,0,0,0,0,0,0,0,0,184,0


Unnamed: 0,timestamp,sensor_id,human_flow
0,2025-08-19 22:00:00,CMSA-GAKH-01,19
1,2025-08-19 22:03:00,CMSA-GAKH-01,4
2,2025-08-19 22:06:00,CMSA-GAKH-01,9
3,2025-08-19 22:09:00,CMSA-GAKH-01,8
4,2025-08-19 22:12:00,CMSA-GAKH-01,15
