In [None]:
import pandas as pd

# Load the CSV (path is relative to the notebook's working directory).
file_path = "SAIL2025_LVMA_data_3min_20August-25August2025_flow.csv"
df = pd.read_csv(file_path)

# Heading followed by the first rows, rendered as plain text.
print("Preview of data:", df.head(), sep="\n")

# Heading (preceded by a blank line) followed by the column labels as a list.
print("\nColumn names:", df.columns.tolist(), sep="\n")


In [None]:
import re
import pandas as pd
from pathlib import Path

# Input file, given relative to the notebook's working directory (the same
# file name the first cell loads). An absolute, user-specific path would only
# work on one machine.
sensor_csv = "SAIL2025_LVMA_data_3min_20August-25August2025_flow.csv"

# All exports of this cell go below OUT_DIR; create it up front if missing.
OUT_DIR = Path("outputs_fast")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Read and convert timestamp to UTC-naive
df = pd.read_csv(sensor_csv, low_memory=False)
if "timestamp" not in df.columns:
    raise ValueError("The 'timestamp' column was not found. Please verify the file.")

# Parse as timezone-aware UTC (unparseable values become NaT because of
# errors="coerce"), then drop the timezone so the column is naive UTC.
ts = pd.to_datetime(df["timestamp"], utc=True, errors="coerce")
df["timestamp"] = ts.dt.tz_convert(None)

# Column names that are never treated as sensor value columns in step 2.
derived_cols = {"hour","minute","day","month","weekday","is_weekend"}


# 2) Extract <ID>_<deg> type columns, match by direction ((deg + 180) % 360)
# Candidate columns: everything except the timestamp and the names in derived_cols.
excluded = {"timestamp"} | derived_cols
value_cols = [name for name in df.columns if name not in excluded]

# A sensor column is "<sensor id>_<integer degrees>"; the id itself may contain
# underscores because the greedy group leaves only the last "_<digits>" for deg.
pat = re.compile(r"^(?P<sid>.+)_(?P<deg>\d+)$")

# (sensor id, degrees) -> original column name, in column order.
# Columns that do not match the pattern are left out.
matched = ((name, pat.match(name)) for name in value_cols)
col_map = {
    (hit.group("sid"), int(hit.group("deg"))): name
    for name, hit in matched
    if hit is not None
}

# 3) Create total flow columns: for the same sensor ID, pair degree and (degree+180)%360 and sum them
# Every direction column of a sensor contributes exactly once to that sensor's
# total. Contributions are accumulated, so a sensor with several opposite pairs
# (or with extra unpaired directions) keeps all of its flow instead of having
# an earlier contribution overwritten or skipped.
seen_pairs = set()   # (sensor id, degrees) keys that were already added to a total
total_series = {}    # sensor id -> Series holding the summed flow of that sensor

def sum_pair(col_a, col_b):
    """Return the row-wise sum of two columns of ``df``.

    Both columns are coerced to numeric (non-numeric entries become NaN).
    A value missing on only one side counts as 0; the result is NaN only
    where both sides are missing.
    """
    a = pd.to_numeric(df[col_a], errors="coerce")
    b = pd.to_numeric(df[col_b], errors="coerce")
    return a.add(b, fill_value=0)

for (sid, deg), cname in col_map.items():
    if (sid, deg) in seen_pairs:
        # Already counted as the opposite half of an earlier pair.
        continue

    opp = (deg + 180) % 360
    cname_opp = col_map.get((sid, opp))

    if cname_opp is None:
        # No opposite direction for this column: it contributes on its own.
        contribution = pd.to_numeric(df[cname], errors="coerce")
    else:
        contribution = sum_pair(cname, cname_opp)
        seen_pairs.add((sid, opp))
    seen_pairs.add((sid, deg))

    if sid in total_series:
        # Same NaN handling as sum_pair: NaN only where every contribution is missing.
        total_series[sid] = total_series[sid].add(contribution, fill_value=0)
    else:
        total_series[sid] = contribution

# 4) Reassemble DataFrame (timestamp + each sensor as one column)
# The timestamp frame has to be its own list element, followed by one Series
# per sensor (renamed to the sensor id, in sorted id order). Adding the list
# of Series to the DataFrame itself would be element-wise arithmetic, not
# concatenation. All pieces share df's row index, so axis=1 places them side
# by side.
wide = pd.concat(
    [df[["timestamp"]]] + [s.rename(sid) for sid, s in sorted(total_series.items())],
    axis=1
)

# 5) Convert to long format: timestamp, sensor_id, human_flow
# 6) Drop rows with missing flow values
# One chain: unpivot the sensor columns, order by sensor then time, and finally
# discard the rows whose flow value is missing.
long = (
    wide
    .melt(id_vars="timestamp", var_name="sensor_id", value_name="human_flow")
    .sort_values(["sensor_id", "timestamp"])
    .dropna(subset=["human_flow"])
)

# 7) Export
# Both tables are written below OUT_DIR, without the index column.
wide_path = OUT_DIR / "sensor_total_wide.csv"
long_path = OUT_DIR / "sensor_total_long.csv"

for frame, target in ((wide, wide_path), (long, long_path)):
    frame.to_csv(target, index=False)

# Report where each file went and how large the table is.
print(f"Saved (wide format) → {wide_path}   shape = {wide.shape}")
print(f"Saved (long format) → {long_path}   shape = {long.shape}")

# Rich preview of the first rows of each table, wide first.
display(wide.head(), long.head())
