In [5]:
import pandas as pd
import numpy as np
from pathlib import Path

# Width of every aggregation bin, written as a pandas offset alias. Both build
# functions in this notebook read it (resample for TomTom, floor for vessels).
FREQ = "3min"
# Default folder for the parquet outputs; created here so later writes succeed.
OUT_DIR = Path("outputs_fast")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Raw input files.
# NOTE(review): these are absolute, user-specific Windows paths, so the notebook
# cannot run on another machine without editing them — consider a configurable
# data directory instead.
TOMTOM_CSV = r"C:/Users/elvinli/OneDrive/CodeProjects/TomTom_data_20_24Aug2025.csv"
VESSEL_FILE = r"C:/Users/elvinli/OneDrive/CodeProjects/Vesselposition_data_20_24Aug2025.csv"

# CSV parser handed to the build functions as `engine=` when they are called.
CSV_ENGINE = "c"

In [6]:
def _mean_from_inner_csv(s: str) -> float:
    if not isinstance(s, str) or not s:
        return np.nan
    i = s.find("\n")
    if i == -1:
        return np.nan
    total = 0.0
    n = 0
    for line in s[i+1:].splitlines():
        if not line:
            continue
        try:
            total += float(line.rsplit(",", 1)[-1])
            n += 1
        except Exception:
            continue
    return (total / n) if n else np.nan

def build_tomtom_3min_static(file_path: str,
                             out_path=OUT_DIR / "tomtom_3min.parquet",
                             chunksize: int = 2000,
                             engine: str = "pyarrow"):
    """Reduce the raw TomTom CSV to one mean traffic level per FREQ bin and save it.

    The file is streamed in chunks. For every row the ``time`` column is parsed
    as a UTC timestamp (then made timezone-naive) and the ``data`` column, which
    holds an embedded CSV string, is collapsed to a single number with
    ``_mean_from_inner_csv``. The per-row values are then resampled to ``FREQ``
    and averaged; bins without rows stay in the result with a NaN mean.

    Parameters
    ----------
    file_path : str
        Path of the TomTom CSV; it must contain ``time`` and ``data`` columns.
    out_path : path-like
        Parquet file to write. Its parent directory is created if missing.
    chunksize : int
        Number of CSV rows read per chunk.
    engine : str
        pandas CSV parser. ``"pyarrow"`` cannot read in chunks, so it is
        replaced by ``"c"`` with a warning instead of raising.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp`` and ``traffic_level_mean``, one row per bin.

    Raises
    ------
    ValueError
        If the CSV yields no chunks at all.
    """
    import warnings

    # pandas' pyarrow parser rejects `chunksize` and `low_memory`, so the
    # default engine would fail before reading a single row.
    stream_engine = engine
    if engine == "pyarrow":
        warnings.warn(
            "engine='pyarrow' cannot be combined with chunksize in pandas.read_csv; "
            "reading with the 'c' parser instead.",
            stacklevel=2,
        )
        stream_engine = "c"
    read_opts = {"usecols": ["time", "data"], "chunksize": chunksize, "engine": stream_engine}
    if stream_engine == "c":
        # low_memory is a C-parser-only option; other engines raise on it.
        read_opts["low_memory"] = False

    ts_chunks, mean_chunks = [], []
    # The context manager closes the file handle even if a chunk fails to parse.
    with pd.read_csv(file_path, **read_opts) as reader:
        for chunk in reader:
            t = pd.to_datetime(chunk["time"], utc=True, errors="coerce").dt.tz_convert(None)
            means = [_mean_from_inner_csv(s) for s in chunk["data"].tolist()]
            ts_chunks.append(t.to_numpy())
            mean_chunks.append(np.array(means, dtype="float64"))

    if not ts_chunks:
        raise ValueError("TomTom CSV data not analyzed")

    # Rows whose timestamp could not be parsed (NaT) are dropped before binning.
    df = pd.DataFrame({
        "timestamp": np.concatenate(ts_chunks),
        "traffic_level_mean": np.concatenate(mean_chunks)
    }).dropna(subset=["timestamp"]).sort_values("timestamp")

    df_3min = (df.set_index("timestamp")
                 .resample(FREQ)["traffic_level_mean"]
                 .mean().to_frame().reset_index())

    # A caller-supplied out_path may point into a folder that does not exist yet.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    df_3min.to_parquet(out_path, index=False)
    print(f"[TomTom] Saving_Path:{out_path}  Shape={df_3min.shape}")
    return df_3min



In [7]:
def build_vessel_3min_static(file_path: str,
                             out_path=OUT_DIR / "vessel_3min.parquet",
                             convert_speed_to_mps=True,
                             unique_boats=False,
                             chunksize: int = 2_000_000,
                             engine: str = "pyarrow",
                             sep: str = ","):
    """Aggregate raw vessel-position rows into FREQ-wide bins and save them as parquet.

    The CSV is streamed in chunks. Each row's ``upload-timestamp`` is parsed as
    UTC (then made timezone-naive) and floored to ``FREQ``. Per bin the function
    accumulates a vessel count and the sum and number of valid speeds, so the
    final average is exact even when a bin is split across chunks.

    Parameters
    ----------
    file_path : str
        Path of the vessel CSV; it must contain ``upload-timestamp`` and
        ``speed-in-centimeters-per-second`` columns.
    out_path : path-like
        Parquet file to write. Its parent directory is created if missing.
    convert_speed_to_mps : bool
        When true, speeds are divided by 100 before averaging.
    unique_boats : bool
        When false, ``vessel_count`` is the number of rows per bin. When true,
        it is the number of distinct values of the ``id`` column (or
        ``mmsi-number`` if ``id`` is absent) per bin. The distinct IDs of every
        bin are kept in memory so they are not double-counted across chunks.
    chunksize : int
        Number of CSV rows read per chunk.
    engine : str
        pandas CSV parser. ``"pyarrow"`` cannot read in chunks, so it is
        replaced by ``"c"`` with a warning instead of raising.
    sep : str
        Field separator of the CSV.

    Returns
    -------
    pandas.DataFrame
        Columns ``timestamp``, ``vessel_count`` (int64) and
        ``vessel_avg_speed`` (NaN for bins without a valid speed).

    Raises
    ------
    ValueError
        If a required column is missing, or ``unique_boats`` is true and
        neither ID column exists.
    """
    import warnings

    TCOL = "upload-timestamp"
    SCOL = "speed-in-centimeters-per-second"
    TS_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

    # pandas' pyarrow parser rejects `nrows`, `chunksize` and `low_memory`, so
    # the default engine would fail before reading a single row.
    stream_engine = engine
    if engine == "pyarrow":
        warnings.warn(
            "engine='pyarrow' cannot be combined with nrows/chunksize in pandas.read_csv; "
            "reading with the 'c' parser instead.",
            stacklevel=2,
        )
        stream_engine = "c"
    read_opts = {"engine": stream_engine, "sep": sep, "on_bad_lines": "skip"}
    if stream_engine == "c":
        # low_memory is a C-parser-only option; other engines raise on it.
        read_opts["low_memory"] = False

    # Head check: a small sample is enough to learn the column names.
    head = pd.read_csv(file_path, nrows=50, **read_opts)
    cols = list(head.columns)

    # Match columns
    ICOL = None
    if unique_boats:
        ICOL = next((c for c in ("id", "mmsi-number") if c in cols), None)

    if TCOL not in cols:
        raise ValueError(f"Time column not found '{TCOL}'; existed column example:{cols[:10]}")
    if SCOL not in cols:
        raise ValueError(f"Velocity column not found '{SCOL}'; existed column example:{cols[:10]}")
    if unique_boats and ICOL is None:
        # This message was missing its f-prefix and printed the placeholder verbatim.
        raise ValueError(f"unique_boats=True but no suitable ID column found; existed column example:{cols[:10]}")

    usecols = [TCOL, SCOL] + ([ICOL] if unique_boats else [])

    # Cross-chunk accumulators, all keyed by bin start timestamp.
    count_acc, ids_acc, speed_sum_acc, speed_cnt_acc = {}, {}, {}, {}

    # The context manager closes the file handle even if a chunk fails to parse.
    with pd.read_csv(file_path, usecols=usecols, chunksize=chunksize, **read_opts) as reader:
        for chunk in reader:
            # Fast path with the explicit format; rows that do not match it
            # get a second, format-inferring attempt.
            ts = pd.to_datetime(chunk[TCOL], format=TS_FORMAT, utc=True, errors="coerce")
            miss = ts.isna()
            if miss.any():
                ts.loc[miss] = pd.to_datetime(chunk.loc[miss, TCOL], utc=True, errors="coerce")

            ts = ts.dt.tz_convert(None)
            chunk = chunk.assign(timestamp=ts).dropna(subset=["timestamp"])

            sp = pd.to_numeric(chunk[SCOL], errors="coerce")
            if convert_speed_to_mps:
                sp = sp / 100.0
            chunk = chunk.assign(speed=sp, bin=chunk["timestamp"].dt.floor(FREQ))

            if unique_boats:
                # Keep the distinct IDs themselves: adding per-chunk nunique
                # values would count a boat twice when a bin spans two chunks.
                per_bin_ids = chunk.dropna(subset=[ICOL]).groupby("bin")[ICOL].unique()
                for k, ids in per_bin_ids.items():
                    ids_acc.setdefault(k, set()).update(ids)
            else:
                for k, v in chunk.groupby("bin").size().items():
                    count_acc[k] = count_acc.get(k, 0) + int(v)

            speed_by_bin = chunk.groupby("bin")["speed"]
            # Plain sum() yields 0.0 (not NaN) for an all-NaN bin, so one chunk
            # without valid speeds cannot poison a bin that has them elsewhere.
            for k, v in speed_by_bin.sum().items():
                speed_sum_acc[k] = speed_sum_acc.get(k, 0.0) + float(v)
            for k, v in speed_by_bin.count().items():
                speed_cnt_acc[k] = speed_cnt_acc.get(k, 0) + int(v)

    # Aggregate into a DataFrame
    bins = sorted(set(count_acc) | set(ids_acc) | set(speed_sum_acc) | set(speed_cnt_acc))
    if unique_boats:
        counts = [len(ids_acc.get(b, ())) for b in bins]
    else:
        counts = [count_acc.get(b, 0) for b in bins]

    speed_sum = pd.Series([speed_sum_acc.get(b, 0.0) for b in bins], dtype="float64")
    speed_n = pd.Series([speed_cnt_acc.get(b, 0) for b in bins], dtype="float64")

    # Explicit dtypes keep the schema stable even when no row survived parsing.
    df_3min = pd.DataFrame({
        "timestamp": pd.DatetimeIndex(bins),
        "vessel_count": np.asarray(counts, dtype="int64"),
        # 0.0 / 0 gives NaN, marking bins that had no valid speed.
        "vessel_avg_speed": (speed_sum / speed_n).to_numpy(),
    })

    # A caller-supplied out_path may point into a folder that does not exist yet.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    df_3min.to_parquet(out_path, index=False)
    print(f"[Vessel] Saving_Path={out_path}  Shape={df_3min.shape}")
    return df_3min


In [8]:
# Build both binned tables with the parser chosen in the config cell
# (TomTom first, then vessels), then preview the first rows of each.
tt = build_tomtom_3min_static(TOMTOM_CSV, engine=CSV_ENGINE)
vs = build_vessel_3min_static(VESSEL_FILE, engine=CSV_ENGINE)

display(tt.head(), vs.head())


[TomTom] Saving_Path:outputs_fast\tomtom_3min.parquet  Shape=(2188, 2)
[Vessel] Saving_Path=outputs_fast\vessel_3min.parquet  Shape=(1802, 3)


Unnamed: 0,timestamp,traffic_level_mean
0,2025-08-20 06:36:00,0.825112
1,2025-08-20 06:39:00,0.816597
2,2025-08-20 06:42:00,0.810953
3,2025-08-20 06:45:00,0.800038
4,2025-08-20 06:48:00,0.800142


Unnamed: 0,timestamp,vessel_count,vessel_avg_speed
0,2025-08-20 06:27:00,12,2.408333
1,2025-08-20 06:30:00,30,1.023333
2,2025-08-20 06:33:00,3903,2.84043
3,2025-08-20 06:36:00,5821,3.025631
4,2025-08-20 06:39:00,5900,2.931915


In [None]:
# === Convert parquet outputs to CSV ===
import pandas as pd
from pathlib import Path

# NOTE(review): this rebinds OUT_DIR to a different folder than the one the
# build cells write their parquet files into ("outputs_fast"), and the recorded
# output of this cell still shows the old folder — confirm the parquet files
# really live here before relying on a fresh Run All.
OUT_DIR = Path("data_preprocessing_for_model")

# Parquet inputs and the CSV files derived from them.
tomtom_path = OUT_DIR / "tomtom_3min.parquet"
vessel_path = OUT_DIR / "vessel_3min.parquet"
tt_csv_path = OUT_DIR / "tomtom_3min.csv"
vs_csv_path = OUT_DIR / "vessel_3min.csv"

tt = pd.read_parquet(tomtom_path)
vs = pd.read_parquet(vessel_path)

# Render timestamps as fixed-width text so both CSVs share one format;
# values that fail to parse become missing.
tt = tt.assign(
    timestamp=pd.to_datetime(tt["timestamp"], errors="coerce").dt.strftime("%Y-%m-%d %H:%M:%S")
)
vs = vs.assign(
    timestamp=pd.to_datetime(vs["timestamp"], errors="coerce").dt.strftime("%Y-%m-%d %H:%M:%S")
)

# Export with six decimals for every float column.
tt.to_csv(tt_csv_path, index=False, float_format="%.6f")
vs.to_csv(vs_csv_path, index=False, float_format="%.6f")

print(f"TomTom CSV Saved: {tt_csv_path}, Shape={tt.shape}")
print(f"Vessel CSV Saved: {vs_csv_path}, Shape={vs.shape}")


TomTom CSV Saved: outputs_fast\tomtom_3min.csv, Shape=(2188, 2)
Vessel CSV Saved: outputs_fast\vessel_3min.csv, Shape=(1802, 3)
