# **Imports**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Helper Functions**

## **Delta Time Calculation**

In [None]:
def build_delta_time_files(base_path: str,
                           patients: list[int],
                           src_suffix: str = "_with_new_phonetime",
                           dst_suffix: str = "_with_delta_time",
                           full_hour_ms: int = 3_600_000) -> None:
    """
    Create ``*_with_delta.csv`` files for every patient.

    For each ``.csv`` in ``<base_path>/<patient>/<patient>{src_suffix}/``:
        • delay_correction = time_ms[0] − phonetime_ms[0]  (first data row)
        • Skip the file if delay_correction is NOT an exact multiple of
          ``full_hour_ms``.
        • delta_time  = |time_ms − (phonetime_ms + delay_correction)|
        • max_delta   = delta_time.max()  (same value repeated in every row)
        • Save to ``<base_path>/<patient>/<patient>{dst_suffix}/<orig>_with_delta.csv``

    Files that cannot yield a correction (zero-byte file, header without data
    rows, missing ``time_ms`` / ``phonetime_ms`` column) are reported and
    skipped, so one bad file does not abort the whole batch.

    Parameters
    ----------
    base_path : root folder containing one sub-folder per patient id.
    patients : patient ids to process.
    src_suffix : suffix of the per-patient input folder name.
    dst_suffix : suffix of the per-patient output folder name (created if absent).
    full_hour_ms : the delay correction must be a whole multiple of this value.
    """
    required = {"time_ms", "phonetime_ms"}

    for pid in patients:
        input_dir = os.path.join(base_path, str(pid), f"{pid}{src_suffix}")
        output_dir = os.path.join(base_path, str(pid), f"{pid}{dst_suffix}")

        if not os.path.isdir(input_dir):
            print(f"⚠️  Source folder missing for patient {pid}")
            continue
        os.makedirs(output_dir, exist_ok=True)

        # sorted(): os.listdir order is arbitrary; keep the log reproducible
        for fname in sorted(f for f in os.listdir(input_dir) if f.endswith(".csv")):
            fp_in = os.path.join(input_dir, fname)
            fp_out = os.path.join(output_dir, fname[:-4] + "_with_delta.csv")

            try:
                df = pd.read_csv(fp_in)
            except pd.errors.EmptyDataError:
                # zero-byte file: nothing to parse at all
                print(f"⚠️  Skipped {fp_in} (empty file)")
                continue

            if not required <= set(df.columns):
                print(f"⚠️  Skipped {fp_in} (missing required columns)")
                continue
            if df.empty:
                # header only: there is no first row to derive the correction from
                print(f"⚠️  Skipped {fp_in} (no data rows)")
                continue

            # delay correction from first row
            corr = df["time_ms"].iloc[0] - df["phonetime_ms"].iloc[0]

            # skip if not full-hour multiple (a NaN correction is skipped here too)
            if corr % full_hour_ms != 0:
                print(f"⏭️  Skipped {fp_in}  (delay_correction={corr} not full hour)")
                continue

            # delta_time & max_delta
            df["delta_time"] = (df["time_ms"] - (df["phonetime_ms"] + corr)).abs()
            df["max_delta"] = df["delta_time"].max()

            df.to_csv(fp_out, index=False)
            print(f"✔️  Processed: {fp_in} → {fp_out}  |  correction = {corr}")


## **75th Percentile of Delta Time (for each file)**

In [None]:
def update_q75_per_file(base_path: str, patients: list[int]) -> pd.DataFrame:
    """
    Add / overwrite column ``q75_delta_time`` in every CSV found under
    ``<base_path>/<patient>/<patient>_with_delta_time/``
    (same constant value in all rows = 75-percentile of delta_time).

    The whole CSV is read and written back, so every pre-existing column is
    preserved; only ``q75_delta_time`` is added or replaced.  Files that are
    empty or lack a ``delta_time`` column are skipped and left untouched.

    Parameters
    ----------
    base_path : root folder containing one sub-folder per patient id.
    patients : patient ids to process; patients without the folder are ignored.

    Returns
    -------
    pd.DataFrame
        patient | file | q75   — one row per CSV that was updated.
    """
    rec: list[dict] = []

    for pid in patients:
        folder = os.path.join(base_path, str(pid), f"{pid}_with_delta_time")
        if not os.path.isdir(folder):
            continue

        # sorted(): os.listdir order is arbitrary; keep the result reproducible
        for fn in sorted(f for f in os.listdir(folder) if f.endswith(".csv")):
            fp = os.path.join(folder, fn)
            try:
                # Read ALL columns: the frame is written back to the same path,
                # so anything not loaded here would be lost from the file.
                df = pd.read_csv(fp)
            except pd.errors.EmptyDataError:
                continue
            if df.empty:
                continue
            if "delta_time" not in df.columns:
                print(f"⚠️  Skipped {fp} (no delta_time column)")
                continue

            q75 = df["delta_time"].quantile(0.75)
            df["q75_delta_time"] = q75
            df.to_csv(fp, index=False)

            rec.append({"patient": pid, "file": fn, "q75": float(q75)})

    return pd.DataFrame(rec)

## **75th Percentile of Delta Time (for all files together)**

In [None]:
def global_q75(base_path: str, patients: list[int]) -> float | None:
    """
    Compute 75-percentile across **all** delta_time values
    from every file / every patient.

    Returns ``None`` if no data found.
    """
    vals: list[float] = []

    for pid in patients:
        folder = os.path.join(base_path, str(pid), f"{pid}_with_delta_time")
        if not os.path.exists(folder):
            continue

        for fn in (f for f in os.listdir(folder) if f.endswith(".csv")):
            fp = os.path.join(folder, fn)
            try:
                vals.extend(pd.read_csv(fp, usecols=["delta_time"])
                            ["delta_time"].dropna().tolist())
            except Exception:
                continue

    return float(np.percentile(vals, 75)) if vals else None

## **IQR-Based Outlier Detection on Per-File q75 (per sensor)**

In [None]:
def sensor_pipeline(sensor_name: str,
                    base_path: str,
                    patient_ids: list[int]) -> None:
    """
    Run and print the delta_time statistics report for a single sensor.

    Steps:
        1. Write q75 into every file via ``update_q75_per_file`` and list the
           per-file values in ascending order.
        2. Print the global 75-percentile from ``global_q75`` (when available).
        3. Take Q1 / Q3 of the per-file q75 list, derive IQR = Q3 - Q1 and the
           upper fence = Q3 + 1.5·IQR, then list every file whose q75 lies
           above that fence.

    Nothing is returned; all results go to stdout.
    """

    def _show(rows: pd.DataFrame, q_fmt: str, indent: str) -> None:
        # One line per file: q75 | file name padded to 35 chars | patient id
        for q, name, pid in zip(rows["q75"], rows["file"], rows["patient"]):
            print(f"{indent}{format(q, q_fmt)} ms | {name:<35} | P{pid}")

    print(f"\n===== {sensor_name.upper()} SENSOR =====")

    # Step 1 — per-file q75, ascending
    per_file = update_q75_per_file(base_path, patient_ids)
    if per_file.empty:
        print("⚠️  No delta_time data found.")
        return

    ranked = per_file.sort_values("q75").reset_index(drop=True)
    print("\nq75 per file (low → high):")
    _show(ranked, "7.2f", "")

    # Step 2 — pooled 75-percentile over every delta_time value
    pooled = global_q75(base_path, patient_ids)
    if pooled is not None:
        print(f"\nGLOBAL 75-percentile of all delta_time = {pooled:.2f} ms")

    # Step 3 — quartiles of the q75 list and the resulting upper fence
    q75_values = ranked["q75"]
    first_quartile = q75_values.quantile(0.25)
    third_quartile = q75_values.quantile(0.75)
    spread = third_quartile - first_quartile
    fence = third_quartile + 1.5 * spread

    print("\n── Stats on q75 list ──")
    print(f"Q1  (25-th pct) : {first_quartile:.2f} ms")
    print(f"Q3  (75-th pct) : {third_quartile:.2f} ms")
    print(f"IQR (Q3-Q1)     : {spread:.2f} ms")
    print(f"Upper fence     : {fence:.2f} ms")

    # Report the files sitting above the fence, if any
    flagged = ranked[q75_values > fence]
    if flagged.empty:
        print("\nNo file-level q75 exceeds the upper-fence.")
        return

    print("\n⚠️  Files whose q75 is ABOVE the upper-fence:")
    _show(flagged, ".2f", "  ")


In [None]:
# Per-sensor data roots on the mounted drive, each paired with the list of
# patient ids to process for that sensor.
BASE_PRESSURE = (
    "/content/drive/MyDrive/Master's degree/Insoles Project/"
    "ניתוח שגיאות זמן בהעברת נתונים/Pressure"
)
PRESSURE_PTS = [
    2, 3, 5, 8, 9, 11,
    12, 14, 16, 17, 18,
]

BASE_IMU = (
    "/content/drive/MyDrive/Master's degree/Insoles Project/"
    "ניתוח שגיאות זמן בהעברת נתונים/IMU"
)
# Same ids as PRESSURE_PTS plus patient 15.
IMU_PTS = [
    2, 3, 5, 8, 9, 11,
    12, 14, 15, 16, 17, 18,
]

# **MAIN**

In [None]:
def main() -> None:
    """
    Drive the whole analysis for both sensors.

    First the delta_time CSVs are built for pressure and then IMU; afterwards
    the statistics pipeline runs for pressure and then IMU.
    """
    # (label, data root, patient ids) — processed in this order in both phases
    sensors = (
        ("pressure", BASE_PRESSURE, PRESSURE_PTS),
        ("imu", BASE_IMU, IMU_PTS),
    )

    # Phase 1: build delta_time CSVs for every sensor
    print("=== BUILDING delta_time FILES ===")
    for _label, root, patient_ids in sensors:
        build_delta_time_files(root, patient_ids)

    # Phase 2: per-sensor statistics report
    for label, root, patient_ids in sensors:
        sensor_pipeline(label, root, patient_ids)


# ── entry-point ────────────────────────────────────────────────
if __name__ == "__main__":
    main()

=== BUILDING delta_time FILES ===
✔️  Processed: /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Pressure/2/2_with_new_phonetime/002LeftPRE_T1_unknown_clean.csv → /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Pressure/2/2_with_delta_time/002LeftPRE_T1_unknown_clean_with_delta.csv  |  correction = 10800000.0
✔️  Processed: /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Pressure/2/2_with_new_phonetime/002RightPRE_T1_unknown_clean.csv → /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Pressure/2/2_with_delta_time/002RightPRE_T1_unknown_clean_with_delta.csv  |  correction = 10800000.0
✔️  Processed: /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Pressure/3/3_with_new_phonetime/003LeftPRE_T0_unknown_clean.csv → /content/drive/MyDrive/Master's degree/Insoles Project/ניתוח שגיאות זמן בהעברת נתונים/Press