In [25]:
import os
import json
import re
import math
from typing import List, Tuple, Optional

import numpy as np
import pandas as pd
BASE_DIR = "/Users/naman/sem_1_2025_26/scalable_computing/final_project/mouse_training/web_bot_detection_dataset"

In [15]:
def parse_mouse_json(record: dict) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
    """
    Parse one mouse_movements.json record into:
      times: np.ndarray shape (N,)  - timestamps (ms or similar)
      xs:    np.ndarray shape (N,)  - x positions
      ys:    np.ndarray shape (N,)  - y positions

    Returns (None, None, None) if not enough points.
    """

    # ---- times ----
    times_raw = record.get("mousemove_times", "")
    times: List[int] = []

    if "(" in times_raw or "{" in times_raw:
        # e.g. "{(259860030),(259860045),...}"
        times = [int(t) for t in re.findall(r"\((\d+)\)", times_raw)]
    else:
        # e.g. "259860030,259860045,259860064"
        times = [int(t) for t in times_raw.split(",") if t.strip().isdigit()]

    # ---- coordinates ----
    coords_raw = record.get("mousemove_total_behaviour", "")
    coords = re.findall(r"[\(\[]\s*(\d+)\s*,\s*(\d+)\s*[\)\]]", coords_raw)
    xs = [int(x) for x, _ in coords]
    ys = [int(y) for _, y in coords]

    n = min(len(times), len(xs), len(ys))
    if n < 20:  # too short
        return None, None, None

    times_arr = np.array(times[:n], dtype=np.float64)
    xs_arr    = np.array(xs[:n], dtype=np.float64)
    ys_arr    = np.array(ys[:n], dtype=np.float64)

    return times_arr, xs_arr, ys_arr

In [19]:
test_path = "/Users/naman/sem_1_2025_26/scalable_computing/final_project/mouse_training/web_bot_detection_dataset/phase1/data/mouse_movements/humans_and_advanced_bots/0i5kvpslrq3vb6u8ff2kuejv0v/mouse_movements.json"  # or your own example
with open(test_path, "r") as f:
    rec = json.load(f)

times_test, xs_test, ys_test = parse_mouse_json(rec)
print(times_test.shape, xs_test.shape, ys_test.shape)
print(times_test[:5], xs_test[:5], ys_test[:5])



(8584,) (8584,) (8584,)
[6.25982432e+08 6.25982440e+08 6.25982452e+08 6.25982469e+08
 6.25982485e+08] [ 7. 15. 23. 31. 39.] [ 6. 14. 22. 29. 37.]


In [20]:
def slice_session(
    times: np.ndarray,
    xs: np.ndarray,
    ys: np.ndarray,
    window_seconds: float = 60.0,
    min_points: int = 20,
) -> List[Tuple[np.ndarray, np.ndarray, np.ndarray]]:
    """
    Slice (times, xs, ys) into non-overlapping windows of given length.
    times are assumed in ms (or consistent unit).
    Returns list of (times_rel, xs_slice, ys_slice).
    """
    if len(times) < min_points:
        return []

    window_ms = window_seconds * 1000.0
    slices: List[Tuple[np.ndarray, np.ndarray, np.ndarray]] = []

    start_time = times[0]
    end_time   = times[-1]
    cur_start  = start_time

    while cur_start + window_ms <= end_time:
        cur_end = cur_start + window_ms
        mask = (times >= cur_start) & (times < cur_end)

        if mask.sum() >= min_points:
            t_slice = times[mask] - cur_start  # make relative (start at 0)
            x_slice = xs[mask]
            y_slice = ys[mask]
            slices.append((t_slice, x_slice, y_slice))

        cur_start = cur_end  # move to next window

    return slices

In [21]:
def normalize_window(xs: np.ndarray, ys: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """
    Normalize x, y to [0,1] per window based on max values in that window.
    """
    x_max = xs.max() if xs.size > 0 else 1.0
    y_max = ys.max() if ys.size > 0 else 1.0

    if x_max == 0: x_max = 1.0
    if y_max == 0: y_max = 1.0

    xs_norm = xs / x_max
    ys_norm = ys / y_max
    return xs_norm, ys_norm

In [22]:
def extract_features(times: np.ndarray,
                     xs: np.ndarray,
                     ys: np.ndarray) -> dict:
    """
    Extract features from one window.
    Returns a dict ready to be appended into a pandas DataFrame.
    """

    # times in seconds
    t = times.astype(np.float64) / 1000.0
    if len(t) < 2:
        return {
            "duration": 0.0,
            "path_length": 0.0,
            "displacement": 0.0,
            "straightness": 0.0,
            "jitter_index": 0.0,
            "speed_mean": 0.0, "speed_std": 0.0, "speed_max": 0.0,
            "acc_mean": 0.0,   "acc_std": 0.0,   "acc_max": 0.0,
            "jerk_mean": 0.0,  "jerk_std": 0.0,  "jerk_max": 0.0,
            "dt_mean": 0.0,    "dt_std": 0.0,    "dt_max": 0.0,
            "curvature": 0.0,
            "n_points": float(len(xs)),
        }

    dt = np.diff(t)
    dx = np.diff(xs)
    dy = np.diff(ys)
    dist = np.sqrt(dx * dx + dy * dy)

    dt_safe = np.where(dt == 0, 1e-6, dt)
    speed = dist / dt_safe

    acc = np.diff(speed)
    jerk = np.diff(acc) if len(acc) > 1 else np.array([0.0])

    duration = t[-1] - t[0]
    path_length = dist.sum()
    displacement = math.dist((xs[0], ys[0]), (xs[-1], ys[-1]))
    straightness = displacement / path_length if path_length > 0 else 0.0
    jitter_index = (path_length / displacement) if displacement > 0 else 1.0

    def stats(arr: np.ndarray):
        if arr.size == 0:
            return 0.0, 0.0, 0.0
        return float(arr.mean()), float(arr.std()), float(arr.max())

    speed_mean, speed_std, speed_max = stats(speed)
    acc_mean, acc_std, acc_max       = stats(acc)
    jerk_mean, jerk_std, jerk_max    = stats(jerk)
    dt_mean, dt_std, dt_max          = stats(dt)

    # curvature
    if len(dx) > 1:
        v1 = np.stack([dx[:-1], dy[:-1]], axis=1)
        v2 = np.stack([dx[1:],  dy[1:]],  axis=1)
        dot = np.sum(v1 * v2, axis=1)
        norms = np.linalg.norm(v1, axis=1) * np.linalg.norm(v2, axis=1)
        norms = np.where(norms == 0, 1e-6, norms)
        cos = np.clip(dot / norms, -1.0, 1.0)
        angles = np.arccos(cos)
        curvature = float(angles.mean())
    else:
        curvature = 0.0

    return {
        "duration": float(duration),
        "path_length": float(path_length),
        "displacement": float(displacement),
        "straightness": float(straightness),
        "jitter_index": float(jitter_index),

        "speed_mean": float(speed_mean),
        "speed_std": float(speed_std),
        "speed_max": float(speed_max),

        "acc_mean": float(acc_mean),
        "acc_std": float(acc_std),
        "acc_max": float(acc_max),

        "jerk_mean": float(jerk_mean),
        "jerk_std": float(jerk_std),
        "jerk_max": float(jerk_max),

        "dt_mean": float(dt_mean),
        "dt_std": float(dt_std),
        "dt_max": float(dt_max),

        "curvature": float(curvature),
        "n_points": float(len(xs)),
    }

In [23]:
slices_test = slice_session(times_test, xs_test, ys_test, window_seconds=60, min_points=20)
len(slices_test)

if slices_test:
    t0, x0, y0 = slices_test[0]
    x0n, y0n = normalize_window(x0, y0)
    feat_dict = extract_features(t0, x0n, y0n)
    pd.Series(feat_dict)

In [24]:
def load_phase_subset_to_df(
    mouse_root_dir: str,
    annotation_path: str,
    window_seconds: float = 60.0,
    min_points: int = 20,
    phase_name: str = "",
    subset_name: str = "",
) -> pd.DataFrame:
    """
    mouse_root_dir: folder with per-session subfolders
    annotation_path: TXT with "<session_id> <label>"
    Returns:
      DataFrame with one row per window, columns=features + ['label', 'session_id', 'phase', 'subset']
    """

    # --- load annotations into pandas ---
    annot_df = pd.read_csv(
        annotation_path,
        sep=r"\s+",
        header=None,
        names=["session_id", "raw_label"],
        engine="python",
    )

    # binary label: 1 = human, 0 = bot (moderate or advanced)
    annot_df["label"] = annot_df["raw_label"].str.lower().str.contains("human").astype(int)

    rows = []

    for _, row in annot_df.iterrows():
        session_id = str(row["session_id"])
        label = int(row["label"])

        session_dir = os.path.join(mouse_root_dir, session_id)
        json_path = os.path.join(session_dir, "mouse_movements.json")
        if not os.path.exists(json_path):
            continue

        with open(json_path, "r") as f:
            rec = json.load(f)

        times, xs, ys = parse_mouse_json(rec)
        if times is None:
            continue

        windows = slice_session(
            times, xs, ys,
            window_seconds=window_seconds,
            min_points=min_points,
        )

        for (tw, xw, yw) in windows:
            x_norm, y_norm = normalize_window(xw, yw)
            feat = extract_features(tw, x_norm, y_norm)
            feat["label"] = label
            feat["session_id"] = session_id
            feat["phase"] = phase_name
            feat["subset"] = subset_name
            rows.append(feat)

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows)
    return df

In [26]:
p1_hab_mouse_dir_train = os.path.join(
    BASE_DIR, "phase1", "data", "mouse_movements", "humans_and_advanced_bots"
)

p1_hab_annot_train = os.path.join(
    BASE_DIR, "phase1", "annotations", "humans_and_advanced_bots", "train"
)

print(p1_hab_mouse_dir_train)
print(p1_hab_annot_train)


/Users/naman/sem_1_2025_26/scalable_computing/final_project/mouse_training/web_bot_detection_dataset/phase1/data/mouse_movements/humans_and_advanced_bots
/Users/naman/sem_1_2025_26/scalable_computing/final_project/mouse_training/web_bot_detection_dataset/phase1/annotations/humans_and_advanced_bots/train


In [27]:
p1_hmb_mouse_dir_train = os.path.join(
    BASE_DIR, "phase1", "data", "mouse_movements", "humans_and_moderate_bots"
)

p1_hmb_annot_train = os.path.join(
    BASE_DIR, "phase1", "annotations", "humans_and_moderate_bots", "train"
)

In [28]:
df_p1_hab_train = load_phase_subset_to_df(
    mouse_root_dir=p1_hab_mouse_dir_train,
    annotation_path=p1_hab_annot_train,
    window_seconds=60.0,
    min_points=20,
    phase_name="phase1",
    subset_name="humans_and_advanced_bots_train",
)

df_p1_hmb_train = load_phase_subset_to_df(
    mouse_root_dir=p1_hmb_mouse_dir_train,
    annotation_path=p1_hmb_annot_train,
    window_seconds=60.0,
    min_points=20,
    phase_name="phase1",
    subset_name="humans_and_moderate_bots_train",
)

print(df_p1_hab_train.shape, df_p1_hmb_train.shape)

(354, 23) (241, 23)


In [76]:
pd.set_option("display.max_columns", None)
list(df_p1_hab_train.columns)


['duration',
 'path_length',
 'displacement',
 'straightness',
 'jitter_index',
 'speed_mean',
 'speed_std',
 'speed_max',
 'acc_mean',
 'acc_std',
 'acc_max',
 'jerk_mean',
 'jerk_std',
 'jerk_max',
 'dt_mean',
 'dt_std',
 'dt_max',
 'curvature',
 'n_points',
 'label',
 'session_id',
 'phase',
 'subset']

In [80]:
import numpy as np

def extract_features(times, xs, ys):
    """
    Rich feature extractor, matching the first df's schema:

    Returns columns:
      duration, path_length, displacement, straightness,
      jitter_index,
      speed_mean, speed_std, speed_max,
      acc_mean, acc_std, acc_max,
      jerk_mean, jerk_std, jerk_max,
      dt_mean, dt_std, dt_max,
      curvature,
      n_points
    """
    times = np.asarray(times, dtype=np.float64)
    xs = np.asarray(xs, dtype=np.float64)
    ys = np.asarray(ys, dtype=np.float64)

    n = len(times)
    if n < 2:
        return {
            "duration": 0.0,
            "path_length": 0.0,
            "displacement": 0.0,
            "straightness": 0.0,
            "jitter_index": 0.0,
            "speed_mean": 0.0,
            "speed_std": 0.0,
            "speed_max": 0.0,
            "acc_mean": 0.0,
            "acc_std": 0.0,
            "acc_max": 0.0,
            "jerk_mean": 0.0,
            "jerk_std": 0.0,
            "jerk_max": 0.0,
            "dt_mean": 0.0,
            "dt_std": 0.0,
            "dt_max": 0.0,
            "curvature": 0.0,
            "n_points": float(n),
        }

    # durations (ms)
    duration = float(times[-1] - times[0])

    # dt in seconds
    dt = np.diff(times) / 1000.0
    dt[dt <= 0] = 1e-6

    # displacements between consecutive points
    dx = np.diff(xs)
    dy = np.diff(ys)
    dist = np.sqrt(dx * dx + dy * dy)

    path_length = float(dist.sum())
    displacement = float(np.sqrt((xs[-1] - xs[0])**2 + (ys[-1] - ys[0])**2))
    straightness = displacement / path_length if path_length > 0 else 0.0

    # "jitter" as extra distance vs straight line
    jitter_index = (path_length - displacement) / (displacement + 1e-6) if displacement > 0 else 0.0

    # speed
    speed = dist / dt
    speed_mean = float(speed.mean())
    speed_std = float(speed.std(ddof=0))
    speed_max = float(speed.max())

    # acceleration (based on speed)
    if len(speed) > 1:
        dt_acc = dt[1:]
        dt_acc[dt_acc <= 0] = 1e-6
        accel = np.diff(speed) / dt_acc
        acc_abs = np.abs(accel)
        acc_mean = float(acc_abs.mean())
        acc_std = float(acc_abs.std(ddof=0))
        acc_max = float(acc_abs.max())
    else:
        acc_mean = acc_std = acc_max = 0.0
        accel = np.array([], dtype=np.float64)

    # jerk (change of acceleration)
    if len(accel) > 1:
        dt_jerk = dt[2:] if len(dt) >= 3 else np.array([1.0])
        dt_jerk = np.maximum(dt_jerk, 1e-6)
        jerk = np.diff(accel) / dt_jerk[:len(np.diff(accel))]
        jerk_abs = np.abs(jerk)
        jerk_mean = float(jerk_abs.mean())
        jerk_std = float(jerk_abs.std(ddof=0))
        jerk_max = float(jerk_abs.max())
    else:
        jerk_mean = jerk_std = jerk_max = 0.0

    # curvature: mean absolute change in heading angle per segment
    if len(dx) > 1:
        v1x, v1y = dx[:-1], dy[:-1]
        v2x, v2y = dx[1:], dy[1:]
        dot = v1x * v2x + v1y * v2y
        norm1 = np.sqrt(v1x * v1x + v1y * v1y)
        norm2 = np.sqrt(v2x * v2x + v2y * v2y)
        denom = norm1 * norm2
        denom[denom == 0] = 1e-6
        cos_theta = np.clip(dot / denom, -1.0, 1.0)
        angles = np.arccos(cos_theta)  # radians
        curvature = float(np.mean(np.abs(angles)))
    else:
        curvature = 0.0

    return {
        "duration": duration,
        "path_length": path_length,
        "displacement": displacement,
        "straightness": straightness,
        "jitter_index": jitter_index,
        "speed_mean": speed_mean,
        "speed_std": speed_std,
        "speed_max": speed_max,
        "acc_mean": acc_mean,
        "acc_std": acc_std,
        "acc_max": acc_max,
        "jerk_mean": jerk_mean,
        "jerk_std": jerk_std,
        "jerk_max": jerk_max,
        "dt_mean": float(dt.mean()),
        "dt_std": float(dt.std(ddof=0)),
        "dt_max": float(dt.max()),
        "curvature": curvature,
        "n_points": float(n),
    }

In [81]:
PHASE2_BASE = "/Users/naman/sem_1_2025_26/scalable_computing/final_project/mouse_training/web_bot_detection_dataset/phase2"

ann_phase2_hmab = os.path.join(
    PHASE2_BASE,
    "annotations",
    "humans_and_moderate_and_advanced_bots",
    "humans_and_moderate_and_advanced_bots",
)

humans_file_phase2 = os.path.join(
    PHASE2_BASE,
    "data",
    "mouse_movements",
    "humans",
    "mouse_movements_humans.json",
)

bots_dir_phase2 = os.path.join(
    PHASE2_BASE,
    "data",
    "mouse_movements",
    "bots",
)

df_phase2 = load_phase2_to_df(
    annotation_path=ann_phase2_hmab,
    humans_file=humans_file_phase2,
    bots_dir=bots_dir_phase2,
    window_seconds=60.0,
    min_points=20,
    subset_name="phase2_hmab",
)

print(df_phase2.shape)
print(df_phase2.head())

(1316, 22)
   duration  path_length  displacement  straightness  jitter_index  \
0   59990.0     6.696846      0.525676      0.078496     11.739484   
1   59981.0     3.340945      0.300146      0.089839     10.131018   
2   59940.0     6.037206      0.513523      0.085060     10.756437   
3   59673.0     3.540497      0.516117      0.145775      5.859860   
4   47727.0     7.716309      0.397166      0.051471     18.428364   

   speed_mean    speed_std     speed_max      acc_mean       acc_std  \
0   78.742885  2533.803648  81987.640971  7.845753e+07  2.534987e+09   
1    0.227853     0.430129      5.842658  5.456315e+00  1.678149e+01   
2    0.353461     0.545656      6.022135  6.934909e+00  1.280488e+01   
3    0.310403     0.597915      7.753441  9.525905e+00  3.267751e+01   
4    0.335798     0.588874      4.563968  7.843137e+00  1.785879e+01   

        acc_max     jerk_mean      jerk_std      jerk_max   dt_mean    dt_std  \
0  8.198640e+10  7.853267e+13  2.536200e+15  8.198640e

In [72]:
print("humans_index:", len(index_from_ndjson(humans_file_phase2)))

bot_idx = {}
for f in os.listdir(bots_dir_phase2):
    if f.endswith(".json"):
        bot_idx = index_from_ndjson(os.path.join(bots_dir_phase2, f), bot_idx)
print("bots_index:", len(bot_idx))

print("annotations:", len(pd.read_csv(ann_phase2_hmab, sep=r'\s+', header=None)))

humans_index: 59
bots_index: 240
annotations: 112


In [77]:
# 1) Load annotations as DataFrame
annot_df = pd.read_csv(
    ann_phase2_hmab,
    sep=r"\s+",
    header=None,
    names=["session_id", "raw_label"],
    engine="python",
)
annot_df["label"] = annot_df["raw_label"].str.contains("human", case=False).astype(int)

annot_sids = set(annot_df["session_id"].astype(str))

# 2) Rebuild indices (or reuse if you already have them)
humans_index = index_from_ndjson(humans_file_phase2)
bots_index = {}
for f in os.listdir(bots_dir_phase2):
    if f.endswith(".json"):
        bots_index = index_from_ndjson(os.path.join(bots_dir_phase2, f), bots_index)

human_sids = set(humans_index.keys())
bot_sids   = set(bots_index.keys())

print("annotations total:", len(annot_sids))
print("human_index total:", len(human_sids))
print("bot_index total  :", len(bot_sids))

print("annot ∩ humans:", len(annot_sids & human_sids))
print("annot ∩ bots  :", len(annot_sids & bot_sids))

print("sample annot∩humans:", list(annot_sids & human_sids)[:5])
print("sample annot∩bots  :", list(annot_sids & bot_sids)[:5])

annotations total: 100
human_index total: 59
bot_index total  : 240
annot ∩ humans: 44
annot ∩ bots  : 56
sample annot∩humans: ['p5keafiscisucgq2tut00s0s7l', 'j8e0t5t4nnr92fupp8rh18ihkq', 'rh5fn2njcglhb2avkv30id017u', '36rja5utab0pakj5136bi7rkop', '89vshcu5jgp79ebb68f7jjkqc5']
sample annot∩bots  : ['62fp2u5etekvin0lrlmmq7ras2', '7e7nf7k4fcjagt3nasaihu4scq', '2cod58pj6b57pun9fl0vhiqoia', '27479tfiqfgi99u7r4vq0q33db', '7jtk8r1ak181fhbf582hjhvbdq']


In [83]:
(set(df_p1_hab_train.columns) - set(df_phase2.columns))

{'phase'}

In [84]:
x = df_phase2.columns

In [85]:
df_phase1 = pd.concat([df_p1_hab_train, df_phase2], ignore_index=True)

In [96]:
df_phase1 = df_phase1[x]
df_phase1.drop_duplicates(subset=x)
df_phase1

Unnamed: 0,duration,path_length,displacement,straightness,jitter_index,speed_mean,speed_std,speed_max,acc_mean,acc_std,acc_max,jerk_mean,jerk_std,jerk_max,dt_mean,dt_std,dt_max,curvature,n_points,label,session_id,subset
0,59.997,7.636607,0.689687,0.090313,11.072575,0.297785,0.692877,22.877720,-0.001763,0.868391,22.858958,0.000716,1.497760,23.016624,0.040814,0.192476,4.306,0.250571,1471.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
1,59.632,10.523313,0.133968,0.012731,78.551161,0.273583,0.435429,5.193537,-0.000030,0.219044,2.958642,-0.000089,0.348174,4.803080,0.026017,0.067867,1.417,0.201213,2293.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
2,57.527,8.988508,0.724884,0.080646,12.399924,0.262647,0.369822,4.102115,-0.000072,0.172747,1.614636,0.000038,0.254852,1.832012,0.028706,0.085258,1.800,0.252065,2005.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
3,59.561,10.674577,0.196980,0.018453,54.191255,0.284308,0.374481,4.190250,0.000489,0.198214,2.155542,0.000045,0.310341,3.366794,0.026519,0.092578,2.545,0.218297,2247.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
4,59.996,11.378064,1.028964,0.090434,11.057782,0.304364,0.505306,6.779101,-0.000061,0.318139,5.125661,-0.000029,0.542011,6.780530,0.026952,0.081551,1.903,0.217555,2227.0,1,gq715ms79515gcq39vf91mli6t,humans_and_advanced_bots_train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1665,36820.000,10.689306,0.973135,0.091038,9.984397,0.351051,0.594303,24.569783,6.108003,48.498893,1498.461561,706.657019,5153.560203,181667.642071,0.020142,0.052881,1.402,0.157353,1829.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
1666,54664.000,14.809452,0.448483,0.030284,32.021125,0.363867,0.733649,35.558007,5.660575,64.731595,2334.725982,656.274689,6973.669125,282512.905980,0.022275,0.177278,8.515,0.168213,2455.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
1667,59998.000,11.958783,0.471671,0.039441,24.354003,0.364874,0.227506,5.562442,4.632244,13.209250,348.394095,534.441153,1508.874233,48558.860351,0.030502,0.356713,11.346,0.154374,1968.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
1668,34117.000,10.551241,0.564034,0.053457,17.706728,0.358841,0.479124,19.123591,5.262979,39.526835,1178.866372,598.840564,4231.477963,146070.747282,0.019188,0.051568,1.410,0.149533,1779.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab


In [88]:
df_all = pd.concat([df_phase1, df_phase2], ignore_index=True)

In [89]:
df_all

Unnamed: 0,duration,path_length,displacement,straightness,jitter_index,speed_mean,speed_std,speed_max,acc_mean,acc_std,acc_max,jerk_mean,jerk_std,jerk_max,dt_mean,dt_std,dt_max,curvature,n_points,label,session_id,subset
0,59.997,7.636607,0.689687,0.090313,11.072575,0.297785,0.692877,22.877720,-0.001763,0.868391,22.858958,0.000716,1.497760,23.016624,0.040814,0.192476,4.306,0.250571,1471.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
1,59.632,10.523313,0.133968,0.012731,78.551161,0.273583,0.435429,5.193537,-0.000030,0.219044,2.958642,-0.000089,0.348174,4.803080,0.026017,0.067867,1.417,0.201213,2293.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
2,57.527,8.988508,0.724884,0.080646,12.399924,0.262647,0.369822,4.102115,-0.000072,0.172747,1.614636,0.000038,0.254852,1.832012,0.028706,0.085258,1.800,0.252065,2005.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
3,59.561,10.674577,0.196980,0.018453,54.191255,0.284308,0.374481,4.190250,0.000489,0.198214,2.155542,0.000045,0.310341,3.366794,0.026519,0.092578,2.545,0.218297,2247.0,1,dr09rk5eagjuu87gedvdqmq3gl,humans_and_advanced_bots_train
4,59.996,11.378064,1.028964,0.090434,11.057782,0.304364,0.505306,6.779101,-0.000061,0.318139,5.125661,-0.000029,0.542011,6.780530,0.026952,0.081551,1.903,0.217555,2227.0,1,gq715ms79515gcq39vf91mli6t,humans_and_advanced_bots_train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2981,36820.000,10.689306,0.973135,0.091038,9.984397,0.351051,0.594303,24.569783,6.108003,48.498893,1498.461561,706.657019,5153.560203,181667.642071,0.020142,0.052881,1.402,0.157353,1829.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
2982,54664.000,14.809452,0.448483,0.030284,32.021125,0.363867,0.733649,35.558007,5.660575,64.731595,2334.725982,656.274689,6973.669125,282512.905980,0.022275,0.177278,8.515,0.168213,2455.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
2983,59998.000,11.958783,0.471671,0.039441,24.354003,0.364874,0.227506,5.562442,4.632244,13.209250,348.394095,534.441153,1508.874233,48558.860351,0.030502,0.356713,11.346,0.154374,1968.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab
2984,34117.000,10.551241,0.564034,0.053457,17.706728,0.358841,0.479124,19.123591,5.262979,39.526835,1178.866372,598.840564,4231.477963,146070.747282,0.019188,0.051568,1.410,0.149533,1779.0,0,bdsf9dba41deu2j4744onppg6k,phase2_hmab


In [97]:
import numpy as np

# Columns that are NOT features
non_feature_cols = ["label", "session_id", "phase", "subset"]

feature_cols = [c for c in df_all.columns if c not in non_feature_cols]
print("Number of features:", len(feature_cols))
print("Feature columns:", feature_cols)

X_all = df_all[feature_cols].astype(float).values
y_all = df_all["label"].astype(int).values

print("X_all shape:", X_all.shape)
print("y_all shape:", y_all.shape, " | class distribution:", np.bincount(y_all))

Number of features: 19
Feature columns: ['duration', 'path_length', 'displacement', 'straightness', 'jitter_index', 'speed_mean', 'speed_std', 'speed_max', 'acc_mean', 'acc_std', 'acc_max', 'jerk_mean', 'jerk_std', 'jerk_max', 'dt_mean', 'dt_std', 'dt_max', 'curvature', 'n_points']
X_all shape: (2986, 19)
y_all shape: (2986,)  | class distribution: [ 494 2492]


In [104]:
unique_sids = df_all["session_id"].unique()

from sklearn.model_selection import train_test_split

train_sids, test_sids = train_test_split(
    unique_sids,
    test_size=0.2,
    random_state=42,
    stratify=df_all.groupby("session_id")["label"].first()
)

train_df = df_all[df_all["session_id"].isin(train_sids)]
test_df  = df_all[df_all["session_id"].isin(test_sids)]

X_train = train_df[feature_cols].values
y_train = train_df["label"].values

X_valid = test_df[feature_cols].values
y_valid = test_df["label"].values

In [105]:
from xgBoost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=400,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    eval_metric="logloss",
    n_jobs=-1,
)

xgb_model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [106]:
importances = xgb_model.feature_importances_
for f, imp in sorted(zip(feature_cols, importances), key=lambda x: -x[1])[:15]:
    print(f, imp)

curvature 0.2737692
n_points 0.13553578
acc_mean 0.1024635
acc_std 0.08756704
jerk_std 0.07738114
speed_mean 0.07100623
speed_std 0.06796702
speed_max 0.033086095
jerk_mean 0.02965112
dt_mean 0.01915154
duration 0.019084891
acc_max 0.016181665
straightness 0.016133225
path_length 0.016002985
dt_max 0.011395711


In [107]:
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# Probabilities and hard predictions
y_valid_proba = xgb_model.predict_proba(X_valid)[:, 1]
y_valid_pred  = (y_valid_proba >= 0.5).astype(int)

print("ROC AUC:", roc_auc_score(y_valid, y_valid_proba))
print("Accuracy:", accuracy_score(y_valid, y_valid_pred))
print("F1 score:", f1_score(y_valid, y_valid_pred))

print("\nConfusion matrix:")
print(confusion_matrix(y_valid, y_valid_pred))

print("\nClassification report:")
print(classification_report(y_valid, y_valid_pred, digits=4))

ROC AUC: 1.0
Accuracy: 0.9954022988505747
F1 score: 0.9972677595628415

Confusion matrix:
[[ 68   2]
 [  0 365]]

Classification report:
              precision    recall  f1-score   support

           0     1.0000    0.9714    0.9855        70
           1     0.9946    1.0000    0.9973       365

    accuracy                         0.9954       435
   macro avg     0.9973    0.9857    0.9914       435
weighted avg     0.9954    0.9954    0.9954       435



In [108]:
import joblib

joblib.dump(xgb_model, "mouse_model_xgb.pkl")

['mouse_model_xgb.pkl']

In [109]:
model = joblib.load("mouse_model_xgb.pkl")