In [151]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import warnings
import tqdm
import numpy as np
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/FutureWarnings
# raised by libraries such as pandas — consider narrowing it to specific
# categories so API removals are noticed before they break the notebook.
warnings.filterwarnings("ignore")

In [152]:
# Input locations: the dataset root, the per-log files, and the label table.
BASE_DIR = "dataset/the-attentive-cursor-dataset-master/"
LOGS_DIR = f"{BASE_DIR}logs/"
GROUNDTRUTH_FILE = f"{BASE_DIR}groundtruth.tsv"

# Output locations: merged CSV files.
OUTPUT_DIR = "processed/"
OUTPUT_DIR_SCALED = "processed_scaled/"

# Output locations: generated plots.
VISUALIZATION_DIR = "visualizations/"
VISUALIZATION_DIR_SCALED = "visualizations_scaled/"

In [153]:
# Make sure every output folder exists before anything is written to it;
# exist_ok=True keeps this cell safe to re-run.
for _directory in (OUTPUT_DIR, VISUALIZATION_DIR, VISUALIZATION_DIR_SCALED, OUTPUT_DIR_SCALED):
    os.makedirs(_directory, exist_ok=True)

In [154]:
# Load the tab-separated ground-truth table once; process_log() joins it onto
# each log through its "log_id" column.
groundtruth_df = pd.read_csv(GROUNDTRUTH_FILE, sep="\t")

In [155]:
def parse_xml(xml_file):
    """Read an XML file and return its top-level elements as a dict.

    Every direct child of the document root contributes one entry that maps
    the child's tag to its text content (None for an element without text).
    When a tag occurs more than once, the last occurrence wins.
    """
    document_root = ET.parse(xml_file).getroot()
    return {element.tag: element.text for element in document_root}

In [156]:
def plot_mouse_movement(df, log_id):
    """Plot the cursor path of one log and save it as a PNG.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table providing the columns "event", "xpos" and "ypos".
    log_id : str
        Identifier used in the plot title and in the output file name.

    Returns
    -------
    None
        The figure is written to VISUALIZATION_DIR as
        "<log_id>_mouse_movement.png" and then closed.
    """
    key_events = ["mouseover", "mousedown", "mouseup", "click"]

    # Keep only pointer events; any other event type is not drawn.
    movement_df = df[df["event"].isin(["mousemove"] + key_events)]

    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(movement_df["xpos"], movement_df["ypos"], marker='o', linestyle='-', color='blue', label='Mouse Movement')

        # Highlight key events with one scatter call per event type instead of
        # one artist per row. unique() preserves first-appearance order, so the
        # legend lists the events in the order they occur in the log.
        key_df = movement_df[movement_df["event"].isin(key_events)]
        for event_name in key_df["event"].unique():
            event_rows = key_df[key_df["event"] == event_name]
            ax.scatter(event_rows["xpos"], event_rows["ypos"], color='red', s=100, label=event_name)

        ax.invert_yaxis()  # Invert y-axis to match screen coordinates
        ax.set_xlabel('X Position')
        ax.set_ylabel('Y Position')
        ax.set_title(f'Mouse Movement Tracking - {log_id}')
        ax.legend()
        ax.grid(True)

        # Save figure
        fig.savefig(os.path.join(VISUALIZATION_DIR, f"{log_id}_mouse_movement.png"))
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

In [157]:
from matplotlib.collections import LineCollection
import matplotlib.colors as mcolors


def plot_mouse_movement_scaled(df, log_id, norm_accel, cmap):
    """Save a black-background plot of the cursor path coloured by acceleration.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide "xpos_scaled" and "ypos_scaled"; the axes are fixed to
        the range 0..1 in both directions.
    log_id : str
        Identifier used in the output file name.
    norm_accel : array-like
        Colour value per row of ``df``; all but the last value are used, one
        per line segment.
    cmap : matplotlib colormap
        Colormap applied to the colour values.

    Returns
    -------
    None
        The figure is written to VISUALIZATION_DIR_SCALED as
        "<log_id>_mouse_movement.png" and then closed.
    """
    # Build one line segment per pair of consecutive points.
    points = np.array([df["xpos_scaled"], df["ypos_scaled"]]).T.reshape(-1, 1, 2)
    segments = np.concatenate([points[:-1], points[1:]], axis=1)

    # Work on a plain float array so slicing is positional, and derive the
    # colour range from the non-NaN values only.
    accel_values = np.asarray(norm_accel, dtype=float)
    missing = np.isnan(accel_values)
    valid_values = accel_values[~missing]
    if valid_values.size:
        vmin, vmax = valid_values.min(), valid_values.max()
    else:
        # No usable value at all (empty input or everything NaN).
        vmin, vmax = 0.0, 1.0
    # NaN is drawn with the colormap's "bad" colour, which is transparent by
    # default and would make the trail invisible; use the low end of the scale.
    accel_values = np.where(missing, vmin, accel_values)
    norm = plt.Normalize(vmin, vmax)

    fig, ax = plt.subplots(figsize=(8, 6), facecolor='black')
    try:
        lc = LineCollection(segments, cmap=cmap, norm=norm, linewidth=2)
        # NOTE(review): segment i (point i -> i+1) is coloured with value i;
        # if the values describe the move *into* each point, [1:] may be the
        # intended alignment — confirm before changing.
        lc.set_array(accel_values[:-1])
        ax.add_collection(lc)

        # Remove axis details for cleaner visualization
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.invert_yaxis()
        ax.set_facecolor("black")  # Set black background
        fig.savefig(os.path.join(VISUALIZATION_DIR_SCALED, f"{log_id}_mouse_movement.png"), bbox_inches='tight', pad_inches=0, facecolor='black')
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close(fig)

In [161]:
def process_log(log_id):
    """Load, enrich, merge, save and plot the data of a single log.

    Reads "<log_id>.csv" (whitespace-separated events) and "<log_id>.xml"
    (metadata) from LOGS_DIR, adds window-normalised positions plus speed and
    acceleration columns, joins the metadata and the ground-truth row for the
    log, writes the result to OUTPUT_DIR_SCALED and renders the
    acceleration-coloured path plot.

    Parameters
    ----------
    log_id : str
        Base name of the log files (without extension).

    Returns
    -------
    pandas.DataFrame or None
        The merged frame, or None when either input file is missing.
    """
    csv_path = os.path.join(LOGS_DIR, f"{log_id}.csv")
    xml_path = os.path.join(LOGS_DIR, f"{log_id}.xml")

    if not os.path.exists(csv_path) or not os.path.exists(xml_path):
        print(f"Missing files for log_id {log_id}, skipping...")
        return None

    # Load mouse tracking data. sep=r"\s+" is the documented replacement for
    # the deprecated delim_whitespace=True.
    df = pd.read_csv(csv_path, sep=r"\s+")

    # Parse XML metadata
    metadata = parse_xml(xml_path)
    metadata_df = pd.DataFrame([metadata])
    metadata_df["log_id"] = str(log_id)
    # The "window" entry is parsed as "<width>x<height>".
    window_width, window_height = map(int, metadata["window"].split("x"))

    # Normalize positions by window size
    df["xpos_scaled"] = df["xpos"] / window_width
    df["ypos_scaled"] = df["ypos"] / window_height

    # Compute speed and acceleration. The first row has no predecessor, so its
    # time delta is set to 1 and its speed/acceleration end up as 0.
    df["timestamp_diff"] = df["timestamp"].diff().fillna(1)
    distance = np.sqrt(df["xpos"].diff() ** 2 + df["ypos"].diff() ** 2)
    # Repeated timestamps give a zero time delta (inf or NaN speed); treat
    # those as 0. The result is assigned back instead of using chained
    # inplace=True calls, which do not update df under pandas copy-on-write.
    df["speed"] = (distance / df["timestamp_diff"]).replace([np.inf, -np.inf], np.nan).fillna(0)
    df["acceleration"] = df["speed"].diff().fillna(0)

    # Normalize acceleration to 0..1 for color mapping. When every value is
    # identical (or the log is empty) the range is zero; use 0 everywhere
    # rather than dividing by zero and producing NaN.
    accel_min = df["acceleration"].min()
    accel_range = df["acceleration"].max() - accel_min
    if accel_range > 0:
        norm_accel = (df["acceleration"] - accel_min) / accel_range
    else:
        norm_accel = pd.Series(0.0, index=df.index)

    # Acceleration color ranges, identical for every log:
    #   0.00 blue   - low acceleration
    #   0.25 cyan   - medium-low acceleration
    #   0.50 yellow - medium acceleration
    #   0.75 orange - high acceleration
    #   1.00 red    - very high acceleration
    accel_bins = [0.0, 0.25, 0.5, 0.75, 1.0]
    accel_colors = ["blue", "cyan", "yellow", "orange", "red"]
    cmap = mcolors.LinearSegmentedColormap.from_list("accel_cmap", list(zip(accel_bins, accel_colors)))

    # Merge with tracking data
    df["log_id"] = str(log_id)
    merged_df = df.merge(metadata_df, on="log_id", how="left")

    # Merge with ground truth labels. The key is cast to str on a local copy
    # so the shared groundtruth_df is not modified on every call.
    labels_df = groundtruth_df.assign(log_id=groundtruth_df["log_id"].astype(str))
    merged_df = merged_df.merge(labels_df, on="log_id", how="left")

    # Save merged data
    output_path = os.path.join(OUTPUT_DIR_SCALED, f"{log_id}_processed_scaled.csv")
    merged_df.to_csv(output_path, index=False)

    # Generate visualization
    plot_mouse_movement_scaled(merged_df, log_id, norm_accel, cmap)

    return merged_df


In [163]:
def main():
    """Run process_log() for every CSV file found in LOGS_DIR.

    Log ids are the CSV file names without their extension. They are
    processed in sorted order so repeated runs behave the same regardless of
    the order in which the file system lists the directory.
    """
    # splitext strips only the final ".csv", so names containing additional
    # dots are not truncated.
    log_ids = sorted(
        os.path.splitext(file_name)[0]
        for file_name in os.listdir(LOGS_DIR)
        if file_name.endswith(".csv")
    )

    for log_id in tqdm.tqdm(log_ids):
        process_log(log_id)

    # Report the folders that process_log() actually writes to.
    print(
        f"Data processing complete. Check '{OUTPUT_DIR_SCALED}' for merged files "
        f"and '{VISUALIZATION_DIR_SCALED}' for plots."
    )


In [164]:
# Run the whole pipeline when this notebook/script is executed directly.
if __name__ == "__main__":
    main()

100%|██████████| 2909/2909 [01:35<00:00, 30.34it/s]

Data processing complete. Check the 'processed' folder for merged files and 'visualizations scaled' for plots.



