In [17]:
import os, re
import numpy as np
import pandas as pd

base_data_path = r"E:\学习工作\PD\pks\SitToStand\Data\STS_2D_skeletons_coarsened"
xls_path       = r"E:\学习工作\PD\pks\SitToStand\Data\STS_human_labels\SitToStand_human_labels.xls"
csv_path       = r"E:\学习工作\PD\pks\SitToStand\Data\STS_human_labels\SitToStand_human_labels.csv"

if not os.path.exists(csv_path):
    df_labels = pd.read_excel(xls_path, engine="xlrd")

    df_labels.columns = df_labels.columns.str.strip()

    df_labels = df_labels.rename(columns={"Transition ID":"sts_id"})
    df_labels.to_csv(csv_path, index=False, encoding="utf-8")
    print(csv_path)
else:

    df_labels = pd.read_csv(csv_path)

    df_labels.columns = df_labels.columns.str.strip()
    df_labels = df_labels.rename(columns={"Transition ID":"sts_id"})
    print("Columns in CSV file：", df_labels.columns.tolist())

print("Number of Samples:", len(df_labels))


✅ 已加载并标准化列名，当前列为： ['sts_id', 'Participant ID number', 'PD_or_C', 'sts_whole_episode_duration', 'sts_final_attempt_duration', 'On_or_Off_medication', 'DBS_state', 'Clinical_assessment', 'STS_additional_features', 'MDS-UPDRS_score_3.9 _arising_from_chair']
样本数量： 403


In [18]:
# --- Load Labels ---
try:
    df_labels = pd.read_csv(csv_path)
    df_labels.rename(columns = {'Transition ID': 'sts_id'},inplace = True)
    print("sts_id", 'sts_id' in df_labels.columns)
    print(f"Successfully loaded labels file: {csv_path}")
    print(f"Labels DataFrame shape: {df_labels.shape}")
    print("\nFirst 5 rows of labels:")
    display(df_labels.head()) # Use display() in notebooks for better formatting
    print("\nLabel columns and data types:")
    print(df_labels.info())

except FileNotFoundError:
    print(f"Error: Labels file not found at {csv_path}")
    df_labels = None # Set df_labels to None if file not found
except Exception as e:
    print(f"An error occurred loading the labels file: {e}")
    df_labels = None # Set df_labels to None on other errors

现在主键列是： True
Successfully loaded labels file: E:\学习工作\PD\pks\SitToStand\Data\STS_human_labels\SitToStand_human_labels.csv
Labels DataFrame shape: (403, 10)

First 5 rows of labels:


Unnamed: 0,sts_id,Participant ID number,PD_or_C,sts_whole_episode_duration,sts_final_attempt_duration,On_or_Off_medication,DBS_state,Clinical_assessment,STS_additional_features,MDS-UPDRS_score_3.9 _arising_from_chair
0,1,596,PD,1.748,1.209,On medication,-,Yes,,0.0
1,2,596,PD,2.085,1.076999,On medication,-,Yes,,0.0
2,3,596,PD,2.61,1.067999,On medication,-,Yes,Uses arms of chair,2.0
3,4,596,PD,2.478999,1.233,On medication,-,Yes,Slow,1.0
4,5,596,PD,1.268,0.801,On medication,-,Yes,,0.0



Label columns and data types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 10 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   sts_id                                   403 non-null    int64  
 1   Participant ID number                    403 non-null    int64  
 2   PD_or_C                                  403 non-null    object 
 3   sts_whole_episode_duration               403 non-null    float64
 4   sts_final_attempt_duration               403 non-null    float64
 5   On_or_Off_medication                     403 non-null    object 
 6   DBS_state                                403 non-null    object 
 7   Clinical_assessment                      403 non-null    object 
 8   STS_additional_features                  223 non-null    object 
 9   MDS-UPDRS_score_3.9 _arising_from_chair  396 non-null    float64
dtypes: float64(3), int6

In [19]:
def load_sts_data(row, base_path=base_data_path):
    """
    Loads STS data for a specific row from the labels DataFrame.

    Args:
        row (pd.Series): A row from the labels DataFrame containing metadata.
        base_path (str): Path to the directory containing STS CSV files.

    Returns:
        pd.DataFrame: DataFrame containing keypoint data, or None if loading fails.
    """
    pid = int(row["Participant ID number"])
    sts = int(row["sts_id"])
    grp = row["PD_or_C"]
    pattern = rf"^Pt{pid}_{grp}_n_{sts}\.csv$"

    for fn in os.listdir(base_path):
        if re.match(pattern, fn):
            full = os.path.join(base_path, fn)

            # print(f"[DEBUG] Loading PID={pid}, sts_id={sts} from {full}")

            df = pd.read_csv(full,
                             header=2,
                             sep=",",  # 如果是 tab 分隔，改为 sep='\t'
                             engine="python",
                             on_bad_lines="skip")


            # print("[DEBUG] First 5 rows of loaded data:")
            # print(df.head())

            df = df.iloc[:, 2:2+50]
            num_keypoints = 25
            expected_cols = [f"{axis}{j}" for j in range(num_keypoints) for axis in ("x", "y")]

            if df.shape[1] < len(expected_cols):
                print(f"[WARN] PID={pid}, sts_id={sts}: only {df.shape[1]} cols, expected {len(expected_cols)}. Padding NaNs.")
                for col in expected_cols[df.shape[1]:]:
                    df[col] = np.nan

            df = df.iloc[:, :len(expected_cols)]
            df.columns = expected_cols

            print(f"[INFO] Loaded PID={pid}, sts_id={sts}. Shape: {df.shape}")
            return df

    print(f"[WARN] STS CSV not found for PID={pid}, sts_id={sts}")
    return None
print("Function 'load_turn_data' defined.")

Function 'load_turn_data' defined.


In [20]:
for sts_id in df_labels['sts_id']:
    row = df_labels[df_labels['sts_id'] == sts_id].iloc[0]
    df_kp = load_sts_data(row)
    if df_kp is None:
        print(f"[WARN] Failed to load data for sts_id={sts_id}")
    else:
        print(f"[INFO] Loaded data for sts_id={sts_id}, Shape: {df_kp.shape}")
        print(df_kp.head())

[INFO] Loaded PID=596, sts_id=1. Shape: (508, 50)
[INFO] Loaded data for sts_id=1, Shape: (508, 50)
    x0   y0   x1   y1   x2   y2   x3   y3   x4   y4  ...  x20  y20  x21  y21  \
0  304  116  306  128  294  128  288  144  302  140  ...  318  218  314  214   
1  304  116  304  128  292  128  286  140  298  138  ...  318  216  314  212   
2  304  116  304  128  292  128  286  140  298  138  ...  318  216  314  212   
3  304  116  304  128  292  128  286  140  298  138  ...  318  216  314  212   
4  304  116  304  128  292  128  286  140  298  138  ...  318  216  314  212   

   x22  y22  x23  y23  x24  y24  
0  298  222  294  218  298  214  
1  298  220  296  218  298  212  
2  298  220  294  218  298  212  
3  298  220  294  218  298  212  
4  298  220  294  218  298  212  

[5 rows x 50 columns]
[INFO] Loaded PID=596, sts_id=2. Shape: (401, 50)
[INFO] Loaded data for sts_id=2, Shape: (401, 50)
    x0   y0   x1   y1   x2   y2   x3   y3   x4   y4  ...  x20  y20  x21  y21  \
0  304  116 

In [21]:
# 测试加载 STS 数据
if df_labels is not None:
    example_sts_id =12  # 测试的 STS_ID
    example_row_df = df_labels[df_labels['sts_id'] == example_sts_id]

    if not example_row_df.empty:
        example_row = example_row_df.iloc[0]
        df_2d_example = load_sts_data(example_row)
        if df_2d_example is not None:
            print(f"Loaded data shape: {df_2d_example.shape}")
            display(df_2d_example.head(10))
        else:
            print("Failed to load data for the given STS ID.")
    else:
        print(f"STS ID {example_sts_id} not found in labels.")
else:
    print("Labels DataFrame not loaded.")

[INFO] Loaded PID=596, sts_id=12. Shape: (508, 50)
Loaded data shape: (508, 50)


Unnamed: 0,x0,y0,x1,y1,x2,y2,x3,y3,x4,y4,...,x20,y20,x21,y21,x22,y22,x23,y23,x24,y24
0,350,184,376,184,360,182,348,198,330,222,...,374,328,388,316,338,308,338,304,362,306
1,350,184,376,184,360,182,348,198,330,222,...,374,328,388,316,338,308,338,304,362,306
2,350,184,376,184,360,182,348,198,330,222,...,374,328,388,316,338,308,338,304,362,306
3,350,184,376,184,360,182,348,198,330,222,...,374,328,388,316,338,308,338,304,362,306
4,350,172,380,172,362,172,350,188,338,212,...,374,328,388,316,338,306,338,304,362,306
5,350,172,380,172,362,172,350,188,338,212,...,374,328,388,316,338,306,338,304,362,306
6,356,160,380,164,364,162,356,178,342,200,...,376,328,388,318,338,306,340,304,364,306
7,356,160,380,164,364,162,356,178,342,200,...,376,328,388,318,338,306,340,304,364,306
8,356,160,380,164,364,162,356,178,342,200,...,376,328,388,318,338,306,340,304,364,306
9,356,160,380,164,364,162,356,178,342,200,...,376,328,388,318,338,306,340,304,364,306


In [22]:
# --- Example Usage: Load first PD 'On' turn ---
if df_labels is not None:
    pd_on_rows = df_labels[(df_labels['PD_or_C'] == 'PD') & (df_labels['On_or_Off_medication'] == 'On medication')]
    if not pd_on_rows.empty:
        first_pd_on_row = pd_on_rows.iloc[0]
        sts_id = int(first_pd_on_row['sts_id'])
        print(f"\nAttempting to load first PD 'On' sts (sts_id: {sts_id})")
        df_pd = load_sts_data(first_pd_on_row, base_path=base_data_path)
        if df_pd is not None:
            print(f"   Loaded PD 'On' 2D Shape: {df_pd.shape}")

      
    else:
        print("No PD 'On' sts found in the labels file.")
else:
    print("Labels DataFrame ('df_labels') not loaded.")


Attempting to load first PD 'On' sts (sts_id: 1)
[INFO] Loaded PID=596, sts_id=1. Shape: (508, 50)
   Loaded PD 'On' 2D Shape: (508, 50)


In [23]:
# --- Example Usage: Load first PD 'On' turn ---
if df_labels is not None:
    pd_off_rows = df_labels[(df_labels['PD_or_C'] == 'PD') & (df_labels['On_or_Off_medication'] == 'Off medication')]
    if not pd_off_rows.empty:
        first_pd_off_row = pd_off_rows.iloc[0]
        sts_id = int(first_pd_off_row['sts_id'])
        print(f"\nAttempting to load first PD 'Off' sts (sts_id: {sts_id})")
        df_pd_off_3d = load_sts_data(first_pd_off_row, base_path=base_data_path)
        if df_pd_off_3d is not None:
             print(f"   Loaded PD 'Off' 2D Shape: {df_pd_off_3d.shape}")
             # Optional: display(df_pd_off_3d.head(2))
    else:
        print("No PD 'Off' sts found in the labels file.")
else:
    print("Labels DataFrame ('df_labels') not loaded.")


Attempting to load first PD 'Off' sts (sts_id: 12)
[INFO] Loaded PID=596, sts_id=12. Shape: (508, 50)
   Loaded PD 'Off' 2D Shape: (508, 50)


In [24]:
# --- Calculate Basic Features ---

# Check if the example DataFrame exists from the previous loading step
if 'df_2d_example' in locals() and df_2d_example is not None:
    print(f"Calculating features for the loaded turn (assuming Turn ID 350)...")

    # --- Configuration ---
    frame_rate = 30.0  # Approximate frame rate from readme.txt
    joint_index_to_analyze = 8 # Example: Joint 8 (often MidHip) - Choose a relevant joint
    x_col = f'x{joint_index_to_analyze}'
    y_col = f'y{joint_index_to_analyze}'
    print(f"Analyzing joint: {joint_index_to_analyze} ({x_col}, {y_col})")

    # --- 1. Duration ---
    num_frames = len(df_2d_example)
    duration_sec = num_frames / frame_rate if frame_rate > 0 else 0

    # --- 2. Path Length ---
    # Get coordinates for the chosen joint
    joint_coords = df_2d_example[[x_col, y_col]]
    # Calculate differences between consecutive frames
    diffs = joint_coords.diff().fillna(0) # Use diff() and fill the first NaN with 0
    # Calculate squared differences
    sq_diffs = diffs**2
    # Calculate distance for each step (sqrt(dx^2 + dy^2))
    step_distances = np.sqrt(sq_diffs[x_col] + sq_diffs[y_col])
    # Sum the distances
    path_length = step_distances.sum()

    # --- 3. Net Displacement ---
    start_pos = joint_coords.iloc[0]
    end_pos = joint_coords.iloc[-1]
    displacement = np.sqrt((end_pos[x_col] - start_pos[x_col])**2 + \
                           (end_pos[y_col] - start_pos[y_col])**2)

    # --- 4. Average Speed ---
    avg_speed = path_length / duration_sec if duration_sec > 0 else 0

    # --- Print Results ---
    print("\n--- Calculated Features ---")
    print(f"Duration: {num_frames} frames")
    print(f"Duration: {duration_sec:.3f} seconds (at {frame_rate} fps)")
    print(f"Total Path Length (Joint {joint_index_to_analyze}): {path_length:.3f} pixels")
    print(f"Net Displacement (Joint {joint_index_to_analyze}): {displacement:.3f} pixels")
    print(f"Average Speed (Joint {joint_index_to_analyze}): {avg_speed:.3f} pixels/second")

else:
    print("Variable 'df_2d_example' not found or is None.")
    print("Please make sure you successfully ran the cell that loads the example turn data.")

Calculating features for the loaded turn (assuming Turn ID 350)...
Analyzing joint: 8 (x8, y8)

--- Calculated Features ---
Duration: 508 frames
Duration: 16.933 seconds (at 30.0 fps)
Total Path Length (Joint 8): 459.612 pixels
Net Displacement (Joint 8): 234.009 pixels
Average Speed (Joint 8): 27.142 pixels/second


In [25]:
def calculate_sts_features(df_kp, frame_rate=30.0, dimension='2D'):
    """
    Calculates various features from a keypoint DataFrame for a turn,
    focusing on Pelvis kinematics and Trunk rotation.

    Args:
        df_kp (pd.DataFrame): DataFrame with keypoint data (rows=frames, cols=coords).
        frame_rate (float): Frame rate of the capture (e.g., 30.0 fps).
        dimension (str): '2D' or '3D'. Currently primarily uses 2D coords.

    Returns:
        dict: A dictionary containing the calculated features, or None if input is invalid.
    """
    if df_kp is None or df_kp.empty:
        print("Warning: Input DataFrame is None or empty.")
        return None
    


    features = {}
    dt = 1.0 / frame_rate if frame_rate > 0 else 0

    # --- Basic Features ---
    features['duration_frames'] = len(df_kp)
    if features['duration_frames'] < 4:  # Need at least 4 frames for Jerk calculation
        print(f"Warning: sts has only {features['duration_frames']} frames. Cannot calculate all derivatives.")
        # Calculate basic features if possible (at least 2 frames needed)
        if features['duration_frames'] >= 2:
            features['duration_sec'] = features['duration_frames'] * dt if dt > 0 else 0
        else:
            return None  # Not enough data even for basic calcs
        # Set others to NaN and return what we have
        nan_features = [
            'pelvis_path_length', 'pelvis_displacement', 'pelvis_avg_speed', 'pelvis_peak_speed',
            'pelvis_avg_accel_mag', 'pelvis_peak_accel_mag', 'pelvis_avg_jerk_mag',
            'r_wrist_path_length', 'r_wrist_avg_speed',
            'l_wrist_path_length', 'l_wrist_avg_speed',
            'r_ankle_path_length', 'r_ankle_avg_speed',
            'l_ankle_path_length','l_ankle_avg_speed',
            'bbox_width', 'bbox_height', 'bbox_area'
        ]
        for fname in nan_features:
            features[fname] = np.nan
        return features

    features['duration_sec'] = features['duration_frames'] * dt

    # --- Features based on Pelvis (Joint 0) ---
    pelvis_joint_index = 0
    if dimension == '2D':
        pelvis_cols = [f'x{pelvis_joint_index}', f'y{pelvis_joint_index}']
    else:
        return None  # Invalid dimension

    pelvis_coords = df_kp[pelvis_cols].copy()

    # Path Length & Displacement
    start_pos = pelvis_coords.iloc[0]
    end_pos = pelvis_coords.iloc[-1]
    diffs = pelvis_coords.diff().fillna(0)  # Differences between consecutive frames
    sq_diffs = diffs**2
    step_distances = np.sqrt(sq_diffs.sum(axis=1))  # Euclidean distance for each step
    features['pelvis_path_length'] = step_distances.sum()
    features['pelvis_displacement'] = np.sqrt(((end_pos - start_pos)**2).sum())

    # Kinematics (Velocity, Acceleration, Jerk for Pelvis)
    if dt > 0:
        # Velocity (pixels/sec or units/sec)
        velocity = diffs / dt
        speed = np.sqrt((velocity**2).sum(axis=1))
        features['pelvis_avg_speed'] = speed.mean()
        features['pelvis_peak_speed'] = speed.max()

        # Acceleration (pixels/sec^2 or units/sec^2)
        accel = velocity.diff().fillna(0) / dt
        accel_mag = np.sqrt((accel**2).sum(axis=1))
        features['pelvis_avg_accel_mag'] = accel_mag.mean()
        features['pelvis_peak_accel_mag'] = accel_mag.max()

        # Jerk (pixels/sec^3 or units/sec^3) - measure of smoothness
        jerk = accel.diff().fillna(0) / dt
        jerk_mag = np.sqrt((jerk**2).sum(axis=1))
        features['pelvis_avg_jerk_mag'] = jerk_mag.mean()
    else:
        # Assign NaN if dt=0
        nan_val = np.nan
        features['pelvis_avg_speed'] = nan_val
        features['pelvis_peak_speed'] = nan_val
        features['pelvis_avg_accel_mag'] = nan_val
        features['pelvis_peak_accel_mag'] = nan_val
        features['pelvis_avg_jerk_mag'] = nan_val

    # --- Features based on Trunk Rotation (Shoulders: 11=L, 14=R) ---
    # --- Features based on Hand and Leg Movement ---
    # Hand Movement (Wrist: 4=RWrist, 7=LWrist)
    def compute_joint_feature(idx, name):
        xcol, ycol = f'x{idx}', f'y{idx}'
        if xcol not in df_kp.columns or ycol not in df_kp.columns:
            print(f"[WARN] 没有列 {xcol}/{ycol}，跳过{name}特征。")
            return np.nan, np.nan

        joint_df = df_kp[[xcol, ycol]].copy()

        orig_count = joint_df.notna().all(axis=1).sum()
        print(f"[DEBUG] {name} 原始完整帧数 {orig_count}/{len(joint_df)}")

        joint_df = joint_df.interpolate(method='linear', limit_direction='both')
        interp_count = joint_df.notna().all(axis=1).sum()
        print(f"[DEBUG] {name} 插值后完整帧数 {interp_count}/{len(joint_df)}")

        diffs = joint_df.diff().fillna(0)
        dists = np.sqrt((diffs**2).sum(axis=1))
        path_len = dists.sum()

        if frame_rate > 0:
            speeds = dists * frame_rate
            avg_speed = speeds.mean()
        else:
            avg_speed = np.nan

        return path_len, avg_speed

    features['r_wrist_path_length'], features['r_wrist_avg_speed'] = compute_joint_feature(4, '右手腕')
    features['l_wrist_path_length'], features['l_wrist_avg_speed'] = compute_joint_feature(7, '左手腕')

    features['r_ankle_path_length'], features['r_ankle_avg_speed'] = compute_joint_feature(10, '右脚踝')
    features['l_ankle_path_length'], features['l_ankle_avg_speed'] = compute_joint_feature(13, '左脚踝')

    # --- Features based on all joints ---
    # Bounding Box Area (using only x and y even for 3D)
    x_cols_all = [col for col in df_kp.columns if col.startswith('x')]
    y_cols_all = [col for col in df_kp.columns if col.startswith('y')]
    if x_cols_all and y_cols_all:
        min_x = df_kp[x_cols_all].min().min()
        max_x = df_kp[x_cols_all].max().max()
        min_y = df_kp[y_cols_all].min().min()
        max_y = df_kp[y_cols_all].max().max()
        features['bbox_width'] = max_x - min_x
        features['bbox_height'] = max_y - min_y
        features['bbox_area'] = features['bbox_width'] * features['bbox_height']
    else:
        features['bbox_width'] = np.nan
        features['bbox_height'] = np.nan
        features['bbox_area'] = np.nan
 
    
    return features


In [26]:
# --- Iterate through labels, load data, calculate UPDATED features ---

# Check if df_labels exists
if df_labels is not None:
    print(f"Processing sts listed in '{csv_path}' using updated features...")

    # Define which dimension's data to load and analyze
    dimension_to_process = '2D' # Or change to '3D'
    frame_rate_to_use = 30.0 # Define frame rate here

    # Define the list of feature names we expect from the UPDATED function
    feature_names = [
        'duration_frames', 'duration_sec',
        'pelvis_path_length', 'pelvis_displacement', 'pelvis_avg_speed', 'pelvis_peak_speed',
        'pelvis_avg_accel_mag', 'pelvis_peak_accel_mag', 'pelvis_avg_jerk_mag',
        'r_wrist_path_length', 'r_wrist_avg_speed',
        'l_wrist_path_length', 'l_wrist_avg_speed',
        'r_ankle_path_length', 'r_ankle_avg_speed',
        'l_ankle_path_length','l_ankle_avg_speed',
        'bbox_width', 'bbox_height', 'bbox_area'
    ]

    # Initialize lists to store features for all turns
    results = []

    # Iterate through each turn in the labels DataFrame
    total_turns = len(df_labels)
    print(f"Starting feature extraction for {total_turns} turns...")
    for index, sts_info_row in df_labels.iterrows():
        sts_id = int(sts_info_row['sts_id'])
        # Print progress
        if (index + 1) % 100 == 0 or index == 0: # Print every 100 turns and the first one
             print(f"Processing turn {index + 1}/{total_turns} (ID: {sts_id})...")

        # Load the keypoint data for the current turn
        df_kp = load_sts_data(sts_info_row, base_path=base_data_path)

        # Calculate features if data was loaded successfully
        feature_dict = None # Initialize feature dictionary for this turn
        if df_kp is not None:
            # Calculate features using the updated function
            feature_dict = calculate_sts_features(df_kp, frame_rate=frame_rate_to_use, dimension=dimension_to_process)

        # Store results - always add Turn ID, features will be NaN if calculation failed
        if feature_dict is None:
             feature_dict = {} # Create empty dict if failed
        feature_dict['sts_id'] = sts_id # Ensure Turn ID is present
        results.append(feature_dict)


    # Convert results list to a DataFrame
    df_features = pd.DataFrame(results)
    # Reorder columns to have Turn ID first, then the expected feature names
    cols_ordered = ['sts_id'] + [f for f in feature_names if f in df_features.columns]
    df_features = df_features[cols_ordered]


    print(f"\nFeature calculation complete for {len(df_features[df_features['duration_frames'].notna()])} turns (out of {total_turns} total).")

    # Merge features back into the original labels DataFrame
    # Make sure Turn ID in df_labels is also integer type for merging
    df_labels['sts_id'] = df_labels['sts_id'].astype(int)
    # Drop old feature columns if they exist from previous runs before merging
    cols_to_drop = [f for f in feature_names if f in df_labels.columns]
    if cols_to_drop:
        df_labels = df_labels.drop(columns=cols_to_drop)
        print(f"Dropped old feature columns: {cols_to_drop}")

    df_labels_with_features = pd.merge(df_labels, df_features, on='sts_id', how='left')

    print("Features merged with labels.")
    print(f"Shape of final DataFrame: {df_labels_with_features.shape}")
    print("\nHead of DataFrame with calculated features:")
    display(df_labels_with_features.head(30))

    # Optional: Check how many turns have NaN features (indicating loading/calculation issues)
    print("\nNumber of turns with missing calculated features:")
    print(df_labels_with_features[feature_names].isnull().sum())
    print(f"Columns in DataFrame: {df_kp.columns.tolist()}")


else:
    print("Labels DataFrame ('df_labels') not loaded. Please run Cell 2 successfully.")

Processing sts listed in 'E:\学习工作\PD\pks\SitToStand\Data\STS_human_labels\SitToStand_human_labels.csv' using updated features...
Starting feature extraction for 403 turns...
Processing turn 1/403 (ID: 1)...
[INFO] Loaded PID=596, sts_id=1. Shape: (508, 50)
[DEBUG] 右手腕 原始完整帧数 508/508
[DEBUG] 右手腕 插值后完整帧数 508/508
[DEBUG] 左手腕 原始完整帧数 508/508
[DEBUG] 左手腕 插值后完整帧数 508/508
[DEBUG] 右脚踝 原始完整帧数 508/508
[DEBUG] 右脚踝 插值后完整帧数 508/508
[DEBUG] 左脚踝 原始完整帧数 508/508
[DEBUG] 左脚踝 插值后完整帧数 508/508
[INFO] Loaded PID=596, sts_id=2. Shape: (401, 50)
[DEBUG] 右手腕 原始完整帧数 401/401
[DEBUG] 右手腕 插值后完整帧数 401/401
[DEBUG] 左手腕 原始完整帧数 401/401
[DEBUG] 左手腕 插值后完整帧数 401/401
[DEBUG] 右脚踝 原始完整帧数 401/401
[DEBUG] 右脚踝 插值后完整帧数 401/401
[DEBUG] 左脚踝 原始完整帧数 401/401
[DEBUG] 左脚踝 插值后完整帧数 401/401
[INFO] Loaded PID=596, sts_id=3. Shape: (384, 50)
[DEBUG] 右手腕 原始完整帧数 384/384
[DEBUG] 右手腕 插值后完整帧数 384/384
[DEBUG] 左手腕 原始完整帧数 384/384
[DEBUG] 左手腕 插值后完整帧数 384/384
[DEBUG] 右脚踝 原始完整帧数 384/384
[DEBUG] 右脚踝 插值后完整帧数 384/384
[DEBUG] 左脚踝 原始完整帧数 384/384
[DEBUG] 左脚踝

Unnamed: 0,sts_id,Participant ID number,PD_or_C,sts_whole_episode_duration,sts_final_attempt_duration,On_or_Off_medication,DBS_state,Clinical_assessment,STS_additional_features,MDS-UPDRS_score_3.9 _arising_from_chair,...,r_wrist_avg_speed,l_wrist_path_length,l_wrist_avg_speed,r_ankle_path_length,r_ankle_avg_speed,l_ankle_path_length,l_ankle_avg_speed,bbox_width,bbox_height,bbox_area
0,1,596,PD,1.748,1.209,On medication,-,Yes,,0.0,...,14.344008,188.719676,11.144863,276.038512,16.301487,240.0,14.173228,40,144,5760
1,2,596,PD,2.085,1.076999,On medication,-,Yes,,0.0,...,84.360219,1793.886967,134.206007,2314.514624,173.155708,1718.126413,128.538136,428,476,203728
2,3,596,PD,2.61,1.067999,On medication,-,Yes,Uses arms of chair,2.0,...,88.558908,975.204182,76.187827,1008.074972,78.755857,998.531943,78.010308,438,476,208488
3,4,596,PD,2.478999,1.233,On medication,-,Yes,Slow,1.0,...,172.754345,2019.377756,142.544312,2363.489479,166.834551,1738.308344,122.704118,424,476,201824
4,5,596,PD,1.268,0.801,On medication,-,Yes,,0.0,...,113.70443,569.18667,33.613386,603.518031,35.640829,604.577644,35.703404,380,464,176320
5,6,596,PD,1.756,0.86,On medication,-,Yes,,0.0,...,71.560875,510.82245,30.16668,494.449271,29.19976,501.644253,29.624661,370,452,167240
6,8,596,PD,1.917,0.792999,On medication,-,No,Uses arms of chair,2.0,...,134.894794,1295.015317,148.284197,954.600767,109.305431,1003.294038,114.880997,426,476,202776
7,12,596,PD,2.293,1.245,Off medication,-,No,Carrying something,0.0,...,152.971968,768.174837,45.364656,561.288279,33.146946,486.809886,28.748615,432,466,201312
8,15,596,PD,4.283999,1.618,Off medication,-,No,>1 attempt\nmoves forward in chair\nCarrying s...,1.0,...,236.70141,1504.122862,88.826153,204.196756,12.058864,1468.668398,86.732386,482,368,177376
9,17,596,PD,3.031,1.599,Off medication,-,No,Slow,1.0,...,179.837615,897.824363,53.021124,3531.38992,208.546649,2327.802539,137.468654,510,476,242760



Number of turns with missing calculated features:
duration_frames          0
duration_sec             0
pelvis_path_length       0
pelvis_displacement      0
pelvis_avg_speed         0
pelvis_peak_speed        0
pelvis_avg_accel_mag     0
pelvis_peak_accel_mag    0
pelvis_avg_jerk_mag      0
r_wrist_path_length      0
r_wrist_avg_speed        0
l_wrist_path_length      0
l_wrist_avg_speed        0
r_ankle_path_length      0
r_ankle_avg_speed        0
l_ankle_path_length      0
l_ankle_avg_speed        0
bbox_width               0
bbox_height              0
bbox_area                0
dtype: int64
Columns in DataFrame: ['x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7', 'x8', 'y8', 'x9', 'y9', 'x10', 'y10', 'x11', 'y11', 'x12', 'y12', 'x13', 'y13', 'x14', 'y14', 'x15', 'y15', 'x16', 'y16', 'x17', 'y17', 'x18', 'y18', 'x19', 'y19', 'x20', 'y20', 'x21', 'y21', 'x22', 'y22', 'x23', 'y23', 'x24', 'y24']


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistical testing

# Set plot style
sns.set(style="whitegrid")

# Check if the final DataFrame exists
if 'df_labels_with_features' in locals() and df_labels_with_features is not None:

    print("--- Comparing Features between PD and Control Groups ---")

    # Select features to compare (choose ones relevant to PD symptoms)
    features_to_compare = [
        'sts_whole_episode_duration',    # 完整站起时间
        'sts_final_attempt_duration',    # 最后一次尝试用时
        'pelvis_avg_speed',              # 关键点特征，示例
        'pelvis_peak_speed',
        'pelvis_avg_jerk_mag',
        'r_wrist_path_length', 'r_wrist_avg_speed',
        'l_wrist_path_length', 'l_wrist_avg_speed',
        'r_ankle_path_length', 'r_ankle_avg_speed',
        'l_ankle_path_length', 'l_ankle_avg_speed',
        'bbox_width',
'bbox_height',
'bbox_area'
    ]

    # Separate data for easier testing
    pd_group = df_labels_with_features[df_labels_with_features['PD_or_C'] == 'PD']
    control_group = df_labels_with_features[df_labels_with_features['PD_or_C'] == 'C']

    print(f"\nNumber of turns - PD: {len(pd_group)}, Control: {len(control_group)}")

    # --- Visualization and Statistical Testing ---
    for feature in features_to_compare:
        if feature not in df_labels_with_features.columns:
            print(f"\nWarning: Feature '{feature}' not found in DataFrame. Skipping.")
            continue

        print(f"\nComparing feature: {feature}")

        # 1. Visualization (Box Plot)
        plt.figure(figsize=(6, 4))
        sns.boxplot(x='PD_or_C', y=feature, data=df_labels_with_features, order=['C', 'PD'])
        plt.title(f'Comparison of {feature} between Control and PD')
        plt.xlabel("Group")
        plt.ylabel(feature)
        plt.show()

        # 2. Statistical Test (Mann-Whitney U test)
        # Drop NaNs for this specific comparison before testing
        group1 = control_group[feature].dropna()
        group2 = pd_group[feature].dropna()

        if len(group1) > 0 and len(group2) > 0:
            try:
                stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
                print(f"  Mann-Whitney U Test:")
                print(f"    U-statistic: {stat:.2f}")
                print(f"    P-value: {p_value:.4f}")
                if p_value < 0.05:
                    print("    Difference is statistically significant (p < 0.05)")
                else:
                    print("    Difference is not statistically significant (p >= 0.05)")
            except ValueError as e:
                # Handle potential issues like identical data causing errors in the test
                 print(f"  Could not perform Mann-Whitney U test for {feature}: {e}")
        else:
            print(f"  Not enough data in one or both groups to perform test for {feature}.")


else:
    print("DataFrame 'df_labels_with_features' not found.")
    print("Please ensure the previous cell successfully created and populated this DataFrame.")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistical testing
import pandas as pd
import numpy as np # Ensure numpy is imported

# Set plot style
sns.set(style="whitegrid")

# Check if the final DataFrame exists
if 'df_labels_with_features' in locals() and df_labels_with_features is not None:

    print("--- Comparing Features between PD 'On' and 'Off' Medication States ---")

    # Filter for PD participants only
    df_pd_only = df_labels_with_features[df_labels_with_features['PD_or_C'] == 'PD'].copy()

    # Check if there are both On and Off medication states present
    med_states_present = df_pd_only['On_or_Off_medication'].unique()
    print(f"Medication states found in PD data: {med_states_present}")

    if 'On medication' in med_states_present and 'Off medication' in med_states_present:

        # Use the same features as before, or select a different subset
        features_to_compare = [
            'duration_sec',
            'pelvis_avg_speed',
            'pelvis_peak_speed',
            'pelvis_avg_jerk_mag',
 'r_wrist_path_length', 'r_wrist_avg_speed',
            'l_wrist_path_length', 'l_wrist_avg_speed',
            'r_ankle_path_length', 'r_ankle_avg_speed',
            'l_ankle_path_length','l_ankle_avg_speed',
        ]

        # Separate data for easier testing
        pd_on = df_pd_only[df_pd_only['On_or_Off_medication'] == 'On medication']
        pd_off = df_pd_only[df_pd_only['On_or_Off_medication'] == 'Off medication']

        print(f"\nNumber of turns - PD On: {len(pd_on)}, PD Off: {len(pd_off)}")

        # --- Visualization and Statistical Testing ---
        for feature in features_to_compare:
            if feature not in df_pd_only.columns:
                print(f"\nWarning: Feature '{feature}' not found in DataFrame. Skipping.")
                continue

            print(f"\nComparing feature: {feature} (On vs Off Meds)")

            # 1. Visualization (Box Plot)
            plt.figure(figsize=(6, 4))
            # Explicitly order the boxes
            order = ['Off medication', 'On medication']
            sns.boxplot(x='On_or_Off_medication', y=feature, data=df_pd_only, order=order)
            plt.title(f'Comparison of {feature} (PD On vs Off Meds)')
            plt.xlabel("Medication State")
            plt.ylabel(feature)
            plt.xticks(rotation=10) # Rotate labels slightly if needed
            plt.show()

            # 2. Statistical Test (Mann-Whitney U test)
            # Drop NaNs for this specific comparison before testing
            group1 = pd_off[feature].dropna()
            group2 = pd_on[feature].dropna()

            if len(group1) > 0 and len(group2) > 0:
                try:
                    stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
                    print(f"  Mann-Whitney U Test:")
                    print(f"    U-statistic: {stat:.2f}")
                    print(f"    P-value: {p_value:.4f}")
                    if p_value < 0.05:
                        print("    Difference is statistically significant (p < 0.05)")
                        # You might infer direction from box plots or group means
                        # print(f"    Mean Off: {group1.mean():.2f}, Mean On: {group2.mean():.2f}")
                    else:
                        print("    Difference is not statistically significant (p >= 0.05)")
                except ValueError as e:
                     print(f"  Could not perform Mann-Whitney U test for {feature}: {e}")
            else:
                print(f"  Not enough data in one or both groups (On/Off) to perform test for {feature}.")

    else:
        print("\nCould not find both 'On medication' and 'Off medication' states in the PD data.")
        print("Cannot perform On vs Off comparison.")

else:
    print("DataFrame 'df_labels_with_features' not found.")
    print("Please ensure the previous cells successfully created and populated this DataFrame.")

In [29]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats # For statistical testing
import pandas as pd
import numpy as np # Ensure numpy is imported

# Set plot style
sns.set(style="whitegrid")

# Check if the final DataFrame exists
if 'df_labels_with_features' in locals() and df_labels_with_features is not None:

    print("--- Comparing Features between Pivot and Step Turns ---")

    # Check if the 'type_of_turn' column exists and clean potential NaNs
    if 'type_of_turn' not in df_labels_with_features.columns:
        print("Error: 'type_of_turn' column not found in DataFrame.")
    else:
        # Filter out rows where turn type is missing (NaN)
        df_typed_turns = df_labels_with_features.dropna(subset=['type_of_turn']).copy()

        # Check if both turn types are present
        turn_types_present = df_typed_turns['type_of_turn'].unique()
        print(f"Turn types found: {turn_types_present}")

        if 'pivot_turn' in turn_types_present and 'step_turn' in turn_types_present:

            # Select features likely affected by turning strategy
            features_to_compare = [
                'duration_sec',
                'number_of_turning_steps', # Expect difference based on definition
                'pelvis_path_length',      # Step turns might cover more ground?
                'pelvis_avg_speed',
                'trunk_total_rotation_deg',# Does strategy affect rotation amount?
                'trunk_avg_ang_vel',       # Does strategy affect rotation speed?
                'pelvis_avg_jerk_mag'      # Is one strategy smoother?
            ]

            # Separate data for easier testing
            pivot_group = df_typed_turns[df_typed_turns['type_of_turn'] == 'pivot_turn']
            step_group = df_typed_turns[df_typed_turns['type_of_turn'] == 'step_turn']

            print(f"\nNumber of turns - Pivot: {len(pivot_group)}, Step: {len(step_group)}")

            # --- Visualization and Statistical Testing ---
            for feature in features_to_compare:
                if feature not in df_typed_turns.columns:
                    print(f"\nWarning: Feature '{feature}' not found in DataFrame. Skipping.")
                    continue

                print(f"\nComparing feature: {feature} (Pivot vs Step Turn)")

                # 1. Visualization (Box Plot)
                plt.figure(figsize=(6, 4))
                order = ['pivot_turn', 'step_turn'] # Define order
                sns.boxplot(x='type_of_turn', y=feature, data=df_typed_turns, order=order)
                plt.title(f'Comparison of {feature} (Pivot vs Step Turn)')
                plt.xlabel("Turn Type")
                plt.ylabel(feature)
                plt.show()

                # 2. Statistical Test (Mann-Whitney U test)
                # Drop NaNs for this specific comparison before testing
                group1 = pivot_group[feature].dropna()
                group2 = step_group[feature].dropna()

                if len(group1) > 0 and len(group2) > 0:
                    try:
                        stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')
                        print(f"  Mann-Whitney U Test:")
                        print(f"    U-statistic: {stat:.2f}")
                        print(f"    P-value: {p_value:.4f}")
                        if p_value < 0.05:
                            print("    Difference is statistically significant (p < 0.05)")
                        else:
                            print("    Difference is not statistically significant (p >= 0.05)")
                    except ValueError as e:
                        print(f"  Could not perform Mann-Whitney U test for {feature}: {e}")
                else:
                    print(f"  Not enough data in one or both groups (Pivot/Step) to perform test for {feature}.")

        else:
            print("\nCould not find both 'pivot_turn' and 'step_turn' values in the 'type_of_turn' column.")
            print("Cannot perform Pivot vs Step comparison.")

else:
    print("DataFrame 'df_labels_with_features' not found.")
    print("Please ensure the previous cells successfully created and populated this DataFrame.")

--- Comparing Features between Pivot and Step Turns ---
Error: 'type_of_turn' column not found in DataFrame.


In [39]:


output_csv_path = r"E:\学习工作\PD\pks\SitToStand\Data\STS_features_with_UPDRS.csv"

if 'df_labels_with_features' in locals() and df_labels_with_features is not None:
    df_labels_with_features.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"特征和标签已保存到：{output_csv_path}")



✅ 特征和标签已保存到：E:\学习工作\PD\pks\SitToStand\Data\STS_features_with_UPDRS.csv


In [2]:
# pd.get_dummies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

df = pd.read_csv(r'E:\学习工作\PD\pks\SitToStand\Data\STS_features_with_UPDRS.csv')

df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace('3.9 _arising', '3.9_arising')

target_col = 'MDS-UPDRS_score_3.9_arising_from_chair'

feature_cols = [col for col in df.columns if col not in ['sts_id', target_col]]
print('features:', feature_cols)

df_clean = df.dropna(subset=feature_cols + [target_col])

y = df_clean[target_col].astype(int)

num_cols = df_clean[feature_cols].select_dtypes(include=['number']).columns.tolist()
cat_cols = df_clean[feature_cols].select_dtypes(include=['object']).columns.tolist()

print('数值特征:', num_cols)
print('类别特征:', cat_cols)

df_cat = pd.get_dummies(df_clean[cat_cols], drop_first=True)

X = pd.concat([df_clean[num_cols].reset_index(drop=True), df_cat.reset_index(drop=True)], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM (RBF kernel)': SVC(kernel='rbf', class_weight='balanced', random_state=42),
    'XGBoost': xgb.XGBClassifier( eval_metric='mlogloss', random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'MLP Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
}

for name, model in models.items():
    print(f"\n--- Training and evaluating: {name} ---")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, zero_division=0))

print("The number of samples for each scoring category in the training set：")
print(y_train.value_counts())

print("The number of samples for each scoring category in the testing set:")
print(y_test.value_counts())
importances = models['Random Forest'].feature_importances_

# importances = models['XGBoost'].feature_importances_
# importances = models['CatBoost'].get_feature_importance()

feature_names = X.columns

indices = importances.argsort()[::-1]

print("Feature importances (RF):")
for i in indices:
    print(f"{feature_names[i]}: {importances[i]:.4f}")

features: ['Participant ID number', 'PD_or_C', 'sts_whole_episode_duration', 'sts_final_attempt_duration', 'On_or_Off_medication', 'DBS_state', 'Clinical_assessment', 'STS_additional_features', 'duration_frames', 'duration_sec', 'pelvis_path_length', 'pelvis_displacement', 'pelvis_avg_speed', 'pelvis_peak_speed', 'pelvis_avg_accel_mag', 'pelvis_peak_accel_mag', 'pelvis_avg_jerk_mag', 'r_wrist_path_length', 'r_wrist_avg_speed', 'l_wrist_path_length', 'l_wrist_avg_speed', 'r_ankle_path_length', 'r_ankle_avg_speed', 'l_ankle_path_length', 'l_ankle_avg_speed', 'bbox_width', 'bbox_height', 'bbox_area']
数值特征: ['Participant ID number', 'sts_whole_episode_duration', 'sts_final_attempt_duration', 'duration_frames', 'duration_sec', 'pelvis_path_length', 'pelvis_displacement', 'pelvis_avg_speed', 'pelvis_peak_speed', 'pelvis_avg_accel_mag', 'pelvis_peak_accel_mag', 'pelvis_avg_jerk_mag', 'r_wrist_path_length', 'r_wrist_avg_speed', 'l_wrist_path_length', 'l_wrist_avg_speed', 'r_ankle_path_length',

### Random Forest 和 XGBoost 模型性能对比

| Model      | Accuracy | Macro Avg F1-Score |
| :------------ | :------- | :----------------- |
| Random Forest | 0.86     | 0.65               |
| XGBoost       | 0.96     | 0.97               |

In [None]:
# CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from catboost import CatBoostClassifier

df = pd.read_csv(r'E:\学习工作\PD\pks\SitToStand\Data\STS_features_with_UPDRS.csv')
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace('3.9 _arising', '3.9_arising')

target_col = 'MDS-UPDRS_score_3.9_arising_from_chair'
feature_cols = [col for col in df.columns if col not in ['sts_id', target_col]]
df_clean = df.dropna(subset=feature_cols + [target_col])

y = df_clean[target_col].astype(int)

num_features = df_clean[feature_cols].select_dtypes(include=['number']).columns.tolist()
cat_features = df_clean[feature_cols].select_dtypes(include=['object']).columns.tolist()

print("数值特征:", num_features)
print("文本特征:", cat_features)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('text', CountVectorizer(), cat_features[0])  
    ])

models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM (RBF kernel)': SVC(kernel='rbf', class_weight='balanced', random_state=42),
    'XGBoost': xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False, random_state=42),
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'MLP Neural Network': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)
}


X = df_clean[num_features + cat_features]
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

for name, model in models.items():
    print(f"\n--- Training and evaluating: {name} ---")
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    print(classification_report(y_test, y_pred, zero_division=0))


数值特征: ['Participant ID number', 'sts_whole_episode_duration', 'sts_final_attempt_duration', 'duration_frames', 'duration_sec', 'pelvis_path_length', 'pelvis_displacement', 'pelvis_avg_speed', 'pelvis_peak_speed', 'pelvis_avg_accel_mag', 'pelvis_peak_accel_mag', 'pelvis_avg_jerk_mag', 'r_wrist_path_length', 'r_wrist_avg_speed', 'l_wrist_path_length', 'l_wrist_avg_speed', 'r_ankle_path_length', 'r_ankle_avg_speed', 'l_ankle_path_length', 'l_ankle_avg_speed', 'bbox_width', 'bbox_height', 'bbox_area']
文本特征: ['PD_or_C', 'On_or_Off_medication', 'DBS_state', 'Clinical_assessment', 'STS_additional_features']

--- Training and evaluating: Random Forest ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.50      0.36      0.42        11
           2       0.76      0.88      0.81        32
           3       0.00      0.00      0.00         2

    accuracy                           0.75        51
   macro avg      

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.86      1.00      0.92         6
           1       0.29      0.18      0.22        11
           2       0.70      0.81      0.75        32
           3       0.00      0.00      0.00         2

    accuracy                           0.67        51
   macro avg       0.46      0.50      0.47        51
weighted avg       0.60      0.67      0.63        51


--- Training and evaluating: CatBoost ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       0.62      0.45      0.53        11
           2       0.78      0.91      0.84        32
           3       0.00      0.00      0.00         2

    accuracy                           0.78        51
   macro avg       0.60      0.59      0.59        51
weighted avg       0.74      0.78      0.76        51


--- Training and evaluating: MLP Neural Network ---
              precision    recall 



### Random Forest 和 XGBoost 模型性能对比

| Model      | Accuracy | Macro Avg F1-Score |
| :------------ | :------- | :----------------- |
| Random Forest | 0.75     | 0.56               |
| XGBoost       | 0.67     | 0.47               |

In [86]:
results_df = pd.DataFrame({
    'True Label': y_test,
    'Predicted Label': y_pred
})

error_indices = results_df[results_df['True Label'] != results_df['Predicted Label']].index

error_samples = df_clean.loc[error_indices].copy()

error_samples['True Label'] = y_test.loc[error_indices]

pos_indices = results_df.index.get_indexer(error_indices)
error_samples['Predicted Label'] = y_pred[pos_indices]

print(f"预测错误样本数：{len(error_samples)}")
print(error_samples)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100000)
pd.set_option('display.expand_frame_repr', False)


预测错误样本数：7
     sts_id  Participant ID number PD_or_C  sts_whole_episode_duration  sts_final_attempt_duration On_or_Off_medication DBS_state Clinical_assessment                                       STS_additional_features  MDS-UPDRS_score_3.9_arising_from_chair  ...  l_wrist_avg_speed  r_ankle_path_length  r_ankle_avg_speed  l_ankle_path_length  l_ankle_avg_speed  bbox_width  bbox_height  bbox_area  True Label  Predicted Label
137   281.0                  396.0      PD                   13.340999                    6.536999        On medication         -                  No  Uses arms of chair\nSlow\n>1 attempt\nMoves forward in chair                                     3.0  ...         113.131757           870.298637          55.082192          1997.339065         126.413865       644.0        452.0   291088.0           3                2
135   279.0                  396.0      PD                   10.428000                    6.172000        On medication         -                  N

In [None]:
# transformer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 1. 读取数据
file_path = r"E:\学习工作\PD\pks\SitToStand\Data\STS_features_with_UPDRS.csv"  # 你的文件路径
df = pd.read_csv(file_path)

# 2. 数据预处理
df.columns = df.columns.str.strip()
df.columns = df.columns.str.replace('3.9 _arising', '3.9_arising')

target_col = 'MDS-UPDRS_score_3.9_arising_from_chair'

feature_cols = [col for col in df.columns if col not in ['sts_id', target_col]]
df_clean = df.dropna(subset=feature_cols + [target_col])

# 取标签，转成整数类别
y = df_clean[target_col].astype(int).values

# 选择数值特征和类别特征（类别特征请先独热编码或者LabelEncoder）
num_cols = df_clean[feature_cols].select_dtypes(include=['number']).columns.tolist()
cat_cols = df_clean[feature_cols].select_dtypes(include=['object']).columns.tolist()

# 简单示例：用独热编码类别特征
df_cat = pd.get_dummies(df_clean[cat_cols], drop_first=True)
X_num = df_clean[num_cols].values

# 标准化数值特征
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)

# 合并特征矩阵
import numpy as np
X = np.hstack([X_num_scaled, df_cat.values])

# 转成Tensor
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# 3. 划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, y_tensor, test_size=0.3, random_state=42, stratify=y)

# 4. Dataset 和 DataLoader
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

# 5. 定义Transformer分类器（你已有代码）
class TransformerClassifier(nn.Module):
    def __init__(self, input_dim, num_classes, num_heads=4, hidden_dim=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # (batch, seq_len=1, hidden_dim)
        x = self.transformer_encoder(x).squeeze(1)  # (batch, hidden_dim)
        logits = self.classifier(x)
        return logits

input_dim = X.shape[1]
num_classes = len(np.unique(y))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerClassifier(input_dim, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 6. 训练循环
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * batch_x.size(0)
        _, preds = torch.max(outputs, 1)
        correct += (preds == batch_y).sum().item()
        total += batch_y.size(0)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/total:.4f}, Train Acc: {correct/total:.4f}")

# 7. 测试评估
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_x, batch_y in test_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        outputs = model(batch_x)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

print("Test Accuracy:", accuracy_score(all_labels, all_preds))
print("Classification Report:\n", classification_report(all_labels, all_preds, zero_division=0))




Epoch 1/100, Loss: 1.1906, Train Acc: 0.5462
Epoch 2/100, Loss: 0.7043, Train Acc: 0.7059
Epoch 3/100, Loss: 0.4653, Train Acc: 0.8067
Epoch 4/100, Loss: 0.3214, Train Acc: 0.8824
Epoch 5/100, Loss: 0.3938, Train Acc: 0.8487
Epoch 6/100, Loss: 0.1306, Train Acc: 0.9580
Epoch 7/100, Loss: 0.0823, Train Acc: 0.9832
Epoch 8/100, Loss: 0.1021, Train Acc: 0.9664
Epoch 9/100, Loss: 0.0333, Train Acc: 1.0000
Epoch 10/100, Loss: 0.0636, Train Acc: 0.9748
Epoch 11/100, Loss: 0.0222, Train Acc: 1.0000
Epoch 12/100, Loss: 0.0229, Train Acc: 1.0000
Epoch 13/100, Loss: 0.0366, Train Acc: 0.9916
Epoch 14/100, Loss: 0.0448, Train Acc: 0.9916
Epoch 15/100, Loss: 0.0106, Train Acc: 1.0000
Epoch 16/100, Loss: 0.0120, Train Acc: 0.9916
Epoch 17/100, Loss: 0.0052, Train Acc: 1.0000
Epoch 18/100, Loss: 0.0043, Train Acc: 1.0000
Epoch 19/100, Loss: 0.0033, Train Acc: 1.0000
Epoch 20/100, Loss: 0.0031, Train Acc: 1.0000
Epoch 21/100, Loss: 0.0027, Train Acc: 1.0000
Epoch 22/100, Loss: 0.0024, Train Acc: 1.00