In [1]:
"""
Creates database from converted MediaPipe output files (should now be in .txt format with _mp33 suffix)
for use with RNN for Human Activity Recognition - 2D Pose Input (now 33*2D = 66 features)

Adapted from original OpenPose script by Stuart Eiffert 13/12/2017
Modifications for MediaPipe (33 points) by Your Name/AI DATE

All code is provided under the MIT License (assuming original license applies)

"""

import glob
import os
import numpy as np
# pandas is used below to read the keypoint files and fill in missing values.
import pandas as pd

# --- Configuration for Output Files ---
# Base names of the dataset files. They are written into the 'data_path'
# directory configured below; X files hold the frames, Y files hold one label
# per sequence.
test_file_X = "X_test.txt"
test_file_Y = "Y_test.txt"
train_file_X = "X_train.txt"
train_file_Y = "Y_train.txt"

# --- Configuration based on your MediaPipe setup ---
# Root directory that contains one sub-folder per activity, each holding the
# converted '*_mp33.txt' files. This should be the 'MP_Data_JSON' directory.
# NOTE(review): this is a machine-specific absolute path; adjust it (or use a
# relative path) before running the notebook on another machine.
data_path = r"C:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\MP_Data_JSON"
# To use the script's current directory as a base for MP_Data_JSON:
# data_path_abs = os.path.join(os.getcwd(), "MP_Data_JSON") # Use this if you prefer an absolute path from start

# Activity folder names. The position in this list defines the class label:
# the label written to the Y files is the 1-based index of the activity.
activity_list = ['heel_hook', 'deadpoint', 'dyno', 'cross_move']
#activity_list = ['deadpoint', 'cross_move']
# Original script's variables, kept for context
# cluster_nums=4
# camera_nums=1
# subject_nums=12
# activity_nums=len(activity_list)
# repetition_nums=5

# --- RNN Sequence Configuration ---
# num_steps: number of frames in one sequence (window). Files with fewer frames
#   than this are skipped. (32 was the previous value.)
# test_train_split: probability, in the range 0.0-1.0, that a whole input file
#   is assigned to the training set; it is a fraction, not a percentage.
#   With 0.0 every file is written to the test set.
# split: if True, the second-stage re-split block further down is executed.
# overlap: fraction of a window shared with the next one. Consecutive windows
#   start num_steps * (1 - overlap) frames apart (truncated to an integer frame
#   index); values that make 1 - overlap <= 1e-6 are rejected.
num_steps = 180 #32  # Window width for RNN input (number of frames per sequence)
test_train_split = 0.0  # Probability of a file going to the training set (0.0 = all test)
split = False  # If True, the second-stage re-split is performed.
overlap = 0.8125  # Overlap when creating sequences. 0 = 0% overlap.

# --- Ensure data_path exists ---
# Stop early when the dataset root is missing. SystemExit is raised directly
# instead of calling exit(): exit() is a helper intended for the interactive
# shell, and calling it without an argument reports success even though this
# is an error path. Status 1 marks the run as failed.
if not os.path.isdir(data_path):
    print(f"Error: data_path '{data_path}' not found. Please check the path.")
    raise SystemExit(1)

# Remember where the notebook/script was started so the working directory can
# be restored at the very end if desired.
initial_cwd = os.getcwd()
# All later path handling uses the absolute form of data_path, so it keeps
# working even after the working directory is changed.
abs_data_path = os.path.abspath(data_path)

# --- Clean up old train/test files if they exist ---
# The dataset files are opened in append mode further down, so leftovers from a
# previous run must be deleted first; otherwise old and new sequences would be
# mixed. The files live directly in the abs_data_path directory.
files_to_remove = [test_file_X, test_file_Y, train_file_X, train_file_Y]
for stale_name in files_to_remove:
    stale_path = os.path.join(abs_data_path, stale_name)
    if os.path.exists(stale_path):
        print(f"Removing existing file: {stale_path}")
        os.remove(stale_path)
print("-" * 30)

def _load_filled_frames(file_path):
    """Read one keypoint file and return its frames as CSV text lines.

    The file is parsed as headerless comma-separated text with one frame per
    row. Values equal to 0.0 are read as missing, each column is then filled by
    linear interpolation along the frame axis in both directions, and anything
    still missing afterwards (for example a column without a single valid
    value) is set to 0.0.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one newline-terminated, comma-joined string per frame, or
        an empty list when the parsed table has no rows.

    Raises:
        FileNotFoundError, pandas.errors.EmptyDataError,
        pandas.errors.ParserError: propagated from pandas.read_csv.
    """
    frames_df = pd.read_csv(file_path, header=None, na_values=0.0)
    if frames_df.empty:
        return []

    # NOTE: an earlier version ran interpolate(method='spline', order=3) after
    # this linear pass. With limit_direction='both' the linear pass leaves no
    # gap that a later pass could fill, so the spline call never changed a
    # value; it was removed and the produced numbers are unchanged. To get a
    # real spline fit, the spline pass would have to run BEFORE the linear one
    # (and needs scipy) -- TODO decide whether that is wanted.
    filled_df = frames_df.interpolate(
        method='linear', axis=0, limit_direction='both'
    ).fillna(0.0)

    # Rows are converted through iterrows()/astype(str) so the text formatting
    # of every value stays exactly as before.
    return [",".join(row.astype(str).tolist()) + "\n" for _, row in filled_df.iterrows()]


# Process each activity.
# NOTE(review): np.random is not seeded anywhere in this section, so the
# train/test assignment differs from run to run.
for activity_idx, activity_name in enumerate(activity_list):
    activity_dir = os.path.join(abs_data_path, activity_name)  # e.g. <data_path>/heel_hook

    print(f"Processing activity: {activity_name} in {activity_dir}")

    if not os.path.isdir(activity_dir):
        print(f"Warning: Directory not found for activity {activity_name}, skipping: {activity_dir}")
        continue

    # Change directory into the activity's folder; files are globbed and read
    # relative to it.
    try:
        os.chdir(activity_dir)
    except OSError:
        print(f"  Error: Could not change directory to {activity_dir}, skipping activity.")
        # Go back to the parent (e.g., MP_Data_JSON) before the next activity.
        os.chdir(abs_data_path)
        continue
    print(f"  Changed CWD to: {os.getcwd()}")

    # try/finally guarantees that the working directory is restored even when
    # an unexpected error escapes from the file processing below; otherwise the
    # notebook would be left stranded inside an activity folder.
    try:
        for keypoint_file in sorted(glob.glob("*_mp33.txt")):
            print(f"    Processing file: {keypoint_file}")

            # The whole file goes to either the train or the test set.
            is_train = np.random.rand() < test_train_split
            print(f"      Assigning to {'train' if is_train else 'test'} set.")

            try:
                frame_lines = _load_filled_frames(keypoint_file)
            except FileNotFoundError:
                print(f"      Error: File not found {keypoint_file} within {os.getcwd()}, skipping.")
                continue
            except pd.errors.EmptyDataError:
                print(f"      Warning: File {keypoint_file} is empty, skipping.")
                continue
            except pd.errors.ParserError as parse_err:
                # A malformed file is skipped like the other bad-file cases
                # instead of aborting the whole run.
                print(f"      Error: Could not parse {keypoint_file} ({parse_err}), skipping.")
                continue

            if not frame_lines:
                print(f"      Warning: File {keypoint_file} is empty or could not be parsed, skipping.")
                continue

            num_frames = len(frame_lines)
            if num_frames < num_steps:
                print(f"      Warning: File {keypoint_file} has {num_frames} frames, less than num_steps ({num_steps}). Skipping.")
                continue

            if 1 - overlap <= 1e-6:  # overlap of (almost) 1 would give a zero step
                print(f"      Error: Invalid overlap value ({overlap}) results in zero or negative step. Skipping file.")
                num_framesets = 0
            else:
                num_framesets = int((num_frames - num_steps) / (num_steps * (1 - overlap))) + 1

            print(f"      Total frames: {num_frames}, Num sequences possible: {num_framesets}")

            if num_framesets <= 0:
                print(f"      Not enough frames in {keypoint_file} to create any sequences with current settings. Skipping.")
                continue

            # Cut the windows first and remember how many were really produced,
            # so the number of labels always matches the number of sequences.
            sequence_chunks = []
            for frameset_idx in range(num_framesets):
                start_frame = int(frameset_idx * num_steps * (1 - overlap))
                end_frame = start_frame + num_steps

                # Ensure the slice is within bounds
                if start_frame < 0 or end_frame > num_frames or start_frame >= end_frame:
                    print(f"      Warning: Invalid frame slice [{start_frame}:{end_frame}] for num_frames {num_frames}. Skipping frameset_idx {frameset_idx}.")
                    continue

                sequence_chunks.append("".join(frame_lines[start_frame:end_frame]))

            num_written = len(sequence_chunks)
            if num_written == 0:
                continue

            if is_train:
                output_file_X_basename = train_file_X
                output_file_Y_basename = train_file_Y
            else:
                output_file_X_basename = test_file_X
                output_file_Y_basename = test_file_Y

            # Output files (X_train.txt etc.) are in the abs_data_path directory
            x_output_full_path = os.path.join(abs_data_path, output_file_X_basename)
            y_output_full_path = os.path.join(abs_data_path, output_file_Y_basename)

            try:
                with open(x_output_full_path, 'a') as x_file:
                    x_file.writelines(sequence_chunks)
                print(f"      Appended {num_written} sequences to {x_output_full_path}")

                with open(y_output_full_path, 'a') as y_file:
                    # Y label is the 1-based activity index, one line per sequence.
                    y_file.write(f"{activity_idx + 1}\n" * num_written)
                print(f"      Appended {num_written} labels to {y_output_full_path}")

            except OSError as e:
                print(f"      Error writing to output files: {e}")
    finally:
        # After processing all files of this activity, change CWD back to
        # abs_data_path (e.g., MP_Data_JSON) so the next iteration starts from
        # a known location.
        try:
            os.chdir(abs_data_path)
            print(f"  Returned CWD to: {os.getcwd()} (after processing {activity_name})")
        except Exception as e:
            print(f"  Error on os.chdir back to base data_path ({abs_data_path}) " \
                  f"from within {activity_name} directory structure: {e}")
            print(f"  Current CWD is: {os.getcwd()}. Subsequent paths might be incorrect.")

print("-" * 30)

# Second-stage split (runs only when split is True).
# Every sequence written by the first stage is assigned again, one by one, to
# the train or test set with probability test_train_split; the original order
# of the sequences is kept inside each set.
#
# Both stage-one pairs (train and test) are pooled before re-splitting. The
# previous version read only the train pair but still deleted all four files,
# which silently discarded every sequence the first stage had put into the
# test set.
if split:
    print(f"Performing second stage split (if split=True) in CWD: {os.getcwd()}")

    X_data, Y_data = [], []
    resplit_ok = True
    for source_X_name, source_Y_name in ((train_file_X, train_file_Y), (test_file_X, test_file_Y)):
        path_X_to_resplit = os.path.join(abs_data_path, source_X_name)
        path_Y_to_resplit = os.path.join(abs_data_path, source_Y_name)
        has_X = os.path.exists(path_X_to_resplit)
        has_Y = os.path.exists(path_Y_to_resplit)

        if not has_X and not has_Y:
            # The first stage produced nothing for this set; nothing to pool.
            continue
        if has_X != has_Y:
            # Frames without labels (or the reverse) cannot be re-split safely.
            if not has_X:
                print(f"Error: Source file for X re-split not found: {path_X_to_resplit}")
            else:
                print(f"Error: Source file for Y re-split not found: {path_Y_to_resplit}")
            resplit_ok = False
            break

        try:
            with open(path_X_to_resplit, 'r') as X_file_handle:
                X_lines = X_file_handle.readlines()
            with open(path_Y_to_resplit, 'r') as Y_file_handle:
                Y_lines = Y_file_handle.readlines()
        except OSError as e:
            print(f"Error reading files for second split: {e}. Skipping second split.")
            resplit_ok = False
            break

        # Each label must own exactly num_steps frame lines. Refuse to continue
        # on a mismatch: the files would be deleted and rewritten with frames
        # attached to the wrong labels.
        if len(X_lines) != len(Y_lines) * num_steps:
            print(f"Error: {source_X_name} has {len(X_lines)} lines but {source_Y_name} has "
                  f"{len(Y_lines)} labels x {num_steps} frames. Skipping second split.")
            resplit_ok = False
            break

        X_data.extend(X_lines)
        Y_data.extend(Y_lines)

    if resplit_ok and X_data and Y_data:
        num_sequences_to_resplit = len(Y_data)
        print(f"  Re-splitting {num_sequences_to_resplit} sequences pooled from the stage-one train/test files")

        # One random draw per sequence; True means "training set".
        msk = np.random.rand(num_sequences_to_resplit) < test_train_split

        # Build the complete new content in memory before touching any file, so
        # the old files are only removed once the new data is ready.
        train_X_lines, train_Y_lines, test_X_lines, test_Y_lines = [], [], [], []
        for seq_idx, label_line in enumerate(Y_data):
            seq_lines = X_data[seq_idx * num_steps:(seq_idx + 1) * num_steps]
            if msk[seq_idx]:
                train_X_lines.extend(seq_lines)
                train_Y_lines.append(label_line)  # label lines already end with a newline
            else:
                test_X_lines.extend(seq_lines)
                test_Y_lines.append(label_line)

        # Remove the stage-one files, then write the re-split sets.
        for f_name in files_to_remove:
            f_path = os.path.join(abs_data_path, f_name)
            if os.path.exists(f_path):
                os.remove(f_path)

        for out_basename, out_lines in (
            (train_file_X, train_X_lines),
            (train_file_Y, train_Y_lines),
            (test_file_X, test_X_lines),
            (test_file_Y, test_Y_lines),
        ):
            # As before, a set that received no sequence gets no file at all.
            if out_lines:
                with open(os.path.join(abs_data_path, out_basename), 'w') as out_f:
                    out_f.writelines(out_lines)

        print("  Second stage split completed.")
    elif resplit_ok:
        print("  Skipping second stage split due to missing data.")

# Optional: the activity loop above changes the working directory (it returns
# to abs_data_path after each processed activity), so the notebook may no
# longer be in the directory it was started from. Uncomment the next line to
# restore the directory recorded in initial_cwd.
# os.chdir(initial_cwd)
print("Database creation process finished.")

------------------------------
Processing activity: heel_hook in C:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\MP_Data_JSON\heel_hook
Processing activity: deadpoint in C:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\MP_Data_JSON\deadpoint
Processing activity: dyno in C:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\MP_Data_JSON\dyno
Processing activity: cross_move in C:\kanno\vscode\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\RNN-for-Human-Activity-Recognition-using-2D-Pose-Input-master\MP_Data_JSON\cross_move
------------------------------
Database creation process finished.
