# 处理csv文件

### 验证新旧csv文件是否一致

In [2]:
import csv

def compare_csv_files(file1, file2):
    """
    Compare two CSV files row by row and print the outcome.

    Both files are opened as UTF-8 and parsed with ``csv.reader``. When the
    parsed rows are identical a single confirmation line is printed; otherwise
    every differing row is printed with its 1-based row number. A row that
    exists in only one of the files is shown as ``None`` for the other file.

    Args:
        file1: Path of the first CSV file.
        file2: Path of the second CSV file.

    Returns:
        None. The result is reported on stdout only.
    """
    with open(file1, 'r', newline='', encoding='utf-8') as left_handle, \
         open(file2, 'r', newline='', encoding='utf-8') as right_handle:
        left_rows = list(csv.reader(left_handle))
        right_rows = list(csv.reader(right_handle))

    if left_rows == right_rows:
        print("两个 CSV 文件内容完全一致。")
        return

    print("两个 CSV 文件内容不一致。")
    # Pad the shorter file with None so both sides can be walked in lockstep.
    longest = max(len(left_rows), len(right_rows))
    left_padded = left_rows + [None] * (longest - len(left_rows))
    right_padded = right_rows + [None] * (longest - len(right_rows))
    for row_number, (left, right) in enumerate(zip(left_padded, right_padded), start=1):
        if left != right:
            print(f"第 {row_number} 行不同：")
            print(f"  文件1: {left}")
            print(f"  文件2: {right}")

# Example usage: check that the regenerated actions.csv matches the copy in the dataset repo.
compare_csv_files('/home/lqi/lqi_temp/workspace/finished_dataset/csv_files/actions.csv', '/home/lqi/lqi_temp/HAR-in-Space/Dataset/csv_files/actions.csv')


两个 CSV 文件内容完全一致。


## Transfer Dataset to AVA-like Format

input:  `bounding_boxes.csv`
        `frames_root`
        `actions.csv`
        `output_csv`

output: a single CSV file (`output_csv`) containing the whole dataset in AVA-like format

In [35]:
import os
import pandas as pd
from PIL import Image

# Configuration: define input and output file paths
# Raw per-frame bounding boxes; read by main() with xmin/xmax/ymin/ymax parsed as floats.
RAW_CSV = '../files/bounding_boxes.csv'
# Root folder holding one sub-directory of .jpg frames per video_id.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
FRAMES_ROOT = '/home/lqi/lqi_temp/trainingspace/ava/frames'
# Action labels; main() reads the columns video_id, person_id and action from it.
ACTIONS_CSV = '../files/actions_final.csv'
# Destination of the merged annotation table (written with a header row).
OUTPUT_CSV = '../files/ava_with_head.csv'


def load_video_sizes(frames_root: str, video_ids: pd.Series) -> pd.DataFrame:
    """
    Look up the frame dimensions of every distinct video.

    For each unique value in ``video_ids`` the directory
    ``<frames_root>/<video_id>`` is scanned for ``.jpg`` files
    (case-insensitive) and the alphabetically first one is opened to read its
    size. Sorting makes the chosen frame deterministic, since ``os.listdir``
    returns entries in arbitrary order.

    Args:
        frames_root: Directory containing one sub-directory per video id.
        video_ids: Series of video ids; duplicates are ignored. Ids that are
            not strings (e.g. integers) are converted with ``str`` to build
            the directory name, but are returned unchanged so the result can
            be merged back on the original column.

    Returns:
        DataFrame with the columns ``video_id``, ``width`` and ``height``, one
        row per distinct video id. The columns are present even when
        ``video_ids`` is empty, so a later merge on ``video_id`` still works.

    Raises:
        FileNotFoundError: If a video's frame directory does not exist.
        ValueError: If a video's frame directory contains no ``.jpg`` file.
    """
    sizes = []
    for vid in video_ids.unique():
        # str() lets numeric ids work; os.path.join rejects non-string parts.
        folder = os.path.join(frames_root, str(vid))
        if not os.path.isdir(folder):
            raise FileNotFoundError(f"Frames directory not found: {folder}")
        # Sorted so that "the first frame" is the same file on every run.
        frames = sorted(f for f in os.listdir(folder) if f.lower().endswith('.jpg'))
        if not frames:
            raise ValueError(f"No jpg frames in {folder}")
        # Only the image header is needed; the context manager closes the file.
        with Image.open(os.path.join(folder, frames[0])) as img:
            width, height = img.size
        sizes.append({'video_id': vid, 'width': width, 'height': height})
    return pd.DataFrame(sizes, columns=['video_id', 'width', 'height'])


def main():
    """
    Convert the raw bounding-box table into a single AVA-like annotation CSV.

    Steps:
    1. Read RAW_CSV and attach each video's frame width/height.
    2. Divide x coordinates by width and y coordinates by height, rounded to
       three decimals.
    3. For every (video_id, person_id) keep only the rows whose frame_id is the
       one closest to that group's median frame_id; it is stored as frame_stamp.
    4. Attach the action label from ACTIONS_CSV and drop rows without one.
    5. Write the result to OUTPUT_CSV and print a one-line summary.
    """
    # 1. Raw boxes plus per-video frame dimensions
    boxes = pd.read_csv(
        RAW_CSV,
        dtype={'xmin': float, 'xmax': float, 'ymin': float, 'ymax': float},
    )
    boxes = boxes.merge(
        load_video_sizes(FRAMES_ROOT, boxes['video_id']),
        on='video_id',
        how='left',
    )

    # 2. Normalise every coordinate by the matching image extent
    extent_of = {'xmin': 'width', 'xmax': 'width', 'ymin': 'height', 'ymax': 'height'}
    normalised = {
        coord: (boxes[coord] / boxes[extent]).round(3)
        for coord, extent in extent_of.items()
    }
    boxes = boxes.assign(**normalised).drop(columns=['width', 'height'])

    # 3. Representative frame per (video, person): the frame_id nearest the median
    def _nearest_to_median(frame_ids):
        distance = (frame_ids - frame_ids.median()).abs()
        return frame_ids.iloc[distance.argmin()]

    stamps = boxes.groupby(['video_id', 'person_id'])['frame_id'].transform(_nearest_to_median)
    boxes = boxes.assign(frame_stamp=stamps)
    keyframes = boxes[boxes['frame_id'] == boxes['frame_stamp']].drop(columns=['frame_id'])

    # 4. Attach action labels; rows without a label are discarded
    labels = pd.read_csv(ACTIONS_CSV, usecols=['video_id', 'person_id', 'action'])
    labelled = (
        keyframes
        .merge(labels, on=['video_id', 'person_id'], how='left')
        .dropna(subset=['action'])
    )
    labelled = labelled.assign(action=labelled['action'].astype(int))

    # 5. Final column names and order
    result = labelled.rename(columns={'action': 'action_id'})[
        ['video_id', 'frame_stamp', 'xmin', 'ymin', 'xmax', 'ymax', 'action_id', 'person_id']
    ]

    # 6. Persist and report
    result.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Pipeline completed, output saved to {OUTPUT_CSV}, total records: {len(result)}")


# Run the conversion when this cell/script is executed directly (not when imported).
if __name__ == '__main__':
    main()


✅ Pipeline completed, output saved to ../files/ava_with_head.csv, total records: 13251


## 创建训练集，验证集，测试集

### 直接生成ava官方格式的csv文件用于annotation文件夹

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
按视频划分带动作覆盖：
  - 在视频级别贪心挑选视频，保证训练集至少包含每个 action_id；
  - 然后对剩余视频按比例 7:1:2 随机划分到 train/val/test；
  - 输出 ava_train.csv、ava_val.csv、ava_test.csv（无表头），同一视频的所有行不跨集；
  - 输出到同一指定文件夹，若不存在会自动创建；
  - 运行结束时打印每个子集的行数、行占比；以及视频数、视频占比。
"""

import os
import pandas as pd
import numpy as np

# ---- Configuration ----
# Annotation table (with header) that is split by video.
INPUT_CSV = "../files/ava_with_head.csv"
OUTPUT_DIR = "../files/output/ava_dataset"  # ava_train.csv, ava_val.csv and ava_test.csv are all written here
# Video-level split ratios. Train and val targets are computed from their ratios;
# the split function hands every remaining video to the test set, so TEST_RATIO
# is passed along but does not influence the result.
TRAIN_RATIO = 0.7
VAL_RATIO   = 0.1
TEST_RATIO  = 0.2
# Seed for the random shuffle of the videos, so the split is reproducible.
SEED        = 42


def ensure_dir(path: str):
    """Create the directory `path` (with parents) unless something already exists there."""
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)


def split_by_video_with_action_coverage(
    input_csv: str,
    output_dir: str,
    train_ratio: float = 0.7,
    val_ratio: float = 0.1,
    test_ratio: float = 0.2,
    seed: int = 42,
):
    """
    Split an annotation CSV into train/val/test by video, with full action coverage in train.

    Videos are first picked greedily (largest number of not-yet-covered
    action ids first) until the training set contains every ``action_id``.
    The remaining videos are shuffled and used to fill the training set up to
    ``int(train_ratio * n_videos)`` videos, then ``int(val_ratio * n_videos)``
    videos go to validation and everything left goes to test. All rows of a
    video end up in exactly one subset.

    Three header-less files ``ava_train.csv``, ``ava_val.csv`` and
    ``ava_test.csv`` are written to ``output_dir`` (created if missing), and
    row-level and video-level statistics are printed.

    Args:
        input_csv: CSV with a header containing at least ``video_id`` and
            ``action_id`` columns.
        output_dir: Directory receiving the three output files.
        train_ratio: Target fraction of videos for the training set. The
            greedy coverage step may exceed it.
        val_ratio: Target fraction of videos for the validation set.
        test_ratio: Kept for interface symmetry only; the test set always
            receives the videos left over after train and val are filled.
        seed: Seed of the shuffle. A private ``RandomState`` is used, which
            yields the same permutation as seeding NumPy's global generator
            but leaves the global random state untouched.

    Returns:
        None.
    """
    # 1. Load the annotations
    df = pd.read_csv(input_csv)
    total_rows = len(df)
    rng = np.random.RandomState(seed)

    # 2. video_id -> set of action ids (sets built once, reused by the greedy loop)
    vid2acts = {
        vid: set(acts)
        for vid, acts in df.groupby('video_id')['action_id'].unique().items()
    }
    all_videos = list(vid2acts)
    total_videos = len(all_videos)

    # 3. Video-level targets
    train_v_target = int(train_ratio * total_videos)
    val_v_target = int(val_ratio * total_videos)

    # 4. Greedy cover: repeatedly take the video adding the most uncovered actions
    all_actions = set(df['action_id'].unique())
    covered = set()
    selected_videos = set()
    while covered != all_actions:
        best_vid, best_gain = None, 0
        for vid, acts in vid2acts.items():
            if vid in selected_videos:
                continue
            gain = len(acts - covered)
            if gain > best_gain:
                best_gain, best_vid = gain, vid
        if best_vid is None:
            # No remaining video adds a new action; stop instead of looping forever.
            break
        selected_videos.add(best_vid)
        covered |= vid2acts[best_vid]

    # 5. Shuffle the rest and slice it into train top-up / val / test
    remaining = [v for v in all_videos if v not in selected_videos]
    rng.shuffle(remaining)
    need_more = max(train_v_target - len(selected_videos), 0)
    train_videos = selected_videos | set(remaining[:need_more])
    val_videos = set(remaining[need_more: need_more + val_v_target])
    test_videos = set(remaining[need_more + val_v_target:])

    # 6. Route rows by their video
    df_train = df[df['video_id'].isin(train_videos)]
    df_val = df[df['video_id'].isin(val_videos)]
    df_test = df[df['video_id'].isin(test_videos)]

    # 7. Write the subsets without header
    os.makedirs(output_dir, exist_ok=True)
    subsets = [('train', df_train), ('val', df_val), ('test', df_test)]
    for split_name, subdf in subsets:
        subdf.to_csv(os.path.join(output_dir, f'ava_{split_name}.csv'), index=False, header=False)

    def _percent(part, whole):
        # An empty input would otherwise raise ZeroDivisionError in the report.
        return part / whole * 100 if whole else 0.0

    # 8. Row-level statistics
    print("按视频划分完成：行级统计：")
    for name, subdf in [('Train', df_train), ('Val', df_val), ('Test', df_test)]:
        cnt = len(subdf)
        print(f"  {name}: {cnt} 条，占总 {cnt}/{total_rows} = {_percent(cnt, total_rows):.2f}%")

    # 9. Video-level statistics
    print("按视频划分完成：视频级统计：")
    for name, vids in [('Train', train_videos), ('Val', val_videos), ('Test', test_videos)]:
        vcnt = len(vids)
        print(f"  {name}: {vcnt} 个视频，占总 {vcnt}/{total_videos} = {_percent(vcnt, total_videos):.2f}%")


# Run the split with the values from the configuration block when executed directly.
if __name__ == "__main__":
    split_by_video_with_action_coverage(
        INPUT_CSV,
        OUTPUT_DIR,
        train_ratio=TRAIN_RATIO,
        val_ratio=VAL_RATIO,
        test_ratio=TEST_RATIO,
        seed=SEED,
    )


按视频划分完成：行级统计：
  Train: 9266 条，占总 9266/13251 = 69.93%
  Val: 1329 条，占总 1329/13251 = 10.03%
  Test: 2656 条，占总 2656/13251 = 20.04%
按视频划分完成：视频级统计：
  Train: 3331 个视频，占总 3331/4759 = 69.99%
  Val: 475 个视频，占总 475/4759 = 9.98%
  Test: 953 个视频，占总 953/4759 = 20.03%


按视频划分完成：行级统计：
  Train: 9266 条，占总 9266/13251 = 69.93%
  Val: 1329 条，占总 1329/13251 = 10.03%
  Test: 2656 条，占总 2656/13251 = 20.04%
按视频划分完成：视频级统计：
  Train: 3331 个视频，占总 3331/4759 = 69.99%
  Val: 475 个视频，占总 475/4759 = 9.98%
  Test: 953 个视频，占总 953/4759 = 20.03%

## 生成frame lists里面的文件

In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
根据已有的 ava_train/ava_val/ava_test 标注 CSV 和 frames 目录，
生成 AVA 所需的 frame_lists/{train,val,test}.csv：
  - 输入标注文件名：ava_train.csv, ava_val.csv, ava_test.csv（无表头）；
  - 输出文件名：train.csv, val.csv, test.csv（带表头）；
  - 列格式：original_video_id video_id frame_id path labels；
  - path 格式："{video_id}/{video_id}_{frame_id:06d}.jpg"；
  - labels 列固定填 ""；
  - 输出目录不存在会自动创建。
"""

import os
import glob
import pandas as pd

# ---- Configuration ----
ANNOT_DIR     = "../files/output/ava_dataset"   # holds ava_train.csv, ava_val.csv, ava_test.csv
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
FRAMES_ROOT   = "/home/lqi/lqi_temp/trainingspace/ava/frames"        # each sub-directory is one video_id and contains its .jpg frames
OUTPUT_DIR    = "../files/output/new_frame_lists"   # train.csv, val.csv, test.csv are generated here
SPLITS        = ["train", "val", "test"]
INPUT_PREFIX  = "ava_"                       # file-name prefix of the annotation files
# First line of every generated frame list (space-separated column names).
HEADER        = "original_video_id video_id frame_id path labels\n"

def ensure_dir(d):
    """Create directory `d` (with parents) unless something already exists at that path."""
    if os.path.exists(d):
        return
    os.makedirs(d, exist_ok=True)

def load_split_videos(split, annot_dir=None, prefix=None):
    """
    Return the set of video ids listed in the annotation file of one split.

    The file ``<annot_dir>/<prefix><split>.csv`` is read as a header-less CSV
    and only its first column (``video_id``) is loaded. The ids are read as
    text, so an id such as ``007`` keeps its leading zeros and still matches
    the name of its frame directory.

    Args:
        split: Split name, e.g. ``"train"``.
        annot_dir: Directory holding the annotation files. Defaults to the
            module-level ``ANNOT_DIR``.
        prefix: File-name prefix of the annotation files. Defaults to the
            module-level ``INPUT_PREFIX``.

    Returns:
        Set of video id strings. Empty when the annotation file is empty.
    """
    if annot_dir is None:
        annot_dir = ANNOT_DIR
    if prefix is None:
        prefix = INPUT_PREFIX
    csv_path = os.path.join(annot_dir, f"{prefix}{split}.csv")
    try:
        video_ids = pd.read_csv(
            csv_path,
            header=None,
            names=["video_id", "frame_stamp", "xmin", "ymin", "xmax", "ymax", "action_id", "person_id"],
            usecols=["video_id"],
            dtype={"video_id": str},
            na_filter=False,  # keep ids verbatim, never turn them into NaN
        )["video_id"]
    except pd.errors.EmptyDataError:
        # A split without any video is written as an empty file.
        return set()
    # With na_filter=False a blank cell is an empty string; it is not a video id.
    return {vid for vid in video_ids.unique() if vid}

def make_frame_list(split, videoset, *, frames_root=None, output_dir=None, header=None):
    """
    Write the frame list ``<output_dir>/<split>.csv`` for the given videos.

    For every video id (in sorted order) the files matching
    ``<frames_root>/<vid>/<vid>_*.jpg`` are listed. The part of the file name
    between ``<vid>_`` and ``.jpg`` is parsed as the integer frame id, and one
    space-separated line ``vid vid frame_id vid/filename ""`` is written per
    frame, ordered by numeric frame id. The numeric sort keeps the order
    correct even when frame numbers are not zero-padded to equal width.

    Files that match the pattern but whose suffix is not an integer are
    skipped and counted; the count is printed if it is non-zero.

    Args:
        split: Split name; also the stem of the output file.
        videoset: Iterable of video id strings belonging to the split.
        frames_root: Root directory of the per-video frame folders. Defaults
            to the module-level ``FRAMES_ROOT``.
        output_dir: Existing directory receiving the frame list. Defaults to
            the module-level ``OUTPUT_DIR``.
        header: First line written to the file, including its newline.
            Defaults to the module-level ``HEADER``.

    Returns:
        None.
    """
    if frames_root is None:
        frames_root = FRAMES_ROOT
    if output_dir is None:
        output_dir = OUTPUT_DIR
    if header is None:
        header = HEADER

    out_path = os.path.join(output_dir, f"{split}.csv")
    extension = ".jpg"
    skipped = 0
    with open(out_path, "w", encoding="utf-8") as fout:
        fout.write(header)
        for vid in sorted(videoset):
            name_prefix = f"{vid}_"
            # Escape the literal parts so ids containing [, ], * or ? are matched verbatim.
            pattern = os.path.join(
                glob.escape(os.path.join(frames_root, vid)),
                glob.escape(name_prefix) + "*" + extension,
            )
            frames = []
            for img_path in glob.glob(pattern):
                fname = os.path.basename(img_path)
                # Slice off prefix and extension only at the ends of the name.
                number_text = fname[len(name_prefix):-len(extension)]
                try:
                    frames.append((int(number_text), fname))
                except ValueError:
                    skipped += 1
            for frame_id, fname in sorted(frames):
                # The labels column is always the literal "".
                fout.write(f"{vid} {vid} {frame_id} {vid}/{fname} \"\"\n")
    print(f"生成 {out_path}")
    if skipped:
        print(f"  跳过 {skipped} 个文件名不含帧编号的文件")

def main():
    """Generate one frame list per entry of SPLITS inside OUTPUT_DIR."""
    ensure_dir(OUTPUT_DIR)
    for split_name in SPLITS:
        # Load the split's videos, then immediately write its frame list.
        make_frame_list(split_name, load_split_videos(split_name))
    print("所有 frame_lists 生成完毕！")

# Generate the frame lists when this cell/script is executed directly.
if __name__ == "__main__":
    main()


生成 ../files/output/new_frame_lists/train.csv
生成 ../files/output/new_frame_lists/val.csv
生成 ../files/output/new_frame_lists/test.csv
所有 frame_lists 生成完毕！


## 生成 train/val/test 的 predicted_boxes 标注文件

In [2]:
import os
import pandas as pd

def process_csv_folder(input_folder, output_folder='../files/output/ava_dataset'):
    """
    Derive a ``*_predicted_boxes.csv`` file from every annotation CSV in a folder.

    Each header-less ``.csv`` file in ``input_folder`` is read with all values
    kept as text. Its last two columns are removed and replaced by an empty
    column followed by a column filled with ``'1'``. The result is written,
    without header or index, to ``<output_folder>/<name>_predicted_boxes.csv``.

    Files whose name already ends in ``_predicted_boxes.csv`` are ignored.
    Because the input and output folder may be the same directory, this keeps
    a second run from re-processing its own output. Empty files and files with
    fewer than two columns are skipped with a message.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.csv`` files.
        output_folder: Directory receiving the generated files; created if
            missing.

    Returns:
        None.
    """
    out_suffix = "_predicted_boxes"
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are processed (and reported) in a stable order.
    for fname in sorted(os.listdir(input_folder)):
        if not fname.lower().endswith('.csv'):
            continue
        base, _ = os.path.splitext(fname)
        if base.endswith(out_suffix):
            # Product of an earlier run; processing it again would stack suffixes.
            continue
        in_path = os.path.join(input_folder, fname)

        # 1. Read the CSV without header, every column as text.
        #    keep_default_na=False keeps literal strings such as "NA" untouched.
        try:
            df = pd.read_csv(in_path, header=None, dtype=str, sep=',', keep_default_na=False)
        except pd.errors.EmptyDataError:
            print(f"跳过 {fname}：文件为空")
            continue

        # 2. Skip files with fewer than two columns
        if df.shape[1] < 2:
            print(f"跳过 {fname}：列数不足")
            continue

        # 3. Drop the last two columns
        df = df.iloc[:, :-2]

        # 4. Append the new second-to-last column (empty) and last column (all '1');
        #    distinct names avoid clashing with the integer column labels.
        df['blank_col'] = ''
        df['fill11_col'] = '1'

        # 5. Save without column names and without the row index
        out_name = f"{base}{out_suffix}.csv"
        df.to_csv(
            os.path.join(output_folder, out_name),
            header=False,
            index=False,
            sep=','
        )
        print(f"已处理：{fname} → {out_name}")

if __name__ == '__main__':
    # Change the path below to the directory that holds your own CSV files.
    # It is the same directory as process_csv_folder's default output folder,
    # so the generated files land next to their inputs.
    input_folder = '../files/output/ava_dataset'
    process_csv_folder(input_folder)


已处理：ava_val.csv → ava_val_predicted_boxes.csv
已处理：ava_train.csv → ava_train_predicted_boxes.csv
已处理：ava_test.csv → ava_test_predicted_boxes.csv
