# Dataset Statistics

## Pre-statistical processing

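Remove the items (labels) from the pbtxt label map whose ids do not appear in the `action` column of the CSV file, and save the remaining items to a new pbtxt file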

In [None]:
import pandas as pd
import re

def main(
    csv_path='../files/actions_final.csv',
    pbtxt_path='/home/lqi/lqi_temp/workspace/statistics/files/ava_label_map_v2.1.pbtxt',
    output_path='labels_filtered.pbtxt',
):
    """
    Write a copy of a pbtxt label map that keeps only the items used in a CSV file.

    An item block is kept when the number following "id:" inside it occurs in
    the CSV's 'action' column. Blocks in which no id can be found are kept
    unchanged. Kept blocks are written separated by one blank line.

    Parameters:
        csv_path: CSV file with an 'action' column holding action ids.
        pbtxt_path: label map made of "item { ... }" blocks.
        output_path: where the filtered label map is written (overwritten if present).

    The defaults reproduce the paths this cell originally hard-coded.
    """
    # 1. Read the CSV file and get all unique ids from the action column
    df = pd.read_csv(csv_path)
    csv_action_ids = set(df['action'].unique())
    print("Action IDs in the CSV file:", csv_action_ids)

    # 2. Read the pbtxt file content
    with open(pbtxt_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Each block starts with "item {" and ends at the FIRST "}" that follows,
    # so this assumes item blocks contain no nested braces.
    item_blocks = re.findall(r'(item\s*\{.*?\})', content, re.DOTALL)
    print(f"Total number of item blocks in the pbtxt file: {len(item_blocks)}")

    # 3. Keep a block if its id is used in the CSV, or if it has no id at all
    kept_items = []
    for block in item_blocks:
        id_match = re.search(r"id:\s*(\d+)", block)
        if id_match is None or int(id_match.group(1)) in csv_action_ids:
            kept_items.append(block)

    print(f"Number of item blocks retained after filtering: {len(kept_items)}")

    # 4. Write the retained item blocks to a new pbtxt file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("\n\n".join(kept_items))

    print(f"Filtered pbtxt file saved as {output_path}.")


if __name__ == '__main__':
    main()


## Count the number of videos

In [None]:
import pandas as pd

csv_file_path = '/home/lqi/lqi_temp/HAR-in-Space/Dataset/csv_files/actions.csv'


def count_unique_video_ids(csv_path=None):
    """
    Print and return the number of distinct values in a CSV's 'video_id' column.

    Parameters:
        csv_path: CSV file to read. When None (the default), the module-level
            csv_file_path is used.
    Returns:
        The number of unique video ids, or None (after printing an error
        message) when the file cannot be read/parsed or has no 'video_id' column.
    """
    if csv_path is None:
        csv_path = csv_file_path

    # Only the read can fail for reasons outside our control: OSError covers
    # missing/unreadable files; pandas' parser and empty-data errors, as well
    # as decoding errors, are ValueError subclasses.
    try:
        df = pd.read_csv(csv_path)
    except (OSError, ValueError) as e:
        print(f"Error: {e}")
        return None

    # Check if the 'video_id' column exists
    if 'video_id' not in df.columns:
        print("Error: 'video_id' column not found in the CSV file.")
        return None

    # Calculate the number of unique video IDs
    unique_ids = df['video_id'].nunique()
    print(f"Number of unique video IDs: {unique_ids}")
    return unique_ids

# Example usage
count_unique_video_ids()


Number of unique video IDs: 4759


4759

## action statistics

Count the number and percentage of each action

Input: csv file and pbtxt file

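Output: a CSV file of per-action counts and percentages (`action_result_all.csv` for all videos, `action_result_real.csv` for the filtered run)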

### All videos

In [None]:
import pandas as pd
import re

def parse_pbtxt(file_path):
    """
    Build an {id: name} dictionary from the "item { ... }" blocks of a pbtxt file.

    A block contributes an entry only when both an "id: <digits>" and a
    'name: "<text>"' field are found inside it; other blocks are skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    id_to_name = {}
    # group(1) is the text between the braces of one item block
    for block in re.finditer(r'item\s*{([^}]+)}', text, re.DOTALL):
        body = block.group(1)
        found_id = re.search(r'id:\s*(\d+)', body)
        found_name = re.search(r'name:\s*"(.*?)"', body)
        if not (found_id and found_name):
            continue
        id_to_name[int(found_id.group(1))] = found_name.group(1)
    return id_to_name

def analyze_csv(csv_path, id2name):
    """
    Summarize how often each action occurs in a CSV file.

    Returns a tuple (total_videos, results):
      - total_videos: number of distinct values in the 'video_id' column;
      - results: one dict per distinct value of the 'action' column, with keys
        'action_id', 'action_name', 'count' and 'ratio(%)'. The ratio is the
        action's share of all counted rows, in percent, rounded to two decimals.
    Ids missing from id2name are named 'Unknown action <id>'.
    """
    frame = pd.read_csv(csv_path)

    video_total = frame['video_id'].nunique()
    per_action = frame['action'].value_counts()
    grand_total = per_action.sum()

    # One record per action, in the order value_counts() yields them
    rows = [
        {
            'action_id': aid,
            'action_name': id2name.get(aid, f'Unknown action {aid}'),
            'count': n,
            'ratio(%)': round((n / grand_total) * 100, 2),
        }
        for aid, n in per_action.items()
    ]
    return video_total, rows

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'
    output_csv = '../results/action_result_all.csv'

    # Build the id -> action name lookup, then gather the per-action statistics
    id2name = parse_pbtxt(pbtxt_file)
    total_videos, action_results = analyze_csv(csv_file, id2name)
    print(f'Total number of videos: {total_videos}')

    # Write the table with the most frequent action first
    result_df = pd.DataFrame(action_results).sort_values('count', ascending=False)
    result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f'Action statistics have been saved to {output_csv}')


### Only count the data of m or r

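The code below keeps only rows with `movie_or_real == 'r'` (real videos). To analyze movie videos instead, replace `'r'` with `'m'` and rename the output file accordingly.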

In [None]:
import pandas as pd
import re

def parse_pbtxt(file_path):
    """
    Read a pbtxt label map and return a dict mapping each item id to its name.

    Only item blocks that contain both an id and a quoted name are included.
    """
    item_re = re.compile(r'item\s*{([^}]+)}', re.DOTALL)
    id_re = re.compile(r'id:\s*(\d+)')
    name_re = re.compile(r'name:\s*"(.*?)"')

    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()

    # Pair up the id / name matches of every block, then keep complete pairs only
    matches = [(id_re.search(body), name_re.search(body)) for body in item_re.findall(raw)]
    return {
        int(id_hit.group(1)): name_hit.group(1)
        for id_hit, name_hit in matches
        if id_hit and name_hit
    }

def analyze_csv(csv_path, id2name, movie_or_real_value='r'):
    """
    Summarize how often each action occurs among the rows of one video source.

    Only rows whose 'movie_or_real' column equals movie_or_real_value are
    considered.

    Parameters:
        csv_path: CSV file with 'video_id', 'action' and 'movie_or_real' columns.
        id2name: dict mapping action id -> action name.
        movie_or_real_value: value of 'movie_or_real' to keep. Defaults to 'r',
            the value this function originally hard-coded.
    Returns:
        (total_videos, results) where total_videos is the number of distinct
        'video_id' values among the kept rows and results is a list of dicts
        with keys 'action_id', 'action_name', 'count' and 'ratio(%)'
        (percentage of the kept rows' actions, rounded to two decimals).
        Ids missing from id2name are named 'Unknown action <id>'.
    """
    df = pd.read_csv(csv_path)

    # Keep only the rows of the requested source
    df = df[df['movie_or_real'] == movie_or_real_value]

    # Total videos (unique video_id)
    total_videos = df['video_id'].nunique()

    # Count occurrences of each action
    action_counts = df['action'].value_counts()
    total_actions = action_counts.sum()

    # Build results list with id, name, count, and percentage.
    # When no row matches, action_counts is empty and no division takes place.
    results = []
    for action_id, count in action_counts.items():
        action_name = id2name.get(action_id, f'Unknown action {action_id}')
        ratio = round((count / total_actions) * 100, 2)
        results.append({
            'action_id': action_id,
            'action_name': action_name,
            'count': count,
            'ratio(%)': ratio
        })

    return total_videos, results

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'
    output_csv = '../results/action_result_real.csv'

    # Build the id -> action name lookup, then analyze the filtered CSV rows
    id2name = parse_pbtxt(pbtxt_file)
    total_videos, action_results = analyze_csv(csv_file, id2name)
    print(f'Total number of videos (movie_or_real == "r"): {total_videos}')

    # Write the table with the most frequent action first
    result_df = pd.DataFrame(action_results).sort_values('count', ascending=False)
    result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f'Action statistics have been saved to {output_csv}')


## Label type statistics

Count the number and percentage of each label type

Input: csv file and pbtxt file

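Output: a CSV file of per-label-type counts and percentages (`label_type_result_all.csv` for all videos, `label_type_result_real.csv` for the filtered run)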

### All videos

In [None]:
import pandas as pd
import re

def parse_pbtxt(file_path):
    """
    Map each item id of a pbtxt label map to its name and label_type.

    The result looks like
    {1: {'name': 'bend/bow (at the waist)', 'label_type': 'PERSON_MOVEMENT'}, ...}.
    An item block is included only when an id, a quoted name and a label_type
    are all found in it.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    table = {}
    # group(1) is the text between the braces of one item block
    for block in re.finditer(r'item\s*{([^}]+)}', text, re.DOTALL):
        body = block.group(1)
        found = (
            re.search(r'id:\s*(\d+)', body),
            re.search(r'name:\s*"(.*?)"', body),
            re.search(r'label_type:\s*(\w+)', body),
        )
        if not all(found):
            continue
        item_id, item_name, item_type = (m.group(1) for m in found)
        table[int(item_id)] = {'name': item_name, 'label_type': item_type}
    return table

def analyze_csv_by_label_type(csv_path, pbtxt_mapping):
    """
    Count the CSV rows per label_type.

    Each row's 'action' value is looked up in pbtxt_mapping to obtain its
    label_type; ids that are missing (or have no 'label_type' entry) are
    counted as 'unknown'.

    Returns (total_records, results), where total_records is the number of CSV
    rows and results is a list of dicts with keys 'label_type', 'count' and
    'ratio(%)' (percentage of all rows, rounded to two decimals), in the order
    value_counts() yields them.
    """
    records = pd.read_csv(csv_path)

    def _label_type_of(action_id):
        # Fall back to 'unknown' for ids the label map does not describe
        return pbtxt_mapping.get(action_id, {}).get('label_type', 'unknown')

    records['label_type'] = records['action'].map(_label_type_of)
    n_records = len(records)

    summary = [
        {
            'label_type': kind,
            'count': n,
            'ratio(%)': round((n / n_records) * 100, 2),
        }
        for kind, n in records['label_type'].value_counts().items()
    ]
    return n_records, summary

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'
    output_csv = '../results/label_type_result_all.csv'

    # Action id -> {name, label_type} lookup, then the per-label_type statistics
    pbtxt_mapping = parse_pbtxt(pbtxt_file)
    total_records, label_type_results = analyze_csv_by_label_type(csv_file, pbtxt_mapping)
    print(f'Total number of records: {total_records}')

    # Write the table with the most frequent label_type first
    result_df = pd.DataFrame(label_type_results).sort_values(by='count', ascending=False)
    result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f'Label_type statistics have been saved to {output_csv}')


### Count the number and percentage of label types for movie and real videos respectively

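The code below keeps only rows with `movie_or_real == 'r'` (real videos). To analyze movie videos instead, replace `'r'` with `'m'` and rename the output file accordingly.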

In [None]:
import pandas as pd
import re

def parse_pbtxt(file_path):
    """
    Parse a pbtxt label map into {action id: {'name': ..., 'label_type': ...}}.

    For example: {1: {'name': 'bend/bow (at the waist)', 'label_type': 'PERSON_MOVEMENT'}, ...}
    Item blocks lacking an id, a quoted name or a label_type are left out.
    """
    item_re = re.compile(r'item\s*{([^}]+)}', re.DOTALL)
    id_re = re.compile(r'id:\s*(\d+)')
    name_re = re.compile(r'name:\s*"(.*?)"')
    type_re = re.compile(r'label_type:\s*(\w+)')

    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()

    parsed = {}
    for body in item_re.findall(raw):
        id_hit = id_re.search(body)
        name_hit = name_re.search(body)
        type_hit = type_re.search(body)
        # All three fields are required for an entry
        if id_hit is None or name_hit is None or type_hit is None:
            continue
        parsed[int(id_hit.group(1))] = {
            'name': name_hit.group(1),
            'label_type': type_hit.group(1),
        }
    return parsed

def analyze_csv_by_label_type(csv_path, pbtxt_mapping, movie_or_real_value='r'):
    """
    Count the CSV rows of one video source per label_type.

    Only rows whose 'movie_or_real' column equals movie_or_real_value are
    considered. Each kept row's 'action' value is looked up in pbtxt_mapping to
    obtain its label_type; ids that are missing (or have no 'label_type'
    entry) are counted as 'unknown'.

    Parameters:
        csv_path: CSV file with 'action' and 'movie_or_real' columns.
        pbtxt_mapping: dict mapping action id -> {'name': ..., 'label_type': ...}.
        movie_or_real_value: value of 'movie_or_real' to keep. Defaults to 'r',
            the value this function originally hard-coded.
    Returns:
        (total_records, results), where total_records is the number of kept
        rows and results is a list of dicts with keys 'label_type', 'count'
        and 'ratio(%)' (percentage of the kept rows, rounded to two decimals).
    """
    df = pd.read_csv(csv_path)

    # Keep only the rows of the requested source; copy so that adding a column
    # below does not write into a slice of df
    df_filtered = df[df['movie_or_real'] == movie_or_real_value].copy()

    # Add label_type to each record using pbtxt_mapping; default to 'unknown' if missing
    df_filtered['label_type'] = df_filtered['action'].map(
        lambda x: pbtxt_mapping.get(x, {}).get('label_type', 'unknown')
    )

    total_records = len(df_filtered)

    # Count occurrences of each label_type
    label_counts = df_filtered['label_type'].value_counts()

    results = []
    for label, count in label_counts.items():
        ratio = round((count / total_records) * 100, 2) if total_records > 0 else 0
        results.append({
            'label_type': label,
            'count': count,
            'ratio(%)': ratio
        })

    return total_records, results

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'
    output_csv = '../results/label_type_result_real.csv'

    # Action ID -> {name, label_type} lookup, then the statistics of the filtered rows
    pbtxt_mapping = parse_pbtxt(pbtxt_file)
    total_records, label_type_results = analyze_csv_by_label_type(csv_file, pbtxt_mapping)
    print(f'Total records (movie_or_real == "r"): {total_records}')

    # Write the table with the most frequent label_type first
    result_df = pd.DataFrame(label_type_results).sort_values(by='count', ascending=False)
    result_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f'Label_type statistics have been saved to {output_csv}')


## People Count

Count how many distinct people appear in each video, then report how many videos (and what percentage of videos) have each people count

Input: csv file

Output: person_count_all.csv

### All videos

In [None]:
import pandas as pd

def analyze_video_person_count(csv_path):
    """
    Tabulate how many videos contain 1, 2, 3, ... distinct people.

    Returns (total_videos, summary): total_videos is the number of distinct
    'video_id' groups; summary is a DataFrame with columns 'person_count',
    'video_count' and 'ratio(%)' (share of all videos, two decimals), sorted
    by 'person_count' ascending.
    """
    table = pd.read_csv(csv_path)

    # Distinct person_id values per video (a Series indexed by video_id)
    people_per_video = table.groupby("video_id")["person_id"].nunique()
    total_videos = len(people_per_video)

    # How many videos share each person count
    tally = people_per_video.value_counts()
    summary = pd.DataFrame({
        "person_count": tally.index,
        "video_count": tally.values,
    })
    summary["ratio(%)"] = (summary["video_count"] / total_videos * 100).round(2)

    # Present the rows as 1 person, 2 persons, ...
    summary = summary.sort_values("person_count").reset_index(drop=True)
    return total_videos, summary

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    output_csv = '../results/person_count_all.csv'

    total_videos, summary_df = analyze_video_person_count(csv_file)
    print(f"Total number of videos: {total_videos}")

    # Persist the people-count table
    summary_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Results have been saved to {output_csv}")


### m or r 

In [None]:
import pandas as pd

def analyze_video_person_count_movie(csv_path, movie_or_real_value='r'):
    """
    Tabulate how many videos of one source contain 1, 2, 3, ... distinct people.

    Only rows whose 'movie_or_real' column equals movie_or_real_value are
    considered. NOTE: despite the function's name, the default keeps the 'r'
    rows, which is the value this function originally hard-coded.

    Parameters:
        csv_path: CSV file with 'video_id', 'person_id' and 'movie_or_real' columns.
        movie_or_real_value: value of 'movie_or_real' to keep (default 'r').
    Returns:
        (total_videos, summary): total_videos is the number of distinct
        'video_id' groups among the kept rows; summary is a DataFrame with
        columns 'person_count', 'video_count' and 'ratio(%)' (share of the
        kept videos, two decimals), sorted by 'person_count' ascending.
    """
    # Read the CSV data
    df = pd.read_csv(csv_path)

    # Keep only the rows of the requested source
    df_filtered = df[df['movie_or_real'] == movie_or_real_value]

    # Group by video_id and count unique person_id per video
    video_person_counts = (
        df_filtered
        .groupby("video_id")["person_id"]
        .nunique()
        .reset_index(name="person_count")
    )

    # Count number of videos by person count. The columns are renamed
    # explicitly because the names reset_index() produces here differ
    # between pandas versions.
    summary = video_person_counts["person_count"].value_counts().reset_index()
    summary.columns = ["person_count", "video_count"]

    # Calculate ratio (percentage), rounded to two decimal places
    total_videos = video_person_counts.shape[0]
    summary["ratio(%)"] = (summary["video_count"] / total_videos * 100).round(2)

    # Sort by person_count
    summary = summary.sort_values("person_count").reset_index(drop=True)

    return total_videos, summary

if __name__ == '__main__':
    # Input and output locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    output_csv = '../results/person_count_real.csv'

    total_videos, summary_df = analyze_video_person_count_movie(csv_file)
    print(f"Total number of videos (movie_or_real == 'r'): {total_videos}")

    # Persist the people-count table of the filtered videos
    summary_df.to_csv(output_csv, index=False, encoding='utf-8-sig')
    print(f"Summary results have been saved to {output_csv}")


## Total data statistics

- Total number of data rows in the CSV file
- Total number of videos in the CSV file (counted as distinct `video_id` values)
- Number of items in the pbtxt file

### all

In [None]:
import pandas as pd
import re

def count_csv_info(csv_path):
    """
    Return (row count, number of distinct 'video_id' values) for a CSV file.
    """
    table = pd.read_csv(csv_path)
    return len(table), table['video_id'].nunique()

def count_pbtxt_items(pbtxt_path):
    """
    Return how many times an item block opener ("item", optional whitespace,
    then "{") occurs in the pbtxt file.
    """
    with open(pbtxt_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    # Tally the openers one by one instead of collecting them in a list
    return sum(1 for _ in re.finditer(r'item\s*{', text))

if __name__ == '__main__':
    # Input locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'

    # Gather the three totals first, then report them together
    total_data, total_videos = count_csv_info(csv_file)
    total_items = count_pbtxt_items(pbtxt_file)

    print(f"Total number of rows in CSV file: {total_data}")
    print(f"Total number of unique videos in CSV file: {total_videos}")
    print(f"Total number of item blocks in pbtxt file: {total_items}")


### m or r 

In [None]:
import pandas as pd
import re

def parse_pbtxt(file_path):
    """
    Turn a pbtxt label map into a dictionary keyed by action id.

    Each value is a dict holding the item's name and label_type, e.g.
    {1: {'name': 'bend/bow (at the waist)', 'label_type': 'PERSON_MOVEMENT'}, ...}.
    Items in which an id, a quoted name or a label_type cannot be found are skipped.
    """
    field_patterns = (
        r'id:\s*(\d+)',
        r'name:\s*"(.*?)"',
        r'label_type:\s*(\w+)',
    )

    with open(file_path, 'r', encoding='utf-8') as src:
        raw = src.read()

    lookup = {}
    for body in re.findall(r'item\s*{([^}]+)}', raw, re.DOTALL):
        hits = [re.search(pattern, body) for pattern in field_patterns]
        if None in hits:
            # At least one of the three fields is absent from this block
            continue
        raw_id, item_name, item_type = [hit.group(1) for hit in hits]
        lookup[int(raw_id)] = {'name': item_name, 'label_type': item_type}
    return lookup

def count_filtered_csv_info(csv_path, movie_or_real_value):
    """
    Select the CSV rows whose 'movie_or_real' column equals movie_or_real_value.

    Returns a 3-tuple:
      - the number of selected rows,
      - the number of distinct 'video_id' values among them,
      - the selected rows themselves as a DataFrame.
    """
    table = pd.read_csv(csv_path)
    wanted = table['movie_or_real'] == movie_or_real_value
    subset = table[wanted]
    return len(subset), subset['video_id'].nunique(), subset

def count_filtered_pbtxt_items(filtered_df, pbtxt_mapping):
    """
    Return how many distinct values of filtered_df's 'action' column are keys
    of pbtxt_mapping, i.e. how many pbtxt items the given rows actually use.
    """
    distinct_actions = set(filtered_df['action'].unique())
    known_actions = [action for action in distinct_actions if action in pbtxt_mapping]
    return len(known_actions)

if __name__ == '__main__':
    # Which source to report on, and where the inputs live -- update as needed
    movie_or_real_value = 'r'
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'

    # Label map lookup keyed by action id
    pbtxt_mapping = parse_pbtxt(pbtxt_file)

    # Row / video totals of the filtered CSV, plus the filtered rows themselves
    total_rows, total_videos, filtered_df = count_filtered_csv_info(csv_file, movie_or_real_value)

    # Number of pbtxt items those rows actually use
    filtered_pbtxt_count = count_filtered_pbtxt_items(filtered_df, pbtxt_mapping)

    print(f"Total number of rows in CSV where movie_or_real == '{movie_or_real_value}': {total_rows}")
    print(f"Total number of unique videos where movie_or_real == '{movie_or_real_value}': {total_videos}")
    print(f"Number of pbtxt items used in records where movie_or_real == '{movie_or_real_value}': {filtered_pbtxt_count}")


### Count the label map items that actually appear in the CSV file, instead of simply counting all items in the label map file

In [None]:
import pandas as pd
import re

def count_csv_info(csv_path):
    """
    Load the CSV file and return a pair:
      - the number of rows,
      - the number of distinct 'video_id' values.
    """
    frame = pd.read_csv(csv_path)
    row_total = len(frame)
    video_total = frame['video_id'].nunique()
    return row_total, video_total

def get_pbtxt_item_ids(pbtxt_path):
    """
    Return the set of integers that follow an "id:" anywhere in the pbtxt file.

    The whole file text is scanned at once, so every "id: <digits>" occurrence
    contributes, whichever block it sits in.
    """
    with open(pbtxt_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return set(map(int, re.findall(r'id:\s*(\d+)', text)))

def count_pbtxt_item_ids_in_csv(csv_path, pbtxt_path, label_column='action'):
    """
    Count how many pbtxt item ids appear in the CSV file:
    1. Extract all item ids from the pbtxt file (as the reference set).
    2. Extract the unique, non-missing values of the CSV's label_column.
    3. Compute the intersection: count of item ids that both appear in
       the CSV and are defined in the pbtxt.

    Parameters:
        csv_path: path to the CSV file
        pbtxt_path: path to the pbtxt file
        label_column: column name in the CSV containing pbtxt item ids (default 'action')
    Returns:
        The number of pbtxt item ids present in the CSV file.
    """
    pbtxt_item_ids = get_pbtxt_item_ids(pbtxt_path)
    df = pd.read_csv(csv_path)
    # Empty cells are read as NaN and int(NaN) raises ValueError, so rows
    # without a label are dropped before the ids are converted to int.
    csv_item_ids = {int(x) for x in df[label_column].dropna().unique()}
    matched_ids = pbtxt_item_ids.intersection(csv_item_ids)
    return len(matched_ids)

if __name__ == '__main__':
    # Input locations -- update these to your own paths
    csv_file = '../files/actions_final.csv'
    pbtxt_file = '../files/label_map.pbtxt'

    # Gather the totals first, then report them together
    total_data, total_videos = count_csv_info(csv_file)
    total_items = count_pbtxt_item_ids_in_csv(csv_file, pbtxt_file, label_column='action')

    print(f"Total number of rows in CSV file: {total_data}")
    print(f"Total number of unique videos in CSV file: {total_videos}")
    print(f"Number of pbtxt item ids appearing in the CSV file: {total_items}")
