In [9]:
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
import os
from tqdm import tqdm

def process_video_dataset(
    video_path: str,
    labels_csv_path: str,
    output_base_dir: str,
    clip_duration: int = 10,
    total_clips: int = 84,
    fps: int = 30
):
    """
    Process a long video file into clips without audio and organize them by class labels.
    
    Args:
        video_path: Path to the source MKV video file
        labels_csv_path: Path to CSV file containing labels
        output_base_dir: Base directory for output class folders
        clip_duration: Duration of each clip in seconds
        total_clips: Total number of clips to generate
        fps: Frames per second to maintain in output
    """
    # Create output directory
    Path(output_base_dir).mkdir(parents=True, exist_ok=True)
    
    # Read labels
    labels_df = pd.read_csv(labels_csv_path)
    
    # Open video
    cap = cv2.VideoCapture(video_path)
    
    # Get video properties
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    video_fps = cap.get(cv2.CAP_PROP_FPS)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    
    # Calculate middle 14 minutes in frames
    total_duration = total_frames / video_fps
    start_time = (total_duration - (14 * 60)) / 2
    start_frame = int(start_time * video_fps)
    
    # Create class directories
    unique_labels = sorted(labels_df['cs'].unique())  # Using 'cs' column
    class_dirs = {}
    for label in unique_labels:
        class_dir = os.path.join(output_base_dir, str(label))
        Path(class_dir).mkdir(parents=True, exist_ok=True)
        class_dirs[label] = class_dir
    
    # Calculate frames per clip
    frames_per_clip = int(clip_duration * fps)
    
    print("Starting video processing...")
    
    # Process each clip
    for clip_idx in tqdm(range(total_clips), desc="Processing clips"):
        # Calculate frame ranges for this clip
        clip_start_frame = start_frame + (clip_idx * frames_per_clip)
        clip_end_frame = clip_start_frame + frames_per_clip
        
        # Get label for this clip (using mode of 10 seconds worth of labels)
        label_start_idx = clip_idx * 10
        label_end_idx = label_start_idx + 10
        clip_label = labels_df.iloc[label_start_idx:label_end_idx]['cs'].mode().iloc[0]
        
        # Setup output video writer
        clip_name = f'clip_{clip_idx:03d}.mp4'
        clip_path = os.path.join(class_dirs[clip_label], clip_name)
        
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(clip_path, fourcc, fps, (frame_width, frame_height))
        
        # Set frame position
        cap.set(cv2.CAP_PROP_POS_FRAMES, clip_start_frame)
        
        # Read and write frames for this clip
        frames_written = 0
        while frames_written < frames_per_clip:
            ret, frame = cap.read()
            if not ret:
                break
            out.write(frame)
            frames_written += 1
        
        # Release video writer
        out.release()
        
        # Verify clip was created successfully
        if not os.path.exists(clip_path) or os.path.getsize(clip_path) == 0:
            print(f"Warning: Failed to create clip {clip_idx}")
            continue
    
    # Clean up
    cap.release()
    
    return verify_dataset(output_base_dir)

def verify_dataset(output_base_dir: str):
    """
    Verify the processed dataset and return statistics.
    
    Args:
        output_base_dir: Base directory containing class folders
    
    Returns:
        dict: Dataset statistics
    """
    stats = {
        'total_clips': 0,
        'class_distribution': {},
        'problematic_clips': []
    }
    
    # Check each class directory
    for class_dir in Path(output_base_dir).iterdir():
        if class_dir.is_dir():
            class_name = class_dir.name
            clips = list(class_dir.glob('*.mp4'))
            stats['class_distribution'][class_name] = len(clips)
            stats['total_clips'] += len(clips)
            
            # Check each clip
            for clip_path in clips:
                cap = cv2.VideoCapture(str(clip_path))
                if not cap.isOpened():
                    stats['problematic_clips'].append(str(clip_path))
                else:
                    # Verify frame count
                    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    if frame_count == 0:
                        stats['problematic_clips'].append(str(clip_path))
                cap.release()
    
    # Print statistics
    print("\nDataset Statistics:")
    print(f"Total clips: {stats['total_clips']}")
    print("\nClips per class:")
    for class_name, count in sorted(stats['class_distribution'].items()):
        print(f"Class {class_name}: {count} clips")
    
    if stats['problematic_clips']:
        print("\nWarning: The following clips may be corrupted:")
        for clip in stats['problematic_clips']:
            print(f"- {clip}")
    
    return stats

def main():
    """
    Main function to run the video preprocessing pipeline.
    """
    # Configuration
    config = {
        'video_path': "13_video.mkv",
        'labels_csv_path': "cs_labels30.csv",
        'output_base_dir': "vrwalking",
        'clip_duration': 10,  # seconds
        'total_clips': 84,
        'fps': 60
    }
    
    # Process dataset
    stats = process_video_dataset(**config)
    
    # Print final status
    print("\nProcessing complete!")
    print(f"Successfully created {stats['total_clips']} clips")
    if stats['problematic_clips']:
        print(f"Found {len(stats['problematic_clips'])} problematic clips")

if __name__ == "__main__":
    main()

Starting video processing...


Processing clips: 100%|██████████| 84/84 [04:15<00:00,  3.04s/it]



Dataset Statistics:
Total clips: 144

Clips per class:
Class 0: 78 clips
Class 1: 36 clips
Class 2: 30 clips

Processing complete!
Successfully created 144 clips


In [4]:
!pip install moviepy

Collecting moviepy
  Downloading moviepy-1.0.3.tar.gz (388 kB)
     ---------------------------------------- 0.0/388.3 kB ? eta -:--:--
     -- ---------------------------------- 30.7/388.3 kB 660.6 kB/s eta 0:00:01
     ----------------- -------------------- 174.1/388.3 kB 2.1 MB/s eta 0:00:01
     -------------------------------------- 388.3/388.3 kB 3.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting decorator<5.0,>=4.0.2 (from moviepy)
  Downloading decorator-4.4.2-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.10-py3-none-any.whl.metadata (639 bytes)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl.metadata (1.6 kB)
Downloading decorator-4.4.2-py2.py3-none-any.whl (9.2 kB)
Downloading imageio_ffmpeg-0.5.1-py3-none-win_amd64.whl (22.6 MB)
   ---------------------------------------- 0.0/2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def create_cybersickness_classes(csv_path, output_path=None):
    """
    Read FMS data and create 3 cybersickness severity classes based on distribution.
    
    Args:
        csv_path: Path to input CSV file with FMS column
        output_path: Path to save the modified CSV file (if None, will add '_processed' to input filename)
    
    Returns:
        DataFrame with new 'cs' column
    """
    # Read the CSV file
    df = pd.read_csv(csv_path)
    
    # Ensure FMS column exists
    if 'fms' not in df.columns:
        raise ValueError("Column 'fms' not found in CSV file")
    
    # Calculate distribution statistics using 33rd and 66th percentiles
    p33, p66 = np.percentile(df['fms'], [33.33, 66.67])
    
    # Create cybersickness severity classes
    def assign_class(fms):
        if fms <= p33:
            return 0    # Low
        elif fms <= p66:
            return 1    # Moderate
        else:
            return 2    # High
    
    # Add new column for cybersickness class
    df['cs'] = df['fms'].apply(assign_class)
    
    # Generate distribution analysis
    plt.figure(figsize=(15, 5))
    
    # Plot 1: FMS Distribution
    plt.subplot(1, 2, 1)
    sns.histplot(data=df, x='fms', bins=10)
    plt.axvline(p33, color='r', linestyle='--', label=f'33rd percentile ({p33:.2f})')
    plt.axvline(p66, color='g', linestyle='--', label=f'66th percentile ({p66:.2f})')
    plt.title('FMS Score Distribution')
    plt.xlabel('FMS Score')
    plt.ylabel('Count')
    plt.legend()
    
    # Plot 2: Class Distribution
    plt.subplot(1, 2, 2)
    class_counts = df['cs'].value_counts().sort_index()
    sns.barplot(x=class_counts.index, y=class_counts.values)
    plt.title('Cybersickness Class Distribution')
    plt.xlabel('Class')
    plt.ylabel('Count')
    
    # Add class labels
    for i, count in enumerate(class_counts):
        plt.text(i, count, str(count), ha='center', va='bottom')
    
    plt.tight_layout()
    
    # Save the visualization
    plt.savefig('cybersickness_distribution.png')
    plt.close()
    
    # Print distribution statistics
    print("\nDistribution Statistics:")
    print(f"33rd percentile (Class 0-1 boundary): {p33:.2f}")
    print(f"66th percentile (Class 1-2 boundary): {p66:.2f}")
    
    print("\nClass Definitions:")
    print(f"Class 0 (Low): FMS ≤ {p33:.2f}")
    print(f"Class 1 (Moderate): {p33:.2f} < FMS ≤ {p66:.2f}")
    print(f"Class 2 (High): FMS > {p66:.2f}")
    
    # Additional statistics
    print("\nDetailed Statistics by Class:")
    for class_num in range(3):
        class_data = df[df['cs'] == class_num]['fms']
        print(f"\nClass {class_num}:")
        print(f"  Count: {len(class_data)}")
        print(f"  Mean FMS: {class_data.mean():.2f}")
        print(f"  Min FMS: {class_data.min():.2f}")
        print(f"  Max FMS: {class_data.max():.2f}")
    
    print("\nOverall Class Distribution:")
    print(df['cs'].value_counts().sort_index().to_string())
    
    # Save the modified DataFrame
    if output_path is None:
        output_path = csv_path.rsplit('.', 1)[0] + '_processed.csv'
    df.to_csv(output_path, index=False)
    print(f"\nProcessed CSV saved to: {output_path}")
    
    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'Class': ['Low (0)', 'Moderate (1)', 'High (2)'],
        'FMS_Range': [
            f'≤ {p33:.2f}',
            f'{p33:.2f} - {p66:.2f}',
            f'> {p66:.2f}'
        ],
        'Count': [
            len(df[df['cs'] == 0]),
            len(df[df['cs'] == 1]),
            len(df[df['cs'] == 2])
        ],
        'Percentage': [
            f"{(len(df[df['cs'] == 0])/len(df)*100):.1f}%",
            f"{(len(df[df['cs'] == 1])/len(df)*100):.1f}%",
            f"{(len(df[df['cs'] == 2])/len(df)*100):.1f}%"
        ]
    })
    
    # Save summary to CSV
    summary_path = output_path.rsplit('.', 1)[0] + '_summary.csv'
    summary_df.to_csv(summary_path, index=False)
    print(f"\nSummary statistics saved to: {summary_path}")
    
    return df

def main():
    # Configuration
    INPUT_CSV_PATH = "Cybersickness_Label.csv"
    OUTPUT_CSV_PATH = "cs_labels.csv"  # Optional
    
    # Process the data
    df = create_cybersickness_classes(INPUT_CSV_PATH, OUTPUT_CSV_PATH)
    
    # Display sample of the processed data
    print("\nSample of processed data:")
    print(df[['fms', 'cs']].head(10))

if __name__ == "__main__":
    main()


Distribution Statistics:
33rd percentile (Class 0-1 boundary): 1.00
66th percentile (Class 1-2 boundary): 2.00

Class Definitions:
Class 0 (Low): FMS ≤ 1.00
Class 1 (Moderate): 1.00 < FMS ≤ 2.00
Class 2 (High): FMS > 2.00

Detailed Statistics by Class:

Class 0:
  Count: 16800
  Mean FMS: 1.00
  Min FMS: 1.00
  Max FMS: 1.00

Class 1:
  Count: 3240
  Mean FMS: 2.00
  Min FMS: 2.00
  Max FMS: 2.00

Class 2:
  Count: 6840
  Mean FMS: 4.31
  Min FMS: 3.00
  Max FMS: 7.00

Overall Class Distribution:
cs
0    16800
1     3240
2     6840

Processed CSV saved to: cs_labels.csv

Summary statistics saved to: cs_labels_summary.csv

Sample of processed data:
   fms  cs
0    1   0
1    1   0
2    1   0
3    1   0
4    1   0
5    1   0
6    1   0
7    1   0
8    1   0
9    1   0
