In [None]:
import pandas as pd
import os
import shutil
import random
import numpy as np

In [None]:
# Load the classification spreadsheet into a dataframe
df = pd.read_excel(r"C:\Users\acer\Desktop\Data_0\Classification_1.xlsx")

# Seed both the stdlib and NumPy generators so reruns give the same shuffle
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)

# Shuffle every row, then renumber the index from 0
df = df.sample(frac=1, random_state=random_seed).reset_index(drop=True)

# 70 % of the rows go to train, 15 % to validation, whatever remains to test
total = len(df)
train_split = int(0.7 * total)
val_split = int(0.15 * total)
val_end = train_split + val_split

# Positional slices over the shuffled rows
train_df = df.iloc[:train_split]
val_df = df.iloc[train_split:val_end]
test_df = df.iloc[val_end:]

# One sub-folder per split, each holding that split's <split>_classification.xlsx
dict_df = {'train': train_df, 'validation': val_df, 'test': test_df}
data_path = r"C:\Users\acer\Desktop\Data_2D3layers"
for folder, split_df in dict_df.items():
    folder_path = os.path.join(data_path, folder)
    os.makedirs(folder_path, exist_ok=True)
    file_name = f'{folder}_classification.xlsx'
    excel_path = os.path.join(folder_path, file_name)
    split_df.to_excel(excel_path, index=False)

# The 'ID' values of each split, used below to locate the per-case folders
train_folders, val_folders, test_folders = (
    split['ID'].tolist() for split in (train_df, val_df, test_df)
)

# Copy the per-case folders of one split into that split's directory
def move_folders(folder_list, source_path, destination_path, suffixes=('x', 'y', 'z')):
    """Copy the ``<id>_<suffix>`` folders of every listed ID into a destination.

    Despite the name, nothing is moved: each folder is duplicated with
    ``shutil.copytree`` and the source tree is left untouched.

    Parameters
    ----------
    folder_list : iterable
        IDs whose folders should be copied. Each ID is formatted into the
        folder name with an f-string, so non-string IDs are accepted.
    source_path : str
        Directory that contains the ``<id>_<suffix>`` folders.
    destination_path : str
        Directory that receives the copies; created by ``copytree`` if missing.
    suffixes : iterable of str
        Folder-name suffixes to copy for each ID (default: ``('x', 'y', 'z')``).

    Raises
    ------
    FileNotFoundError
        Propagated from ``shutil.copytree`` when a source folder does not exist.
    """
    for folder in folder_list:
        for suffix in suffixes:
            folder_name = f"{folder}_{suffix}"
            # dirs_exist_ok=True lets a rerun overwrite a previous copy
            # instead of failing on the existing destination folder.
            shutil.copytree(
                os.path.join(source_path, folder_name),
                os.path.join(destination_path, folder_name),
                dirs_exist_ok=True,
            )

# Populate each split directory with its case folders.
# Note: move_folders copies (shutil.copytree), so the source tree stays intact.
source = r"C:\Users\acer\Desktop\Data_0\Nii_toJPG_3layers"
destination_train = r"C:\Users\acer\Desktop\Data_2D3layers\train"
destination_validation = r"C:\Users\acer\Desktop\Data_2D3layers\validation"
destination_test = r"C:\Users\acer\Desktop\Data_2D3layers\test"

# Same order as the splits were produced: train, validation, test
for split_ids, split_destination in (
    (train_folders, destination_train),
    (val_folders, destination_validation),
    (test_folders, destination_test),
):
    move_folders(split_ids, source, split_destination)

print("Folders have been moved and Excel files have been created successfully.")

In [4]:
import pandas as pd
import shutil
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import numpy as np

def create_train_val_test_split(csv_path, input_folder, output_dir, id_column='ID', class_column='Class', 
                                train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, stratify_by_class=True, random_state=42,
                                file_suffix='_adjustedBG.nii.gz'):
    """
    Split dataset into train/validation/test sets and copy files accordingly.

    Rows whose file ``<input_folder>/<ID><file_suffix>`` does not exist are
    reported and dropped before splitting. Files are copied (not moved) to
    ``<output_dir>/{train,val,test}/<class>/`` and the ID/class columns of each
    split are written to ``train_split.csv``, ``val_split.csv`` and
    ``test_split.csv`` inside ``output_dir``.
    
    Parameters:
    -----------
    csv_path : str
        Path to CSV file with ID and class columns
    input_folder : str
        Path to folder containing the input files
    output_dir : str
        Base directory where train/val/test folders will be created
    id_column : str
        Name of the column containing file IDs; the file name is the ID
        followed by ``file_suffix`` (default: 'ID')
    class_column : str
        Name of the column containing class labels (default: 'Class')
    train_ratio : float
        Proportion for training set (default: 0.7)
    val_ratio : float
        Proportion for validation set (default: 0.15)
    test_ratio : float
        Proportion for test set (default: 0.15)
    stratify_by_class : bool
        Whether to maintain class distribution across splits (default: True)
    random_state : int
        Random seed for reproducibility (default: 42)
    file_suffix : str
        Text appended to each ID to form the file name
        (default: '_adjustedBG.nii.gz')

    Returns:
    --------
    dict
        'train_df', 'val_df', 'test_df' : the split dataframes (including the
        helper 'full_path' column), and 'results' : a dict mapping
        'train'/'val'/'test' to a ``(copied_count, missing_or_failed_count)``
        tuple.

    Raises:
    -------
    ValueError
        If any ratio is not strictly between 0 and 1, the ratios do not sum
        to 1.0, a required column is missing, or no input file exists.
    """
    
    # Validate ratios before touching the filesystem. Every split must be
    # non-empty: a zero val+test share would otherwise divide by zero below,
    # and the splitter rejects a test_size of 0 or 1 anyway.
    for ratio_name, ratio in (('train_ratio', train_ratio),
                              ('val_ratio', val_ratio),
                              ('test_ratio', test_ratio)):
        if not 0.0 < ratio < 1.0:
            raise ValueError(f"{ratio_name} must be strictly between 0 and 1, got {ratio}")
    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-6:
        raise ValueError("Train, validation, and test ratios must sum to 1.0")
    
    # Read CSV file
    df = pd.read_csv(csv_path)
    
    # Validate required columns
    if id_column not in df.columns or class_column not in df.columns:
        raise ValueError(f"CSV must contain '{id_column}' and '{class_column}' columns")
    
    # Create full file paths by combining input_folder with filenames
    df['full_path'] = df[id_column].apply(lambda x: os.path.join(input_folder, f"{x}{file_suffix}"))
    
    # Check which files exist (astype keeps the mask boolean even when the CSV has no rows)
    existing_files = df['full_path'].apply(os.path.exists).astype(bool)
    missing_files = df[~existing_files]
    
    if len(missing_files) > 0:
        print(f"Warning: {len(missing_files)} files not found in {input_folder}")
        print("Missing files:")
        for _, row in missing_files.head(5).iterrows():
            print(f"  - {row[id_column]}")
        if len(missing_files) > 5:
            print(f"  ... and {len(missing_files) - 5} more")
        print()
        
        # Remove missing files from dataframe
        df = df[existing_files].reset_index(drop=True)
    
    # Fail early with a clear message instead of an obscure splitter error
    # (and before any output directories are created).
    if df.empty:
        raise ValueError(f"No input files found in {input_folder}; nothing to split")
    
    print(f"Loaded {len(df)} existing files from CSV")
    print(f"Input folder: {input_folder}")
    print(f"ID column: '{id_column}', Class column: '{class_column}'")
    print(f"Class distribution:")
    print(df[class_column].value_counts())
    print()
    
    # Create output directories
    output_path = Path(output_dir)
    train_dir = output_path / 'train'
    val_dir = output_path / 'val'
    test_dir = output_path / 'test'
    
    for dir_path in [train_dir, val_dir, test_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)
        # Create class subdirectories
        for class_name in df[class_column].unique():
            (dir_path / str(class_name)).mkdir(exist_ok=True)
    
    # Prepare stratification
    stratify = df[class_column] if stratify_by_class else None
    
    # First split: separate train from (val + test)
    temp_val_test_ratio = val_ratio + test_ratio
    train_df, val_test_df = train_test_split(
        df, 
        test_size=temp_val_test_ratio,
        stratify=stratify,
        random_state=random_state
    )
    
    # Second split: separate val from test.
    # test_size is relative to the (val + test) remainder, not to the full dataset.
    test_ratio_adjusted = test_ratio / temp_val_test_ratio
    stratify_val_test = val_test_df[class_column] if stratify_by_class else None
    
    val_df, test_df = train_test_split(
        val_test_df,
        test_size=test_ratio_adjusted,
        stratify=stratify_val_test,
        random_state=random_state
    )
    
    def copy_files(df_subset, target_dir, split_name):
        """Copy one split's files into target_dir/<class>/ and return (copied, missing_or_failed)."""
        print(f"Copying {split_name} files...")
        copied_count = 0
        missing_count = 0
        
        for _, row in df_subset.iterrows():
            source_path = row['full_path']
            class_name = row[class_column]
            filename = f"{row[id_column]}{file_suffix}"  # Same name as the source file
            
            # Check if source file exists (should exist since we filtered above)
            if not os.path.exists(source_path):
                print(f"Warning: File not found: {source_path}")
                missing_count += 1
                continue
            
            # Create destination path
            dest_path = target_dir / str(class_name) / filename
            
            # Copy file; a failed copy is reported and counted, not fatal
            try:
                shutil.copy2(source_path, dest_path)
                copied_count += 1
            except OSError as e:
                print(f"Error copying {source_path}: {e}")
                missing_count += 1
        
        print(f"  Copied: {copied_count} files")
        if missing_count > 0:
            print(f"  Missing/Failed: {missing_count} files")
        print()
        
        return copied_count, missing_count
    
    # Copy files to respective directories
    results = {}
    results['train'] = copy_files(train_df, train_dir, 'train')
    results['val'] = copy_files(val_df, val_dir, 'validation')
    results['test'] = copy_files(test_df, test_dir, 'test')
    
    # Print summary
    print("="*50)
    print("SPLIT SUMMARY")
    print("="*50)
    print(f"Total files: {len(df)}")
    print(f"Train: {len(train_df)} ({len(train_df)/len(df)*100:.1f}%)")
    print(f"Val:   {len(val_df)} ({len(val_df)/len(df)*100:.1f}%)")
    print(f"Test:  {len(test_df)} ({len(test_df)/len(df)*100:.1f}%)")
    print()
    
    # Print class distribution for each split
    if stratify_by_class:
        print("Class distribution by split:")
        for split_name, df_subset in [('Train', train_df), ('Val', val_df), ('Test', test_df)]:
            print(f"\n{split_name}:")
            class_counts = df_subset[class_column].value_counts()
            for class_name, count in class_counts.items():
                percentage = count / len(df_subset) * 100
                print(f"  {class_name}: {count} ({percentage:.1f}%)")
    
    # Save split information (only include relevant columns)
    cols_to_save = [id_column, class_column]
    train_df[cols_to_save].to_csv(output_path / 'train_split.csv', index=False)
    val_df[cols_to_save].to_csv(output_path / 'val_split.csv', index=False)
    test_df[cols_to_save].to_csv(output_path / 'test_split.csv', index=False)
    
    print(f"\nSplit CSV files saved in {output_path}")
    print(f"Files organized in: {output_path}")
    
    return {
        'train_df': train_df,
        'val_df': val_df,
        'test_df': test_df,
        'results': results
    }

# Input CSV, source folder and output location for this run
csv_file = r"/tmjoa_3d/data/Classification_1.csv"
input_folder = r"/tmjoa_3d/data/5_adjustedBG"
output_directory = r"/tmjoa_3d/data/training_dataset"

# Split settings: 70 / 20 / 10, stratified on the 'c_erosion' label column
split_options = {
    'id_column': 'ID',            # column holding the file IDs
    'class_column': 'c_erosion',  # column holding the class labels
    'train_ratio': 0.7,
    'val_ratio': 0.2,
    'test_ratio': 0.1,
    'stratify_by_class': True,
    'random_state': 42,
}

results = create_train_val_test_split(
    csv_path=csv_file,
    input_folder=input_folder,
    output_dir=output_directory,
    **split_options,
)

Loaded 364 existing files from CSV
Input folder: /tmjoa_3d/data/5_adjustedBG
ID column: 'ID', Class column: 'c_erosion'
Class distribution:
c_erosion
1    201
0    163
Name: count, dtype: int64

Copying train files...
  Copied: 254 files

Copying validation files...
  Copied: 73 files

Copying test files...
  Copied: 37 files

SPLIT SUMMARY
Total files: 364
Train: 254 (69.8%)
Val:   73 (20.1%)
Test:  37 (10.2%)

Class distribution by split:

Train:
  1: 140 (55.1%)
  0: 114 (44.9%)

Val:
  1: 40 (54.8%)
  0: 33 (45.2%)

Test:
  1: 21 (56.8%)
  0: 16 (43.2%)

Split CSV files saved in /tmjoa_3d/data/training_dataset
Files organized in: /tmjoa_3d/data/training_dataset
