In [None]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Paths (relative to the directory the script is launched from)
fact_table_path = "images/training/fact_table.csv"
segments_dir = "images/training/segments"
output_dir = "images/training/dataset"

# Create output directories; exist_ok=True keeps re-runs from failing
train_images_dir = os.path.join(output_dir, "train/images")
val_images_dir = os.path.join(output_dir, "val/images")
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(val_images_dir, exist_ok=True)

# Load the fact_table CSV.
# The two columns used below hold literal text, so they are read as plain
# strings with NA detection switched off. With pandas' defaults a label
# such as "NA", "null" or "None" is turned into NaN (and later written out
# as "nan"), and an all-numeric label column is parsed as numbers, which
# drops leading zeros ("007" -> 7).
fact_table = pd.read_csv(
    fact_table_path,
    dtype={"segment_file_path": str, "true_label": str},
    keep_default_na=False,
)

# Filter out rows with "__OMIT__" in the true_label column
fact_table = fact_table[fact_table["true_label"] != "__OMIT__"]

# Split the dataset into 80% train and 20% validation.
# The fixed random_state makes the split reproducible across runs.
train_df, val_df = train_test_split(fact_table, test_size=0.2, random_state=42)

# Function to copy images and write labels.txt
def process_split(df, images_dir, labels_file, source_dir=None):
    """Copy one split's images and write its tab-separated labels file.

    For every row of *df* the image named by ``segment_file_path`` is copied
    from the source directory into *images_dir*, and one line of the form
    ``<segment_file_path>\\t<true_label>`` is written to *labels_file*.

    Args:
        df: DataFrame with ``segment_file_path`` and ``true_label`` columns.
        images_dir: Directory the images are copied into.
        labels_file: Path of the labels file to create (overwritten if it
            already exists).
        source_dir: Directory the images are read from. Defaults to the
            module-level ``segments_dir`` when omitted.

    Raises:
        KeyError: If *df* lacks one of the two required columns.
        FileNotFoundError: If a listed source image does not exist.
    """
    if source_dir is None:
        # Looked up at call time so existing three-argument callers keep
        # reading from the module-level segments directory.
        source_dir = segments_dir

    # UTF-8 is requested explicitly: labels may contain non-ASCII
    # characters, and the platform default encoding is not UTF-8 everywhere.
    with open(labels_file, "w", encoding="utf-8") as f:
        for rel_path, label in zip(df["segment_file_path"], df["true_label"]):
            src_image_path = os.path.join(source_dir, rel_path)
            dest_image_path = os.path.join(images_dir, rel_path)

            # A segment path may include sub-directories; make sure the
            # matching destination folder exists before copying.
            dest_parent = os.path.dirname(dest_image_path)
            if dest_parent:
                os.makedirs(dest_parent, exist_ok=True)

            # Copy the image to the destination directory
            shutil.copy(src_image_path, dest_image_path)

            # Write the label to labels.txt
            f.write(f"{rel_path}\t{label}\n")

# Materialize both splits: training first, then validation. Each split gets
# its images copied and a labels.txt written under its own sub-directory.
for split_name, split_df, split_images_dir in (
    ("train", train_df, train_images_dir),
    ("val", val_df, val_images_dir),
):
    split_labels_path = os.path.join(output_dir, f"{split_name}/labels.txt")
    process_split(split_df, split_images_dir, split_labels_path)

print("Dataset successfully reorganized for EasyOCR training!")