# Data processing

## Step 1: Extract Images

In [None]:
import os
import shutil
import tarfile

In [None]:
# Root folder: the image archives and the split list files are read from here.
data_dir = 'data'
# Folder the image archives are extracted into.
image_output_dir = os.path.join(data_dir, 'images')

In [None]:

# Ensure the output directory exists
os.makedirs(image_output_dir, exist_ok=True)

# Use tarfile's 'data' extraction filter when the running Python provides it.
# The filter refuses archive members that would be written outside
# image_output_dir (absolute paths, '..' components, links pointing outside).
# Interpreters without filter support fall back to the plain extractall call.
extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Extract each of the 12 numbered .tar.gz archives into image_output_dir
for i in range(1, 13):
    tar_path = os.path.join(data_dir, f'images_{i:02}.tar.gz')
    with tarfile.open(tar_path, 'r:gz') as tar:
        tar.extractall(path=image_output_dir, **extract_kwargs)

## Step 2: Load Image Names from Text Files

In [None]:
def load_image_list(file_path):
    """Return the lines of the text file at *file_path* as a list of strings.

    The whole file is read at once and split with ``str.splitlines``, so
    line terminators are not included in the returned entries and an empty
    file yields an empty list.
    """
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents.splitlines()

# Load the train/validation and test image-name lists (one entry per line)
train_val_images = load_image_list(os.path.join(data_dir, 'train_val_list.txt'))
test_images = load_image_list(os.path.join(data_dir, 'test_list.txt'))

# load_image_list already returns a list of plain strings, so the entries are
# used as-is; no per-item string conversion is needed.

# Show how many names each split contains
len(train_val_images), len(test_images)

## Step 3: Move images into the train_val and test folders

In [None]:
# Paths used by the move step: files found in source_dir are sorted into
# train_dir or test_dir according to the loaded split lists.
source_dir = 'data/images/images_001/images'  # Directory for images_001 (sub data)
train_dir = 'data/train_images'  # Destination for training images
test_dir = 'data/test_images'    # Destination for test images

In [None]:
# Make sure both destinations exist as directories before moving anything.
# If a destination is missing, shutil.move() treats it as the target *file*
# path and renames each image onto that one path instead of moving it into
# a folder, so images would overwrite one another.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Build sets once so each membership test is constant-time rather than a
# scan of the whole list for every file in source_dir.
train_val_names = set(train_val_images)
test_names = set(test_images)

# `iterator` counts the directory entries examined so far; it stays 0 when
# source_dir is empty.
iterator = 0
for iterator, img_name in enumerate(os.listdir(source_dir), start=1):
    if img_name in train_val_names:
        shutil.move(os.path.join(source_dir, img_name), train_dir)
    elif img_name in test_names:
        shutil.move(os.path.join(source_dir, img_name), test_dir)
    # Entries that appear in neither list are left where they are.

    # Progress indicator: print the running count every 100 entries.
    if iterator % 100 == 0:
        print(iterator)

print("Images have been successfully moved to their respective directories.")