In [1]:
# !wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-0.tar.gz

Important — check out the VGT document-understanding project: https://github.com/AlibabaResearch/AdvancedLiterateMachinery/tree/main/DocumentUnderstanding/VGT

In [2]:
# !wget https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/labels.tar.gz

In [3]:
# importing prerequisites
import sys
import requests
import tarfile
import json
import numpy as np
from os import path
from PIL import Image
from PIL import ImageFont, ImageDraw
from glob import glob
from matplotlib import pyplot as plt
%matplotlib inline

In [4]:
# # Extracting the dataset
# tar = tarfile.open('train-0.tar.gz')
# tar.extractall()
# tar.close()

In [5]:
# !rm -rf train-0.tar.gz

In [6]:
# # Extracting the labels
# tar = tarfile.open('labels.tar.gz')
# tar.extractall()
# tar.close()

In [7]:
# !rm -rf labels.tar.gz

In [8]:
import os

# Folder holding the PubLayNet training images inside the Kaggle input dataset.
directory = "/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train"

# os.listdir is not recursive: this counts every entry directly inside the
# folder (regular files and sub-directories alike).
dir_entries = os.listdir(directory)
num_files = len(dir_entries)

# Report the count together with the folder it refers to.
print(f"Number of files in '{directory}': {num_files}")

Number of files in '/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train': 47958


In [9]:
# Sanity check: confirm that the PubLayNet training annotation file (the
# COCO-style JSON consumed by the filtering cell below) is present in the
# Kaggle input dataset. `path` is `os.path`, imported in the prerequisites cell.
data_path = "/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/labels/publaynet/train.json"
# Last expression of the cell, so the notebook displays the boolean result.
path.exists(data_path)

True

In [10]:
import os
import json
import gc

# Paths
coco_json_path = "/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/labels/publaynet/train.json"
images_dir = "/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train"
filtered_json_path = "/kaggle/working/filtered_publaynet.json"  # Output JSON

# File names actually present on disk. Only the train-0 shard is available, so
# most entries in train.json refer to images we do not have. A set gives
# constant-time membership tests in the loop below.
available_images = set(os.listdir(images_dir))

# NOTE: json.load materialises the whole annotation file in memory; nothing
# here is streamed. The large objects are released explicitly at the end of the
# cell instead.
with open(coco_json_path, "r") as f:
    coco_data = json.load(f)

# Keep only images that exist on disk and renumber them 0..N-1.
# image_id_map translates an original COCO image id to its new dense id.
filtered_images = []
image_id_map = {}
new_image_id = 0

for img in coco_data["images"]:
    if img["file_name"] in available_images:
        image_id_map[img["id"]] = new_image_id  # Re-map image ID
        img["id"] = new_image_id  # mutates the loaded record in place
        filtered_images.append(img)
        new_image_id += 1

# Keep only annotations whose image survived, rewriting image_id to the new
# numbering. The data is already in memory, so the file is not opened again.
filtered_annotations = []
for ann in coco_data["annotations"]:
    if ann["image_id"] in image_id_map:
        ann["image_id"] = image_id_map[ann["image_id"]]
        filtered_annotations.append(ann)

# Assemble the reduced COCO document; categories are passed through unchanged.
filtered_coco_data = {
    "images": filtered_images,
    "annotations": filtered_annotations,
    "categories": coco_data["categories"]
}

# indent=4 keeps the file human-readable at the cost of a larger file on disk.
with open(filtered_json_path, "w") as f:
    json.dump(filtered_coco_data, f, indent=4)

# Free up memory: drop every reference to the large structures so the garbage
# collector can reclaim them before the next cell loads the filtered file.
del coco_data  # Remove the large JSON object
del filtered_coco_data
del filtered_images
del filtered_annotations
del image_id_map

# Force garbage collection
gc.collect()

print(f"Filtered dataset saved to {filtered_json_path}")

Filtered dataset saved to /kaggle/working/filtered_publaynet.json


In [11]:
import os
import json
import shutil
from collections import defaultdict
from tqdm import tqdm

# Paths
coco_json_path = "/kaggle/working/filtered_publaynet.json"
images_dir = "/kaggle/input/documnet-layout-recognition-dataset-publaynet-t0/train-0/publaynet/train"
output_dir = "/kaggle/working/yolo_dataset"

# Create YOLO directories
os.makedirs(os.path.join(output_dir, "labels"), exist_ok=True)
os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)

# Class mapping (Only keeping "table" and "figure")
coco_to_yolo = {4: 0, 5: 1}  # Table -> 0, Figure -> 1

# Load COCO JSON
with open(coco_json_path, "r") as f:
    coco_data = json.load(f)

# image id -> (file name, width, height); the size is needed to normalise boxes.
image_map = {img["id"]: (img["file_name"], img["width"], img["height"]) for img in coco_data["images"]}

# Pass 1: convert every kept annotation to a YOLO label line, grouped by image.
# Grouping first means each label file is written exactly once (so re-running
# the cell cannot append duplicate lines) and each image is copied exactly
# once instead of once per annotation.
yolo_lines = defaultdict(list)

for ann in tqdm(coco_data["annotations"], desc="Processing annotations"):
    category_id = ann["category_id"]
    if category_id not in coco_to_yolo:
        continue  # Skip other classes

    image_id = ann["image_id"]
    if image_id not in image_map:
        continue  # Skip annotations with missing images

    image_name, img_width, img_height = image_map[image_id]

    # COCO bbox is [x_min, y_min, width, height] in pixels; YOLO wants
    # [x_center, y_center, width, height] as fractions of the image size.
    x, y, w, h = ann["bbox"]
    x_center = (x + w / 2) / img_width
    y_center = (y + h / 2) / img_height
    w = w / img_width
    h = h / img_height

    yolo_lines[image_name].append(
        f"{coco_to_yolo[category_id]} {x_center:.6f} {y_center:.6f} {w:.6f} {h:.6f}\n"
    )

# Pass 2: one label file and one image copy per image that has at least one
# table/figure annotation. shutil.copy replaces the former `cp` shell call: no
# shell quoting issues, and a failed copy raises instead of passing silently.
for image_name, lines in tqdm(yolo_lines.items(), desc="Writing labels and copying images"):
    label_path = os.path.join(output_dir, "labels", image_name.replace(".jpg", ".txt"))
    with open(label_path, "w") as f:
        f.writelines(lines)

    shutil.copy(
        os.path.join(images_dir, image_name),
        os.path.join(output_dir, "images", image_name),
    )

print("Conversion completed! YOLO dataset is ready.")

Processing annotations: 100%|██████████| 465838/465838 [05:01<00:00, 1545.84it/s]

Conversion completed! YOLO dataset is ready.





In [12]:
import os
import shutil
import random

# Set paths
dataset_path = "/kaggle/working/yolo_dataset"
train_path = os.path.join(dataset_path, "train_images")
val_path = os.path.join(dataset_path, "val_images")

# Fixed seed so the train/val membership is identical on every fresh run.
SPLIT_SEED = 42


def _move_split(image_names, split_dir):
    """Move each image, and its label file when one exists, into ``split_dir``.

    Images land directly in ``split_dir``; labels land in ``split_dir/labels``.
    Sources are ``<dataset_path>/images`` and ``<dataset_path>/labels``.
    """
    for img in image_names:
        shutil.move(
            os.path.join(dataset_path, "images", img),
            os.path.join(split_dir, img)
        )
        label_file = img.replace(".jpg", ".txt")
        label_src = os.path.join(dataset_path, "labels", label_file)
        if os.path.exists(label_src):
            shutil.move(label_src, os.path.join(split_dir, "labels", label_file))


# Create directories if they don't exist
os.makedirs(os.path.join(train_path, "labels"), exist_ok=True)
os.makedirs(os.path.join(val_path, "labels"), exist_ok=True)

# os.listdir returns entries in arbitrary order, so sort before the seeded
# shuffle; otherwise the seed alone would not make the split reproducible.
all_images = sorted(f for f in os.listdir(os.path.join(dataset_path, "images")) if f.endswith(".jpg"))
random.Random(SPLIT_SEED).shuffle(all_images)

# Split ratio
split_ratio = 0.8
train_count = int(len(all_images) * split_ratio)
train_images = all_images[:train_count]
val_images = all_images[train_count:]

# Move images and their corresponding labels
_move_split(train_images, train_path)
_move_split(val_images, val_path)

print("Dataset split completed with images and labels!")

Dataset split completed with images and labels!


In [13]:
!pip install --upgrade ultralytics ray[tune]

Collecting ultralytics
  Downloading ultralytics-8.3.73-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading ultralytics-8.3.73-py3-none-any.whl (914 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m914.6/914.6 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, tensorboardX, ultralytics
Successfully installed tensorboardX-2.6.2.2 ultralytics-8.3.73 ultralytics-thop-2.0.14


In [14]:
!pip install -U ray[tune]



In [15]:
import yaml
import os

def write_yaml_file_robust(data, filepath):
    """Dump ``data`` to ``filepath`` as YAML, creating the parent folder first.

    This is a best-effort helper: problems are reported with ``print`` rather
    than raised, and the function returns ``None`` in every case.

    Args:
        data: object handed to ``yaml.dump``.
        filepath: destination path of the YAML file.
    """
    parent_dir = os.path.dirname(filepath)

    # A bare file name has no directory component, so there is nothing to create.
    if parent_dir:
        try:
            # exist_ok=True makes an already-existing folder a non-event.
            os.makedirs(parent_dir, exist_ok=True)
            print(f"Directory '{parent_dir}' exists or was successfully created.")
        except OSError as dir_error:
            print(f"Error creating directory '{parent_dir}': {dir_error}")
            return  # without the folder the write below cannot succeed

    try:
        with open(filepath, 'w') as yaml_file:
            yaml.dump(data, yaml_file)
        print(f"YAML file successfully written to: {filepath}")
    except Exception as write_error:
        # Deliberately broad: any failure while opening or serialising is
        # reported instead of interrupting the notebook.
        print(f"Error writing YAML file: {write_error}")



# Dataset description later passed to the trainer via `data=`: `path` is the
# dataset root, and `train` / `val` are folders relative to it.
yaml_data = dict(
    path="/kaggle/working/yolo_dataset",   # Base path
    train="train_images",                  # Training images directory
    val="val_images",                      # Validation images directory
    nc=2,                                  # Number of classes
    names=["table", "figure"],             # Class names, indexed by class id
)

# Destination of the dataset description file.
output_filepath = "/kaggle/working/dataset.yaml"
write_yaml_file_robust(yaml_data, output_filepath)

Directory '/kaggle/working' exists or was successfully created.
YAML file successfully written to: /kaggle/working/dataset.yaml


In [16]:
# import os

# # Create directories
# os.makedirs('/kaggle/working/yolo_dataset/train_images/labels', exist_ok=True)
# os.makedirs('/kaggle/working/yolo_dataset/val_images/labels', exist_ok=True)

# # Import shutil for file operations
# import shutil
# import glob

# # Move files to train_images/labels
# for file in glob.glob('/kaggle/working/yolo_dataset/labels/*.txt'):
#     shutil.move(file, '/kaggle/working/yolo_dataset/train_images/labels/')

# # Move files to val_images/labels
# for file in glob.glob('/kaggle/working/yolo_dataset/labels/*.txt'):
#     shutil.move(file, '/kaggle/working/yolo_dataset/val_images/labels/')

In [17]:
import os
import shutil
import glob

# 1. First, let's clean up and create fresh directories
def setup_fresh_directories():
    """Delete stale ``*.cache`` files and make sure the split folders exist.

    Existing folders and their contents are left untouched; only missing
    folders are created.

    Returns:
        str: the dataset root, ``/kaggle/working/yolo_dataset``.
    """
    base_dir = "/kaggle/working/yolo_dataset"

    # Only cache files sitting directly in the dataset root are removed.
    for stale_cache in glob.glob(os.path.join(base_dir, "*.cache")):
        os.remove(stale_cache)

    # Each split gets its own folder plus a nested `labels` folder.
    for split_name in ("train_images", "val_images"):
        split_dir = os.path.join(base_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        os.makedirs(os.path.join(split_dir, "labels"), exist_ok=True)

    return base_dir

# 2. Move files to correct locations
def reorganize_files(base_dir="/kaggle/working/yolo_dataset"):
    """Copy each split image's label file into that split's ``labels`` folder.

    For every ``*.jpg`` directly inside ``<base_dir>/train_images`` and
    ``<base_dir>/val_images``, the label with the matching name is looked up in
    ``<base_dir>/labels`` and, when present, copied (not moved) to
    ``<base_dir>/<split>/labels``. Images without a label there are skipped.

    Args:
        base_dir: dataset root. Defaults to the notebook's working dataset, so
            existing zero-argument calls behave as before.
    """
    source_labels_dir = os.path.join(base_dir, "labels")

    # Both splits follow the same rule, so one loop serves train and val.
    for split_name in ("train_images", "val_images"):
        split_dir = os.path.join(base_dir, split_name)
        split_labels_dir = os.path.join(split_dir, "labels")
        os.makedirs(split_labels_dir, exist_ok=True)

        for img_file in os.listdir(split_dir):
            if not img_file.endswith('.jpg'):
                continue  # also skips the nested `labels` directory entry

            label_file = img_file.replace('.jpg', '.txt')

            # Check if label exists in original labels directory
            orig_label_path = os.path.join(source_labels_dir, label_file)
            if os.path.exists(orig_label_path):
                shutil.copy2(
                    orig_label_path,
                    os.path.join(split_labels_dir, label_file)
                )

# 3. Verify the dataset structure
def verify_dataset(base_dir="/kaggle/working/yolo_dataset"):
    """Print image/label counts per split and the content of one train label.

    Counts ``*.jpg`` files directly inside ``<base_dir>/train_images`` and
    ``<base_dir>/val_images`` and ``*.txt`` files inside each split's
    ``labels`` folder. When at least one train label exists, the first one
    returned by ``glob`` is printed as a format sample.

    Args:
        base_dir: dataset root. Defaults to the notebook's working dataset, so
            existing zero-argument calls behave as before.
    """
    train_imgs = len(glob.glob(os.path.join(base_dir, "train_images", "*.jpg")))
    # Keep the list itself: it is reused for the sample, so the directory is
    # scanned only once.
    train_label_paths = glob.glob(os.path.join(base_dir, "train_images", "labels", "*.txt"))
    train_labels = len(train_label_paths)
    val_imgs = len(glob.glob(os.path.join(base_dir, "val_images", "*.jpg")))
    val_labels = len(glob.glob(os.path.join(base_dir, "val_images", "labels", "*.txt")))

    print(f"Training: {train_imgs} images, {train_labels} labels")
    print(f"Validation: {val_imgs} images, {val_labels} labels")

    # Check label format
    if train_label_paths:
        sample_label = train_label_paths[0]
        print("\nSample label content:")
        with open(sample_label, 'r') as f:
            print(f.read().strip())

# 4. Update yaml file
def update_yaml():
    """Rewrite ``/kaggle/working/dataset.yaml`` with the two-class dataset description.

    The file is overwritten on every call. ``default_flow_style=False`` makes
    the ``names`` list come out in block style (one ``- item`` per line).

    Relies on ``yaml`` having been imported by an earlier cell; this cell's own
    imports do not include it.
    """
    destination = "/kaggle/working/dataset.yaml"
    yaml_content = dict(
        path="/kaggle/working/yolo_dataset",
        train="train_images",
        val="val_images",
        nc=2,
        names=["table", "figure"],
    )

    with open(destination, 'w') as yaml_file:
        yaml.dump(yaml_content, yaml_file, default_flow_style=False)

# Run the four preparation steps in their required order: create folders,
# copy labels into place, print a summary, then rewrite the dataset YAML.
for pipeline_step in (
    setup_fresh_directories,
    reorganize_files,
    verify_dataset,
    update_yaml,
):
    pipeline_step()

Training: 18896 images, 18896 labels
Validation: 4725 images, 4725 labels

Sample label content:
0 0.519094 0.239037 0.862584 0.269257


In [18]:
# import os
# import shutil
# import random

# # Reset the directory structure
# def reset_and_setup_directories():
#     base_dir = "/kaggle/working/yolo_dataset"
#     train_dir = os.path.join(base_dir, "train_images")
#     val_dir = os.path.join(base_dir, "val_images")
    
#     # Create fresh directories
#     for dir_path in [
#         os.path.join(train_dir, "labels"),
#         os.path.join(val_dir, "labels")
#     ]:
#         os.makedirs(dir_path, exist_ok=True)
    
#     return base_dir, train_dir, val_dir

# # Split dataset
# def split_dataset():
#     base_dir = "/kaggle/working/yolo_dataset"
#     images_dir = os.path.join(base_dir, "images")
#     labels_dir = os.path.join(base_dir, "labels")
    
#     # Get all image files
#     image_files = [f for f in os.listdir(images_dir) if f.endswith('.jpg')]
#     random.shuffle(image_files)
    
#     # Split ratio
#     split_ratio = 0.8
#     split_idx = int(len(image_files) * split_ratio)
    
#     train_images = image_files[:split_idx]
#     val_images = image_files[split_idx:]
    
#     # Move files
#     for img_file in train_images:
#         label_file = img_file.replace('.jpg', '.txt')
#         # Move image
#         shutil.copy2(
#             os.path.join(images_dir, img_file),
#             os.path.join(base_dir, "train_images", img_file)
#         )
#         # Move label if exists
#         if os.path.exists(os.path.join(labels_dir, label_file)):
#             shutil.copy2(
#                 os.path.join(labels_dir, label_file),
#                 os.path.join(base_dir, "train_images", "labels", label_file)
#             )
    
#     for img_file in val_images:
#         label_file = img_file.replace('.jpg', '.txt')
#         # Move image
#         shutil.copy2(
#             os.path.join(images_dir, img_file),
#             os.path.join(base_dir, "val_images", img_file)
#         )
#         # Move label if exists
#         if os.path.exists(os.path.join(labels_dir, label_file)):
#             shutil.copy2(
#                 os.path.join(labels_dir, label_file),
#                 os.path.join(base_dir, "val_images", "labels", label_file)
#             )

# # Execute the reorganization
# reset_and_setup_directories()
# split_dataset()

# # Verify the split
# def verify_split():
#     train_images = len(os.listdir('/kaggle/working/yolo_dataset/train_images'))
#     train_labels = len(os.listdir('/kaggle/working/yolo_dataset/train_images/labels'))
#     val_images = len(os.listdir('/kaggle/working/yolo_dataset/val_images'))
#     val_labels = len(os.listdir('/kaggle/working/yolo_dataset/val_images/labels'))
    
#     print(f"Training set: {train_images} images, {train_labels} labels")
#     print(f"Validation set: {val_images} images, {val_labels} labels")

# verify_split()

In [19]:
import os
import glob

# Remove leftover *.cache files from the dataset root before inspecting it.
stale_cache_files = glob.glob("/kaggle/working/yolo_dataset/*.cache")
for cache_file in stale_cache_files:
    os.remove(cache_file)

def check_dataset_structure():
    """Print, for each split, how many images and labels exist and where.

    Four glob patterns are evaluated under ``/kaggle/working/yolo_dataset``
    (train/val images and train/val labels). Each is reported as
    ``<title> (<count>): <pattern>``. The first train label and the first val
    label returned by ``glob`` are then printed as format samples, when any
    exist.
    """
    base_dir = "/kaggle/working/yolo_dataset"

    # (title, path components below base_dir) in reporting order.
    layout = (
        ("Train Images", ("train_images", "*.jpg")),
        ("Train Labels", ("train_images", "labels", "*.txt")),
        ("Val Images", ("val_images", "*.jpg")),
        ("Val Labels", ("val_images", "labels", "*.txt")),
    )

    # Resolve every pattern up front so all counts are taken before printing.
    report = []
    for title, parts in layout:
        pattern = os.path.join(base_dir, *parts)
        report.append((title, pattern, glob.glob(pattern)))

    print("Directory structure:")
    for title, pattern, matches in report:
        print(f"{title} ({len(matches)}): {pattern}")

    # Check label format of a few files
    train_labels = report[1][2]
    val_labels = report[3][2]
    samples = (
        ("\nSample train label content:", train_labels),
        ("\nSample val label content:", val_labels),
    )
    for heading, label_paths in samples:
        if label_paths:
            print(heading)
            with open(label_paths[0], 'r') as label_file:
                print(label_file.read().strip())

check_dataset_structure()

# Let's also check if images and labels match
def verify_image_label_pairs():
    """Report how many images in each split have a label with the same base name.

    Image and label files are compared by file name without extension. Images
    are taken from ``<split>/*.jpg`` and labels from ``<split>/labels/*.txt``
    under ``/kaggle/working/yolo_dataset``.
    """
    base_dir = "/kaggle/working/yolo_dataset"

    def _stems(*pattern_parts):
        """Return the extension-less base names of files matching the pattern."""
        matches = glob.glob(os.path.join(base_dir, *pattern_parts))
        return {os.path.splitext(os.path.basename(match))[0] for match in matches}

    # Check train set
    train_images = _stems("train_images", "*.jpg")
    train_labels = _stems("train_images", "labels", "*.txt")

    # Check val set
    val_images = _stems("val_images", "*.jpg")
    val_labels = _stems("val_images", "labels", "*.txt")

    train_matched = train_images & train_labels
    val_matched = val_images & val_labels

    print("\nMatching analysis:")
    print(f"Train: {len(train_matched)}/{len(train_images)} images have matching labels")
    print(f"Val: {len(val_matched)}/{len(val_images)} images have matching labels")

verify_image_label_pairs()

# Show the dataset description exactly as it is currently stored on disk.
print("\nDataset YAML content:")
with open("/kaggle/working/dataset.yaml", 'r') as yaml_file:
    yaml_text = yaml_file.read()
print(yaml_text)

Directory structure:
Train Images (18896): /kaggle/working/yolo_dataset/train_images/*.jpg
Train Labels (18896): /kaggle/working/yolo_dataset/train_images/labels/*.txt
Val Images (4725): /kaggle/working/yolo_dataset/val_images/*.jpg
Val Labels (4725): /kaggle/working/yolo_dataset/val_images/labels/*.txt

Sample train label content:
0 0.519094 0.239037 0.862584 0.269257

Sample val label content:
0 0.499320 0.483881 0.808674 0.716865

Matching analysis:
Train: 18896/18896 images have matching labels
Val: 4725/4725 images have matching labels

Dataset YAML content:
names:
- table
- figure
nc: 2
path: /kaggle/working/yolo_dataset
train: train_images
val: val_images



In [20]:
import shutil

def fix_directory_structure():
    """Move every label file up one level so it sits beside its image.

    All ``*.txt`` files in ``train_images/labels`` and ``val_images/labels``
    (under ``/kaggle/working/yolo_dataset``) are moved into ``train_images``
    and ``val_images`` respectively. The dataset YAML is then rewritten with
    the same two-class description.

    After this runs the ``labels`` sub-folders no longer contain ``*.txt``
    files, so the label counts printed by the earlier verification cells no
    longer describe the on-disk layout.

    NOTE(review): presumably done because the trainer resolves a label path
    from the image path and, with no ``/images/`` component in it, expects the
    ``.txt`` next to the ``.jpg`` — confirm against the Ultralytics docs.

    Relies on ``os``, ``glob`` and ``yaml`` having been imported by earlier
    cells; this cell itself imports only ``shutil``.
    """
    base_dir = "/kaggle/working/yolo_dataset"

    # Collect both listings before moving anything.
    train_labels = glob.glob(os.path.join(base_dir, "train_images", "labels", "*.txt"))
    val_labels = glob.glob(os.path.join(base_dir, "val_images", "labels", "*.txt"))

    # Train labels first, then val labels, each into the root of its split.
    for split_name, label_paths in (("train_images", train_labels), ("val_images", val_labels)):
        split_root = os.path.join(base_dir, split_name)
        for label_path in label_paths:
            shutil.move(label_path, os.path.join(split_root, os.path.basename(label_path)))

    # Update yaml file
    yaml_data = dict(
        path="/kaggle/working/yolo_dataset",
        train="train_images",
        val="val_images",
        nc=2,
        names=["table", "figure"],
    )

    with open("/kaggle/working/dataset.yaml", 'w') as yaml_file:
        yaml.dump(yaml_data, yaml_file, default_flow_style=False)

fix_directory_structure()

In [23]:
!pip install grpcio==1.62.2

Collecting grpcio==1.62.2
  Downloading grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Downloading grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: grpcio
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.68.1
    Uninstalling grpcio-1.68.1:
      Successfully uninstalled grpcio-1.68.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.10.0 requires tensorflow==2.17.0, but you have tensorflow 2.17.1 which is incompatible.[0m[31m
[0mSuccessfully installed grpcio-1.62.2


In [25]:
!pip install --upgrade ray

Note: you may need to restart the kernel to use updated packages.


In [None]:
# !rm -rf runs/
from ultralytics import YOLO

# Load a model (starting weights for training)
# model = YOLO('yolov8n.pt')
model = YOLO('yolo11n.pt')

# Train the model without Ray Tune.
# The dataset description is referenced by the absolute path it was written
# to earlier, so training does not depend on the kernel's working directory.
model.train(data='/kaggle/working/dataset.yaml', epochs=50, imgsz=640)

In [None]:
from ultralytics import YOLO

# Load the trained model
model = YOLO('runs/detect/train/weights/best.pt')  # Path to your trained model weights

# Perform prediction on an image; detections below conf=0.5 are discarded and
# save=True writes the annotated image to disk.
results = model.predict(source='path/to/your/image.jpg', save=True, conf=0.5)  # Replace with your image path

# Extract and print bounding box coordinates
for result in results:
    for box in result.boxes:
        # Corner coordinates (xmin, ymin, xmax, ymax)
        xmin, ymin, xmax, ymax = box.xyxy[0].tolist()
        print(f"Bounding Box Coordinates: xmin={xmin}, ymin={ymin}, xmax={xmax}, ymax={ymax}")

        # The class comes back as a float tensor element; convert it to an
        # int so it can be used to look up the human-readable class name.
        class_id = int(box.cls[0].item())
        class_name = result.names[class_id]
        confidence = box.conf[0].item()
        print(f"Class ID: {class_id} ({class_name}), Confidence: {confidence}")

# The output image with bounding boxes will be saved in the 'runs/detect/predict' folder