## Here we prepare the COCO dataset for generating the concept set

In [1]:
# Location of the COCO-style annotation JSON that the cells below read from.
dataset_path = "/ds/images/coco_2014/dataset_coco.json"

In [7]:
import json
import os

def get_coco_snipet(json_path, index=1, train=True):
    """
    Write one image entry of a COCO-style JSON file to a small standalone JSON file.

    The entries under the top-level "images" key are filtered by their "split"
    field ("train" when ``train`` is true, "val" otherwise) and the ``index``-th
    match, counted from 1, is kept. The snippet also carries over the source
    file's "info", "licenses" and "categories" values (empty defaults when a key
    is absent) and is written to the current working directory as
    ``coco_snippet_<train|validation>_index_<index>.json``.

    Problems are reported with ``print`` instead of being raised: a file that
    cannot be found, invalid JSON, an out-of-range index, or any other exception.

    Args:
        json_path (str): Path to the COCO JSON file.
        index (int): 1-based position of the wanted sample within the split.
        train (bool): Select the "train" split if true, the "val" split otherwise.

    Returns:
        None: The result is the file written to disk.
    """
    try:
        with open(json_path, 'r') as source:
            coco = json.load(source)

        # "val" is the value matched in the data; "validation" is only used in
        # the messages and in the output file name.
        split_key = "train" if train else "val"
        split_label = 'train' if train else 'validation'

        # Keep only the image entries that belong to the requested split.
        candidates = []
        for entry in coco.get("images", []):
            if entry.get("split") == split_key:
                candidates.append(entry)

        # Positions are 1-based, so valid values run from 1 to len(candidates).
        if index < 1 or index > len(candidates):
            raise IndexError(f"Index {index} is out of range for the '{split_label}' dataset.")

        # Assemble the snippet: the chosen entry first, then the dataset-level
        # sections with an empty default for each one that is missing.
        snippet = {"images": [candidates[index - 1]]}
        for key, fallback in (("info", {}), ("licenses", []), ("categories", [])):
            snippet[key] = coco.get(key, fallback)

        output_file = f"coco_snippet_{split_label}_index_{index}.json"
        with open(output_file, 'w') as sink:
            json.dump(snippet, sink, indent=4)

        print(f"Snippet saved to {output_file}")

    except FileNotFoundError:
        print(f"File not found: {json_path}")
    except json.JSONDecodeError:
        print(f"Failed to parse JSON file: {json_path}")
    except Exception as e:
        # Also reached by the IndexError raised above for an out-of-range index.
        print(f"An error occurred: {e}")

# Example usage
# get_coco_snipet('path/to/coco.json', index=1, train=True)


In [8]:
# Extract the first "train" image entry of the dataset; on success the snippet is
# written to coco_snippet_train_index_1.json in the current working directory.
get_coco_snipet(dataset_path, index=1, train=True)

Snippet saved to coco_snippet_train_index_1.json


## Patch processing


In [16]:
import os
import json
from PIL import Image

# Function to split an image into an n x n grid of patches
def split_image(image, n):
    """
    Cut ``image`` into ``n * n`` equally sized rectangular patches.

    The patch size is ``(width // n, height // n)``. Because of the floor
    division, any pixels left over on the right or bottom edge (when a dimension
    is not a multiple of ``n``) are not part of any patch.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``crop((left, upper, right, lower))`` method, e.g. a PIL image.
        n (int): Number of patches per row and per column.

    Returns:
        list: The cropped patches in row-major order (left to right within a
        row, rows from top to bottom).
    """
    img_w, img_h = image.size
    tile_w, tile_h = img_w // n, img_h // n

    # Outer loop walks the rows, inner loop the columns, giving row-major order.
    return [
        image.crop((col * tile_w, row * tile_h, col * tile_w + tile_w, row * tile_h + tile_h))
        for row in range(n)
        for col in range(n)
    ]

# Function to save patches and update JSON
def save_patches_and_update_json(json_data, root_dir, output_dir, n):
    """
    Split the first image listed in ``json_data`` into an ``n`` x ``n`` grid of
    patches, save every patch as a ``.jpg`` file and describe the patches in
    ``json_data``.

    Only ``json_data['images'][0]`` is processed. The source image is read from
    ``root_dir/<filepath>/<filename>``. Patch ``k`` (numbered from 1, in the
    order returned by ``split_image``) is saved to
    ``output_dir/<stem>_patch_<k>.jpg``, where ``<stem>`` is the source filename
    without its final extension.

    Each patch gets a shallow copy of the source entry with these fields replaced:
      * ``filename``  -> the patch file name
      * ``split``     -> the original split with ``_patch`` appended
      * ``filepath``  -> the last path component of ``output_dir``
      * ``imgid``     -> the string ``"<original imgid>_<k>"``
      * ``sentences`` -> a new list holding a shallow copy of every sentence
        (their contents, including any ``sentid``, are left unchanged)

    Args:
        json_data (dict): Must provide ``images`` as a non-empty list whose
            first entry has ``filepath``, ``filename``, ``split``, ``imgid`` and
            ``sentences`` keys.
        root_dir (str): Directory that ``filepath`` is relative to.
        output_dir (str): Existing directory the patch files are written to.
        n (int): Grid size handed to ``split_image``.

    Returns:
        dict: The same ``json_data`` object, modified in place so that
        ``images`` lists the patch entries instead of the source entry.
    """
    image_data = json_data['images'][0]
    img_filename = image_data['filename']
    image_path = os.path.join(root_dir, image_data['filepath'], img_filename)

    # splitext strips only the final extension, so a stem that itself contains
    # dots (e.g. "scan.v2.jpg") is kept whole and patch names cannot collide.
    patch_root = os.path.splitext(img_filename)[0]

    # normpath removes a trailing separator; without it "dir/" would produce an
    # empty last component.
    patch_dir_name = os.path.basename(os.path.normpath(output_dir))

    new_images = []
    # The context manager closes the source image file once all patches are saved.
    with Image.open(image_path) as image:
        for i, patch in enumerate(split_image(image, n), start=1):
            patch_filename = f"{patch_root}_patch_{i}.jpg"

            # Save the patch image
            patch.save(os.path.join(output_dir, patch_filename))

            # Create the JSON entry for this patch from a copy of the source entry
            new_image_data = image_data.copy()
            new_image_data['filename'] = patch_filename
            new_image_data['split'] = f"{image_data['split']}_patch"
            new_image_data['filepath'] = patch_dir_name
            new_image_data['imgid'] = f"{image_data['imgid']}_{i}"
            # Clone the sentences so the patch entries do not share sentence objects
            new_image_data['sentences'] = [sent.copy() for sent in image_data['sentences']]

            new_images.append(new_image_data)

    # Replace the source entry with the patch entries and hand the same dict back
    json_data['images'] = new_images
    return json_data



In [17]:
# Load the single-image snippet JSON.
# NOTE(review): absolute, user-specific path; presumably the file written by
# get_coco_snipet in an earlier cell - confirm it matches that cell's working directory.
json_path = "/home/kadir/xl-vlms/playground/coco_snippet_train_index_1.json"
with open(json_path, 'r') as f:
    json_data = json.load(f)

# root_dir is the base the image's 'filepath'/'filename' are joined onto;
# output_dir receives the patch image files.
root_dir = "/ds/images/coco_2014"
output_dir = "/ds/images/xai_vision/train_patches"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Grid size per side: the image is cut into n x n patches
n = 10  # 10 x 10 grid, i.e. 100 patches

# Process the image and update JSON.
# The function modifies json_data in place and returns it, so updated_json_data
# refers to the same object as json_data after this call.
updated_json_data = save_patches_and_update_json(json_data, root_dir, output_dir, n)

# Save the updated JSON data (one entry per patch)
updated_json_path = "/ds/images/xai_vision/patches_json_file.json"
with open(updated_json_path, 'w') as f:
    json.dump(updated_json_data, f, indent=4)

print(f"Processed {n}x{n} patches, saved them in {output_dir}, and updated the JSON file.")


Processed 10x10 patches, saved them in /ds/images/xai_vision/train_patches, and updated the JSON file.
