In [1]:
import json

# Path to the original train.json file
train_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/train.json'

# Load the train.json file.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale; the questions in this dataset include non-ASCII (Chinese) text.
with open(train_json_path, 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Display the first few entries to verify the content
print(train_data[:3])  # Adjust the number of entries to display based on your need


[{'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'What modality is used to take this image?', 'answer': 'MRI', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Modality', 'triple': ['vhead', '_', '_'], 'qid': 0}, {'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'Which part of the body does this image belong to?', 'answer': 'Abdomen', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Position', 'triple': ['vhead', '_', '_'], 'qid': 1}, {'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'What is the mr weighting in this image?', 'answer': 'T2', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Modality', 'triple': ['vhead', '_', '_'], 'qid': 2}]


In [2]:
import os

def load_detection_json(img_folder, base_dir='/Users/jeffreysherer/Dissertation/data/imgs-1'):
    """Load the ``detection.json`` stored inside one image folder.

    Args:
        img_folder: Name of the image folder (e.g. ``'xmlab1'``), relative to
            ``base_dir``.
        base_dir: Directory that contains the image folders. Defaults to the
            location this notebook originally hard-coded, so existing
            one-argument calls behave as before.

    Returns:
        The parsed JSON content of ``<base_dir>/<img_folder>/detection.json``,
        or ``None`` when that file does not exist.
    """
    detection_json_path = os.path.join(base_dir, img_folder, 'detection.json')

    # Folders without a detection file are reported as None
    if not os.path.exists(detection_json_path):
        return None

    # Pin the encoding so parsing does not depend on the platform locale
    with open(detection_json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Smoke-test the loader on one known image folder
example_folder = 'xmlab1'
example_detection = load_detection_json(example_folder)

# Show what was loaded (None would mean the folder has no detection.json)
print(example_detection)


[{'Liver': [54.0, 106.0, 30.0, 31.0]}]


In [3]:
# Function to add detection data to train_data entries
def integrate_detection_data(train_data):
    """Attach detection data to every entry whose image folder provides some.

    The folder is taken from the part of ``item['img_name']`` before the first
    ``'/'``. Entries are modified in place by adding a ``'detection'`` key, and
    the same list object is returned.

    Several entries can point at the same image folder, so each folder's
    detection file is loaded once and reused. Entries from the same folder
    therefore share one detection object.

    Args:
        train_data: List of dataset entries (dicts with an ``'img_name'`` key).

    Returns:
        ``train_data`` itself, after the in-place update.
    """
    detections_by_folder = {}

    for item in train_data:
        # Extract the folder name from img_name
        folder_name = item['img_name'].split('/')[0]

        # Load detection data for the folder only the first time it is seen
        if folder_name not in detections_by_folder:
            detections_by_folder[folder_name] = load_detection_json(folder_name)
        detection_data = detections_by_folder[folder_name]

        # Missing (None) or empty detection data is not attached
        if detection_data:
            item['detection'] = detection_data

    return train_data

# Attach detection information to the train entries
# (integrate_detection_data returns the same list it was given)
updated_train_data = integrate_detection_data(train_data)

# Preview the first entries to confirm the 'detection' key is present
print(updated_train_data[:3])


[{'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'What modality is used to take this image?', 'answer': 'MRI', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Modality', 'triple': ['vhead', '_', '_'], 'qid': 0, 'detection': [{'Liver': [54.0, 106.0, 30.0, 31.0]}]}, {'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'Which part of the body does this image belong to?', 'answer': 'Abdomen', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Position', 'triple': ['vhead', '_', '_'], 'qid': 1, 'detection': [{'Liver': [54.0, 106.0, 30.0, 31.0]}]}, {'img_id': 1, 'img_name': 'xmlab1/source.jpg', 'question': 'What is the mr weighting in this image?', 'answer': 'T2', 'q_lang': 'en', 'location': 'Abdomen', 'modality': 'MRI', 'answer_type': 'OPEN', 'base_type': 'vqa', 'content_type': 'Modality', 'triple': ['vhead', '_', '_'], 'qid': 2, 'detectio

In [5]:
import os
import json
import uuid  # To generate unique IDs

# Base path where the original train.json is located
base_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0'

# Directory for the augmented files
augmented_dir = os.path.join(base_path, 'augmented')

# Create the augmented directory if it doesn't exist
os.makedirs(augmented_dir, exist_ok=True)

# Directory containing the image folders with detection.json files
image_directory = '/Users/jeffreysherer/Dissertation/data/imgs-1'

# Path to the original train.json file
train_json_path = os.path.join(base_path, 'train.json')

# Load the train.json file.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale; the questions in this dataset include non-ASCII (Chinese) text.
with open(train_json_path, 'r', encoding='utf-8') as file:
    train_data = json.load(file)

# Function to update the data with bounding box coordinates
def update_data_with_bounding_boxes(data, image_dir=None):
    """Add each entry's ``detection.json`` content under a ``'detection'`` key.

    The detection file is looked up at
    ``<image_dir>/<dirname of entry['img_name']>/detection.json``. Entries are
    modified in place and the same list object is returned. Entries whose
    folder has no detection file are left untouched.

    Several entries can refer to the same image folder, so each folder's file
    is read once and reused. Entries from the same folder therefore share one
    detection object.

    Args:
        data: List of dataset entries (dicts with an ``'img_name'`` key).
        image_dir: Directory containing the image folders. When omitted, the
            notebook-level ``image_directory`` variable is used, as before.

    Returns:
        ``data`` itself, after the in-place update.
    """
    if image_dir is None:
        image_dir = image_directory

    # folder -> (file_found, parsed_content)
    detections_by_folder = {}

    for entry in data:
        img_folder = os.path.dirname(entry['img_name'])

        if img_folder not in detections_by_folder:
            detection_json_path = os.path.join(image_dir, img_folder, 'detection.json')
            if os.path.exists(detection_json_path):
                # Pin the encoding so parsing does not depend on the locale
                with open(detection_json_path, 'r', encoding='utf-8') as file:
                    detections_by_folder[img_folder] = (True, json.load(file))
            else:
                detections_by_folder[img_folder] = (False, None)

        file_found, detection_data = detections_by_folder[img_folder]
        if file_found:
            entry['detection'] = detection_data

    return data

# Attach bounding-box detections to the train entries (the entries are modified in place)
updated_train_data = update_data_with_bounding_boxes(train_data)

# Function to transform the data to the required format
def transform_data(data):
    """Convert dataset entries into conversation-style records.

    Each output record has a freshly generated UUID string under ``"id"``, the
    entry's ``img_name`` under ``"image"``, and a two-turn ``"conversations"``
    list (human question prefixed with ``<image>`` and a newline, then the gpt
    answer). An entry's ``'detection'`` value, when present, is copied over.

    Args:
        data: Iterable of dicts with ``'img_name'``, ``'question'`` and
            ``'answer'`` keys, optionally ``'detection'``.

    Returns:
        A new list with one record per input entry, in input order.
    """
    def _to_record(source):
        record_id = str(uuid.uuid4())  # random, so ids differ between runs
        image_ref = source['img_name']
        prompt = source['question']
        reply = source['answer']

        human_turn = {"from": "human", "value": f"<image>\n{prompt}"}
        gpt_turn = {"from": "gpt", "value": reply}
        record = {
            "id": record_id,
            "image": image_ref,
            "conversations": [human_turn, gpt_turn],
        }
        if 'detection' in source:
            record['detection'] = source['detection']
        return record

    return [_to_record(source) for source in data]

# Convert the updated train entries to the conversation format
transformed_train_data = transform_data(updated_train_data)

# Destination of the final train file inside the augmented directory
final_train_json_path = os.path.join(augmented_dir, 'final_train.json')

# Serialize the transformed records with 4-space indentation
with open(final_train_json_path, 'w') as out_file:
    json.dump(transformed_train_data, out_file, indent=4)

print("Final train.json has been saved to", final_train_json_path)


Final train.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_train.json


In [6]:
import json
import uuid  # To generate unique IDs

# Requires 'updated_train_data' from an earlier cell.
# Note: this rebuilds the conversation-format records with newly generated ids
# and writes them to the same final_train.json path used above.

# Build one conversation-style record per entry
transformed_data = []
for entry in updated_train_data:
    record_id = str(uuid.uuid4())  # random id, different on every run
    image_ref = entry['img_name']
    prompt = entry['question']
    reply = entry['answer']

    human_turn = {"from": "human", "value": f"<image>\n{prompt}"}
    gpt_turn = {"from": "gpt", "value": reply}
    record = {
        "id": record_id,
        "image": image_ref,
        "conversations": [human_turn, gpt_turn],
    }
    # Carry the bounding boxes over when the entry has them
    if 'detection' in entry:
        record['detection'] = entry['detection']
    transformed_data.append(record)

# Destination of the final transformed JSON file
final_train_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_train.json'

# Serialize the records with 4-space indentation
with open(final_train_json_path, 'w') as out_file:
    json.dump(transformed_data, out_file, indent=4)

print("Final train.json has been saved to", final_train_json_path)


Final train.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_train.json


In [7]:
import json
import random

# Path to the final transformed train.json file
final_train_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_train.json'

# Load the final transformed train.json file (encoding pinned so that
# parsing does not depend on the platform locale)
with open(final_train_json_path, 'r', encoding='utf-8') as file:
    transformed_data = json.load(file)

# Randomly sample up to 5 entries from the dataset to inspect.
# random.sample raises ValueError when asked for more items than exist,
# so the sample size is capped at the dataset size.
sample_size = min(5, len(transformed_data))
sampled_entries = random.sample(transformed_data, sample_size)

# Print the sampled entries; ensure_ascii=False keeps non-ASCII question text
# readable instead of showing \uXXXX escapes
for entry in sampled_entries:
    print(json.dumps(entry, indent=4, ensure_ascii=False))


{
    "id": "a13bafb5-1129-4df1-a2f6-68cef4831a5f",
    "image": "xmlab157/source.jpg",
    "conversations": [
        {
            "from": "human",
            "value": "<image>\n\u8fd9\u5f20\u56fe\u7247\u7684\u6210\u50cf\u65b9\u5f0f\u662f\u4ec0\u4e48?"
        },
        {
            "from": "gpt",
            "value": "X-Ray"
        }
    ],
    "detection": [
        {
            "Pneumonia": [
                113.0,
                200.0,
                227.0,
                381.0
            ]
        }
    ]
}
{
    "id": "22088a0c-9c49-45e2-903d-6522b8e4734d",
    "image": "xmlab141/source.jpg",
    "conversations": [
        {
            "from": "human",
            "value": "<image>\nWhere is/are the abnormality located?"
        },
        {
            "from": "gpt",
            "value": "Right Lung, Left"
        }
    ],
    "detection": [
        {
            "Mass": [
                290.0,
                326.0,
                153.0,
                198.0
    

Now doing the test and validate JSON files


In [8]:
import json
import uuid  # To generate unique IDs
import os

# Paths to the original validate.json and test.json files
validate_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/validate.json'
test_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/test.json'

# Directory containing the image folders with detection.json files
image_directory = '/Users/jeffreysherer/Dissertation/data/imgs-1'

# Load the validate.json file (encoding pinned to UTF-8 so decoding does not
# depend on the platform locale)
with open(validate_json_path, 'r', encoding='utf-8') as file:
    validate_data = json.load(file)

# Load the test.json file
with open(test_json_path, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Function to update the data with bounding box coordinates
def update_data_with_bounding_boxes(data, image_dir=None):
    """Add each entry's ``detection.json`` content under a ``'detection'`` key.

    The detection file is looked up at
    ``<image_dir>/<dirname of entry['img_name']>/detection.json``. Entries are
    modified in place and the same list object is returned. Entries whose
    folder has no detection file are left untouched.

    Several entries can refer to the same image folder, so each folder's file
    is read once and reused. Entries from the same folder therefore share one
    detection object.

    Args:
        data: List of dataset entries (dicts with an ``'img_name'`` key).
        image_dir: Directory containing the image folders. When omitted, the
            notebook-level ``image_directory`` variable is used, as before.

    Returns:
        ``data`` itself, after the in-place update.
    """
    if image_dir is None:
        image_dir = image_directory

    # folder -> (file_found, parsed_content)
    detections_by_folder = {}

    for entry in data:
        img_folder = os.path.dirname(entry['img_name'])

        if img_folder not in detections_by_folder:
            detection_json_path = os.path.join(image_dir, img_folder, 'detection.json')
            if os.path.exists(detection_json_path):
                # Pin the encoding so parsing does not depend on the locale
                with open(detection_json_path, 'r', encoding='utf-8') as file:
                    detections_by_folder[img_folder] = (True, json.load(file))
            else:
                detections_by_folder[img_folder] = (False, None)

        file_found, detection_data = detections_by_folder[img_folder]
        if file_found:
            entry['detection'] = detection_data

    return data

# Function to transform the data to the required format
def transform_data(data):
    """Convert dataset entries into conversation-style records.

    Each output record has a freshly generated UUID string under ``"id"``, the
    entry's ``img_name`` under ``"image"``, and a two-turn ``"conversations"``
    list (human question prefixed with ``<image>`` and a newline, then the gpt
    answer). An entry's ``'detection'`` value, when present, is copied over.

    Args:
        data: Iterable of dicts with ``'img_name'``, ``'question'`` and
            ``'answer'`` keys, optionally ``'detection'``.

    Returns:
        A new list with one record per input entry, in input order.
    """
    records = []
    for source in data:
        record_id = str(uuid.uuid4())  # random, so ids differ between runs
        image_ref = source['img_name']
        prompt = source['question']
        reply = source['answer']

        record = dict(
            id=record_id,
            image=image_ref,
            conversations=[
                {"from": "human", "value": f"<image>\n{prompt}"},
                {"from": "gpt", "value": reply},
            ],
        )
        if 'detection' in source:
            record['detection'] = source['detection']
        records.append(record)
    return records

# Attach bounding boxes to both splits (entries are modified in place)
updated_validate_data = update_data_with_bounding_boxes(validate_data)
updated_test_data = update_data_with_bounding_boxes(test_data)

# Convert both splits to the conversation format
transformed_validate_data = transform_data(updated_validate_data)
transformed_test_data = transform_data(updated_test_data)

# Preview the first three records of each split
for preview_records in (transformed_validate_data, transformed_test_data):
    print(json.dumps(preview_records[:3], indent=4))

# Destinations of the final transformed JSON files
final_validate_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_validate.json'
final_test_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_test.json'

# Write each split to its own file: validate first, then test
outputs = (
    (final_validate_json_path, transformed_validate_data),
    (final_test_json_path, transformed_test_data),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

print("Final validate.json has been saved to", final_validate_json_path)
print("Final test.json has been saved to", final_test_json_path)


[
    {
        "id": "29a5c627-dfa3-4e30-ab3c-c088a78b9a7a",
        "image": "xmlab0/source.jpg",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\nWhat modality is used to take this image?"
            },
            {
                "from": "gpt",
                "value": "MRI"
            }
        ],
        "detection": [
            {
                "Liver": [
                    33.0,
                    67.0,
                    98.0,
                    108.0
                ]
            },
            {
                "Left Kidney": [
                    148.0,
                    142.0,
                    46.0,
                    32.0
                ]
            },
            {
                "Right Kidney": [
                    65.0,
                    138.0,
                    41.0,
                    37.0
                ]
            },
            {
                "Spleen": [
                   

In [9]:
import os
import json
import uuid  # To generate unique IDs

# Paths to the original validate.json and test.json files
validate_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/validate.json'
test_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/test.json'

# Directory containing the image folders with detection.json files
image_directory = '/Users/jeffreysherer/Dissertation/data/imgs-1'

# Load the validate.json file (encoding pinned to UTF-8 so decoding does not
# depend on the platform locale)
with open(validate_json_path, 'r', encoding='utf-8') as file:
    validate_data = json.load(file)

# Load the test.json file
with open(test_json_path, 'r', encoding='utf-8') as file:
    test_data = json.load(file)

# Function to update and transform the data with bounding box coordinates
def transform_data_with_bounding_boxes(data, image_dir=None):
    """Build conversation-style records and attach bounding boxes in one pass.

    For every entry a record is created with a freshly generated UUID string
    (``"id"``), the entry's ``img_name`` (``"image"``) and a two-turn
    ``"conversations"`` list. The detection file is looked up at
    ``<image_dir>/<dirname of img_name>/detection.json``; when it exists and
    its content is non-empty, that content is stored under ``'detection'``.
    The input entries themselves are not modified.

    Several entries can refer to the same image folder, so each folder's file
    is read once and reused. Records from the same folder therefore share one
    detection object.

    Args:
        data: Iterable of dicts with ``'img_name'``, ``'question'`` and
            ``'answer'`` keys.
        image_dir: Directory containing the image folders. When omitted, the
            notebook-level ``image_directory`` variable is used, as before.

    Returns:
        A new list with one record per input entry, in input order.
    """
    if image_dir is None:
        image_dir = image_directory

    # folder -> parsed detection content, or None when the file is missing
    detections_by_folder = {}

    transformed_data = []
    for entry in data:
        unique_id = str(uuid.uuid4())  # Generate a unique ID for each entry
        img_path = entry['img_name']  # Use the img_name as the image path
        question = entry['question']
        answer = entry['answer']

        # Extract the folder name from img_name to locate detection.json
        img_folder = os.path.dirname(img_path)
        if img_folder not in detections_by_folder:
            detection_json_path = os.path.join(image_dir, img_folder, 'detection.json')
            loaded = None
            if os.path.exists(detection_json_path):
                # Pin the encoding so parsing does not depend on the locale
                with open(detection_json_path, 'r', encoding='utf-8') as file:
                    loaded = json.load(file)
            detections_by_folder[img_folder] = loaded
        detection_data = detections_by_folder[img_folder]

        transformed_entry = {
            "id": unique_id,
            "image": img_path,
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{question}"
                },
                {
                    "from": "gpt",
                    "value": answer
                }
            ]
        }

        # Missing or empty detection data is not attached
        if detection_data:
            transformed_entry['detection'] = detection_data

        transformed_data.append(transformed_entry)

    return transformed_data

# Build conversation-format records (with bounding boxes) for both splits
transformed_validate_data = transform_data_with_bounding_boxes(validate_data)
transformed_test_data = transform_data_with_bounding_boxes(test_data)

# Preview the first three records of each split
for preview_records in (transformed_validate_data, transformed_test_data):
    print(json.dumps(preview_records[:3], indent=4))

# Destinations of the final transformed JSON files
final_validate_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_validate.json'
final_test_json_path = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/final_test.json'

# Write each split to its own file: validate first, then test
outputs = (
    (final_validate_json_path, transformed_validate_data),
    (final_test_json_path, transformed_test_data),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

print("Final validate.json has been saved to", final_validate_json_path)
print("Final test.json has been saved to", final_test_json_path)


[
    {
        "id": "4dd4703e-2e6f-4b98-aef4-69e4bd3adf0e",
        "image": "xmlab0/source.jpg",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\nWhat modality is used to take this image?"
            },
            {
                "from": "gpt",
                "value": "MRI"
            }
        ],
        "detection": [
            {
                "Liver": [
                    33.0,
                    67.0,
                    98.0,
                    108.0
                ]
            },
            {
                "Left Kidney": [
                    148.0,
                    142.0,
                    46.0,
                    32.0
                ]
            },
            {
                "Right Kidney": [
                    65.0,
                    138.0,
                    41.0,
                    37.0
                ]
            },
            {
                "Spleen": [
                   

In [10]:
# Output locations for the validate/test entries that had bounding boxes added
# (requires 'updated_validate_data' and 'updated_test_data' from an earlier cell)
augmented_folder = '/Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented'
updated_validate_json_path = os.path.join(augmented_folder, 'BBF_validate.json')
updated_test_json_path = os.path.join(augmented_folder, 'BBF_test.json')

# Make sure the output folder is there before writing
os.makedirs(augmented_folder, exist_ok=True)

# Write each split to its own file: validate first, then test
outputs = (
    (updated_validate_json_path, updated_validate_data),
    (updated_test_json_path, updated_test_data),
)
for out_path, payload in outputs:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

print("Updated validate.json has been saved to", updated_validate_json_path)
print("Updated test.json has been saved to", updated_test_json_path)


Updated validate.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/BBF_validate.json
Updated test.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/BBF_test.json


In [11]:
import os
import json
import uuid  # To generate unique IDs

# Function to transform the data with detection coordinates after the answer
def transform_data_with_bounding_boxes_last(data, image_dir=None):
    """Build conversation-style records with ``'detection'`` as the last key.

    For every entry a record is created with a freshly generated UUID string
    (``"id"``), the entry's ``img_name`` (``"image"``) and a two-turn
    ``"conversations"`` list. The detection file is looked up at
    ``<image_dir>/<dirname of img_name>/detection.json``; when it exists and
    its content is non-empty, that content is stored under ``'detection'``,
    which is inserted after ``"conversations"``. The input entries themselves
    are not modified, and any ``'detection'`` key they already carry is ignored.

    Several entries can refer to the same image folder, so each folder's file
    is read once and reused. Records from the same folder therefore share one
    detection object.

    NOTE(review): the record layout is the same as the one produced by
    ``transform_data_with_bounding_boxes``. If "after the answer" was meant to
    put the coordinates inside the gpt answer text, that is not what this
    function does -- confirm the intent.

    Args:
        data: Iterable of dicts with ``'img_name'``, ``'question'`` and
            ``'answer'`` keys.
        image_dir: Directory containing the image folders. When omitted, the
            notebook-level ``image_directory`` variable is used, as before.

    Returns:
        A new list with one record per input entry, in input order.
    """
    if image_dir is None:
        image_dir = image_directory

    # folder -> parsed detection content, or None when the file is missing
    detections_by_folder = {}

    transformed_data = []
    for entry in data:
        unique_id = str(uuid.uuid4())  # Generate a unique ID for each entry
        img_path = entry['img_name']  # Use the img_name as the image path
        question = entry['question']
        answer = entry['answer']

        # Extract the folder name from img_name to locate detection.json
        img_folder = os.path.dirname(img_path)
        if img_folder not in detections_by_folder:
            detection_json_path = os.path.join(image_dir, img_folder, 'detection.json')
            loaded = None
            if os.path.exists(detection_json_path):
                # Pin the encoding so parsing does not depend on the locale
                with open(detection_json_path, 'r', encoding='utf-8') as file:
                    loaded = json.load(file)
            detections_by_folder[img_folder] = loaded
        detection_data = detections_by_folder[img_folder]

        transformed_entry = {
            "id": unique_id,
            "image": img_path,
            "conversations": [
                {
                    "from": "human",
                    "value": f"<image>\n{question}"
                },
                {
                    "from": "gpt",
                    "value": answer
                }
            ]
        }

        # Add detection data as the last key; missing or empty data is skipped
        if detection_data:
            transformed_entry['detection'] = detection_data

        transformed_data.append(transformed_entry)

    return transformed_data

# Build the "detection last" records for all three splits
# (requires 'updated_train_data', 'updated_validate_data', 'updated_test_data'
# and 'augmented_dir' from earlier cells)
transformed_train_data_bbl = transform_data_with_bounding_boxes_last(updated_train_data)
transformed_validate_data_bbl = transform_data_with_bounding_boxes_last(updated_validate_data)
transformed_test_data_bbl = transform_data_with_bounding_boxes_last(updated_test_data)

# Destinations of the BBL JSON files
final_train_json_path_bbl = os.path.join(augmented_dir, 'BBL_train.json')
final_validate_json_path_bbl = os.path.join(augmented_dir, 'BBL_validate.json')
final_test_json_path_bbl = os.path.join(augmented_dir, 'BBL_test.json')

# Write each split to its own file: train, then validate, then test
outputs_bbl = (
    (final_train_json_path_bbl, transformed_train_data_bbl),
    (final_validate_json_path_bbl, transformed_validate_data_bbl),
    (final_test_json_path_bbl, transformed_test_data_bbl),
)
for out_path, payload in outputs_bbl:
    with open(out_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

print("BBL_train.json has been saved to", final_train_json_path_bbl)
print("BBL_validate.json has been saved to", final_validate_json_path_bbl)
print("BBL_test.json has been saved to", final_test_json_path_bbl)


BBL_train.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/BBL_train.json
BBL_validate.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/BBL_validate.json
BBL_test.json has been saved to /Users/jeffreysherer/Desktop/Dissertation/Slake1.0/augmented/BBL_test.json
