In [1]:
import requests
import json
import os

# URL pointing directly to the raw JSON data
url = 'https://huggingface.co/datasets/BoKelvin/SLAKE/raw/main/train.json'

# Send the GET request. The timeout bounds how long the cell can block on an
# unresponsive server; without it requests waits indefinitely.
response = requests.get(url, timeout=30)

# Only a 200 response carries the dataset; anything else is reported and skipped
if response.status_code == 200:
    try:
        # Decode the response body as JSON (raises ValueError on malformed content)
        data = response.json()

        # Define the desired directory and filename
        writable_directory = '/Users/jeffreysherer/Desktop/DissertationCode'
        filename = os.path.join(writable_directory, 'train.json')

        # Create the directory if it does not exist
        os.makedirs(writable_directory, exist_ok=True)

        # Write JSON data to a file (pretty-printed with 4-space indentation)
        with open(filename, 'w') as file:
            json.dump(data, file, indent=4)

        print(f"Data downloaded and saved successfully to {filename}!")

    except ValueError as e:
        print("Failed to decode JSON:", e)
    except FileNotFoundError as fnf_error:
        print(f"Directory not found: {fnf_error}")
    except OSError as os_error:
        print(f"OS error: {os_error}")
else:
    print("Failed to download data. Status code:", response.status_code)


Data downloaded and saved successfully to /Users/jeffreysherer/Desktop/DissertationCode/train.json!


In [2]:
import json

def find_keys(obj, keys_set):
    """Collect every dictionary key found anywhere inside a decoded JSON value.

    Dictionaries contribute their keys and are searched through their values;
    lists are searched element by element; any other value is ignored.
    Keys are added to ``keys_set`` in place and nothing is returned.
    """
    if isinstance(obj, list):
        for element in obj:
            find_keys(element, keys_set)
        return
    if not isinstance(obj, dict):
        return
    for name in obj:
        keys_set.add(name)
        find_keys(obj[name], keys_set)

def main():
    """Print each distinct key that occurs anywhere in the JSON file at the fixed path below.

    A missing file, undecodable JSON, or any other error is reported with a
    printed message instead of being raised.
    """
    filename = '/tmp/train.json'  # Corrected path to the JSON file

    try:
        # Parse the whole file into Python objects
        with open(filename, 'r') as handle:
            parsed = json.load(handle)

        # Gather the keys from every nesting level into one set
        collected = set()
        find_keys(parsed, collected)

        # Report them, one per line (set order, so not sorted)
        print("Unique keys in the JSON file:")
        for name in collected:
            print(name)

    except FileNotFoundError:
        print(f"Error: The file {filename} does not exist.")
    except json.JSONDecodeError:
        print("Error: Failed to decode JSON from the file.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Entry point: call main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()


Unique keys in the JSON file:
img_id
img_name
modality
base_type
content_type
q_lang
answer
question
triple
answer_type
location
qid


In [3]:
import os
import json
import shutil

def has_numerical_values(data):
    """Report whether decoded JSON data contains at least one number.

    Dictionaries are searched through their values and lists through their
    items, depth first; the search stops at the first number found. Each
    level on the path to that number prints a "Found numerical value" line.

    Booleans are not counted as numbers: ``bool`` is a subclass of ``int`` in
    Python, so without the explicit check JSON ``true``/``false`` would be
    reported as numerical values.

    Returns:
        True if an int or float (excluding bool) occurs anywhere in ``data``,
        otherwise False.
    """
    if isinstance(data, bool):
        # Must be tested before the (int, float) check below.
        return False
    if isinstance(data, (int, float)):
        print(f"Found numerical value: {data}")
        return True
    if isinstance(data, dict):
        for k, v in data.items():
            if has_numerical_values(v):
                print(f"Found numerical value in key '{k}': {v}")
                return True
    elif isinstance(data, list):
        for index, item in enumerate(data):
            if has_numerical_values(item):
                print(f"Found numerical value in list at index {index}: {item}")
                return True
    return False

def process_instance(src_path, numerical_dir, non_numerical_dir):
    """Copy one instance folder into the directory matching its detection.json.

    The folder is copied (the source is left in place) under ``numerical_dir``
    when its ``detection.json`` contains any numerical value according to
    ``has_numerical_values``, and under ``non_numerical_dir`` otherwise.

    Args:
        src_path: Path of the instance folder to classify and copy.
        numerical_dir: Destination root for instances with numerical values.
        non_numerical_dir: Destination root for all other instances.

    Returns:
        None. A folder without ``detection.json``, or with one that is not
        valid JSON, is reported with a printed message and not copied.
    """
    json_file_path = os.path.join(src_path, 'detection.json')
    if not os.path.isfile(json_file_path):
        print(f"No detection.json file found in {src_path}")
        return

    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
    except json.JSONDecodeError:
        print(f"Failed to decode JSON in file {json_file_path}")
        return

    # Determine the destination directory
    dest_path = numerical_dir if has_numerical_values(data) else non_numerical_dir

    # normpath drops a trailing separator, which would otherwise make
    # basename() return '' and merge the instance into dest_path itself.
    instance_name = os.path.basename(os.path.normpath(src_path))
    dest_instance_path = os.path.join(dest_path, instance_name)

    # Recursively copy files and subdirectories; missing parent directories
    # are created, and an existing destination is merged into so the cell can be re-run.
    shutil.copytree(src_path, dest_instance_path, dirs_exist_ok=True)

    # The source folder is kept, so this is a copy rather than a move.
    print(f"Copied {src_path} to {dest_instance_path}")


# Paths: the source instance folders and the two destination roots that
# process_instance chooses between.
imgs_dir = '/Users/jeffreysherer/Desktop/DissertationCode/imgs'
numerical_dir = '/Users/jeffreysherer/Desktop/DissertationCode/BB+Mask'
non_numerical_dir = '/Users/jeffreysherer/Desktop/DissertationCode/NoBB+Mask'

# Process every subdirectory of imgs_dir; plain files at the top level are ignored
for instance_folder in os.listdir(imgs_dir):
    src_path = os.path.join(imgs_dir, instance_folder)
    if os.path.isdir(src_path):
        process_instance(src_path, numerical_dir, non_numerical_dir)


Found numerical value: 49.0
Found numerical value in list at index 0: 49.0
Found numerical value in key 'Liver': [49.0, 74.0, 81.0, 96.0]
Found numerical value in list at index 0: {'Liver': [49.0, 74.0, 81.0, 96.0]}
Moved /Users/jeffreysherer/Desktop/DissertationCode/imgs/xmlab29 to /Users/jeffreysherer/Desktop/DissertationCode/BB+Mask/xmlab29
Found numerical value: 576.0
Found numerical value in list at index 0: 576.0
Found numerical value in key 'Atelectasis': [576.0, 490.0, 240.0, 222.0]
Found numerical value in list at index 0: {'Atelectasis': [576.0, 490.0, 240.0, 222.0]}
Moved /Users/jeffreysherer/Desktop/DissertationCode/imgs/xmlab198 to /Users/jeffreysherer/Desktop/DissertationCode/BB+Mask/xmlab198
Found numerical value: 124.0
Found numerical value in list at index 0: 124.0
Found numerical value in key 'Brain Edema': [124.0, 98.0, 45.0, 68.0]
Found numerical value in list at index 0: {'Brain Edema': [124.0, 98.0, 45.0, 68.0]}
Moved /Users/jeffreysherer/Desktop/DissertationCode/

In [4]:
import os

def count_directories(path):
    """Return how many immediate entries of ``path`` are directories."""
    total = 0
    for name in os.listdir(path):
        if os.path.isdir(os.path.join(path, name)):
            total += 1
    return total

# Path to the 'NoBB+Mask' directory
nobb_mask_dir = '/Users/jeffreysherer/Desktop/DissertationCode/NoBB+Mask'

# Count the immediate subdirectories of 'NoBB+Mask' (files are not counted)
nobb_mask_count = count_directories(nobb_mask_dir)

# Print the count
print(f"Total directories in NoBB+Mask: {nobb_mask_count}")


Total directories in NoBB+Mask: 62


In [5]:
import os
import shutil

# Define source, bounding_box, and mask directories
source_dir = '/Users/jeffreysherer/Desktop/DissertationCode/BB+Mask'
bounding_box_dir = '/Users/jeffreysherer/Desktop/DissertationCode/bounding_box'
mask_dir = '/Users/jeffreysherer/Desktop/DissertationCode/mask'

# Ensure the bounding box and mask directories exist
os.makedirs(bounding_box_dir, exist_ok=True)
os.makedirs(mask_dir, exist_ok=True)

# Instance numbers routed to bounding_box: 120-199 and 295-393 (inclusive).
# Every other instance goes to mask.
bounding_box_indices = set(range(120, 200)) | set(range(295, 394))

# Process each subdirectory in the source directory
for subdir in os.listdir(source_dir):
    subdir_path = os.path.join(source_dir, subdir)
    subdir_index_str = subdir[len('xmlab'):]

    # Only real instance folders named exactly 'xmlab<number>' are handled.
    # Filtering the digits out of an arbitrary name would turn e.g.
    # 'xmlab12_copy3' into 123 and would also move stray files.
    if not (subdir.startswith('xmlab')
            and subdir_index_str.isdecimal()
            and os.path.isdir(subdir_path)):
        continue

    subdir_index = int(subdir_index_str)

    if subdir_index in bounding_box_indices:
        # Move to bounding box directory
        target_subdir_path = os.path.join(bounding_box_dir, subdir)
    else:
        # Move to mask directory
        target_subdir_path = os.path.join(mask_dir, subdir)

    # shutil.move into an existing directory nests the source inside it
    # (…/xmlab29/xmlab29), so an already-organized instance is left alone.
    if os.path.exists(target_subdir_path):
        print(f"Skipped {subdir}: {target_subdir_path} already exists")
        continue

    # Move the folder
    shutil.move(subdir_path, target_subdir_path)
    print(f"Moved {subdir} to {target_subdir_path}")

print("Files have been organized into bounding box and mask folders.")


Moved xmlab29 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab29
Moved xmlab198 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab198
Moved xmlab508 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab508
Moved xmlab395 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab395
Moved xmlab16 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab16
Moved xmlab537 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab537
Moved xmlab361 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab361
Moved xmlab153 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab153
Moved xmlab359 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab359
Moved xmlab154 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab154
Moved xmlab366 to /Users/jeffreysherer/Desktop/DissertationCode/bounding_box/xmlab366
Moved xmlab530 to /Users/jeffreysherer/Desktop/DissertationCode/mask/xmlab530
Moved xmlab11 to /Us

Count the folders in the mask and bounding_box directories

In [6]:
import os

def count_folders(directory):
    """Return the number of immediate subdirectories of ``directory``.

    When ``directory`` does not exist, a message is printed and 0 is returned.
    """
    if os.path.exists(directory):
        children = (os.path.join(directory, name) for name in os.listdir(directory))
        return len([child for child in children if os.path.isdir(child)])
    print(f"Directory not found: {directory}")
    return 0

# Define the paths to the bounding box and mask directories
bounding_box_dir = '/Users/jeffreysherer/Desktop/DissertationCode/bounding_box'
mask_dir = '/Users/jeffreysherer/Desktop/DissertationCode/mask'

# Count the immediate subdirectories in each (count_folders returns 0 for a missing directory)
bounding_box_count = count_folders(bounding_box_dir)
mask_count = count_folders(mask_dir)

# Print the counts
print(f"Number of folders in bounding box directory: {bounding_box_count}")
print(f"Number of folders in mask directory: {mask_count}")


Number of folders in bounding box directory: 179
Number of folders in mask directory: 401


Check that all expected bounding-box folders are present

In [7]:
import os

# Define the path to the bounding_box directory
bounding_box_dir = '/Users/jeffreysherer/Desktop/DissertationCode/bounding_box'

# Instance numbers expected in bounding_box: 120-199 and 295-393 (inclusive)
ranges_to_check = list(range(120, 200)) + list(range(295, 394))

# Build the folder name that belongs to one instance number
def generate_folder_name(num):
    """Return 'xmlab' followed by the default string form of ``num``."""
    return "xmlab" + format(num)

# Collect the name of every expected folder that is not in bounding_box_dir
missing_folders = [
    generate_folder_name(num)
    for num in ranges_to_check
    if not os.path.exists(os.path.join(bounding_box_dir, generate_folder_name(num)))
]

# Report the findings
if not missing_folders:
    print("All folders from xmlab120 to xmlab199 and xmlab295 to xmlab393 are present in the bounding box directory.")
else:
    print("The following folders are missing:")
    for folder in missing_folders:
        print(folder)


All folders from xmlab120 to xmlab199 and xmlab295 to xmlab393 are present in the bounding box directory.


Check that the bounding_box and mask directories have no items in common

In [8]:
import os

def list_files_and_folders(directory):
    """Return the set of entry names in ``directory`` whose joined path exists."""
    names = set()
    for entry in os.listdir(directory):
        if os.path.exists(os.path.join(directory, entry)):
            names.add(entry)
    return names

# The two directories whose contents must not overlap
bounding_box_dir = '/Users/jeffreysherer/Desktop/DissertationCode/bounding_box'
mask_dir = '/Users/jeffreysherer/Desktop/DissertationCode/mask'

# Entry names found directly inside each directory
bounding_box_contents = list_files_and_folders(bounding_box_dir)
mask_contents = list_files_and_folders(mask_dir)

# Names that occur in both directories
common_items = bounding_box_contents.intersection(mask_contents)

# Report results
if not common_items:
    print("No common items found between bounding_box and mask directories.")
else:
    print("The following items are present in both bounding_box and mask directories:")
    for item in common_items:
        print(item)


No common items found between bounding_box and mask directories.


Draw bounding boxes on the source image of each instance in the mask folder (each box is currently labelled with its title from detection.json)

In [9]:
import os
import json
import cv2
import matplotlib.pyplot as plt

def draw_bounding_box(img_path, json_path, output_path):
    """Draw the labelled boxes listed in a JSON file onto an image and save the result.

    The JSON file is expected to hold a list of ``{title: [x, y, width, height]}``
    mappings. Each box is drawn in red from ``(x, y)`` to
    ``(x + width, y + height)`` with its title written just above it (or just
    inside the top edge when there is no room above).

    Args:
        img_path: Path of the image to draw on.
        json_path: Path of the JSON file holding the boxes.
        output_path: Path the annotated image is written to.

    Returns:
        None. If the image cannot be read, an error is printed and nothing is
        saved. Entries that are not mappings, or whose coordinates are not four
        numbers, are reported and skipped; the remaining boxes are still drawn.
    """
    # Load the source image
    image = cv2.imread(img_path)
    if image is None:
        print(f"Error: Failed to read image {img_path}.")
        return
    # OpenCV loads BGR; convert so the colours are right when saved via matplotlib
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Load the JSON data
    with open(json_path, 'r') as json_file:
        data = json.load(json_file)

    # Drawing settings are the same for every box, so set them up once
    color = (255, 0, 0)  # Red color in RGB
    thickness = 2
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.5
    font_thickness = 1

    for entry in data:
        if not isinstance(entry, dict):
            # Without this guard a non-mapping entry raises AttributeError on .items()
            print(f"Skipping invalid bounding box entry: {entry}. Error: entry is not a mapping")
            continue

        for title, coordinates in entry.items():
            try:
                x, y, width, height = coordinates

                # Define points for the bounding box
                start_point = (int(x), int(y))
                end_point = (int(x + width), int(y + height))

                # Place the label above the box, or below its top edge near the image top
                text_x = int(x)
                text_y = int(y - 5 if y - 5 > 0 else y + 15)

                # Draw the bounding box and its title
                image_rgb = cv2.rectangle(image_rgb, start_point, end_point, color, thickness)
                image_rgb = cv2.putText(image_rgb, title, (text_x, text_y), font, font_scale, color, font_thickness, cv2.LINE_AA)

            except (TypeError, ValueError, KeyError) as e:
                # TypeError covers coordinates that are None, a scalar, or contain
                # non-numeric items; previously these aborted the whole image.
                print(f"Skipping invalid bounding box entry: {entry}. Error: {e}")

    # Save the resulting image
    plt.imsave(output_path, image_rgb)


# Input instances (mask_dir) and output root for the annotated images (new_bbs_dir)
mask_dir = '/Users/jeffreysherer/Desktop/DissertationCode/mask'
new_bbs_dir = '/Users/jeffreysherer/Desktop/DissertationCode/newBBs'

# Make sure the output root is there before any instance is written
os.makedirs(new_bbs_dir, exist_ok=True)

# Walk the instance folders; anything that is not a directory is ignored
for instance_name in os.listdir(mask_dir):
    instance_folder = os.path.join(mask_dir, instance_name)
    if os.path.isdir(instance_folder):
        source_image_path = os.path.join(instance_folder, 'source.jpg')
        detection_json_path = os.path.join(instance_folder, 'detection.json')

        # The output folder is created even when the instance ends up skipped
        dest_folder = os.path.join(new_bbs_dir, instance_name)
        os.makedirs(dest_folder, exist_ok=True)
        dest_image_path = os.path.join(dest_folder, 'newBB.png')

        if not (os.path.exists(source_image_path) and os.path.exists(detection_json_path)):
            print(f"Skipped {instance_name} due to missing source image or detection JSON.")
        else:
            draw_bounding_box(source_image_path, detection_json_path, dest_image_path)
            print(f"Processed {instance_name} and saved new bounding box image to {dest_image_path}.")

print("All relevant files in the mask folder have been processed.")


Processed xmlab29 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab29/newBB.png.
Processed xmlab508 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab508/newBB.png.
Processed xmlab395 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab395/newBB.png.
Processed xmlab16 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab16/newBB.png.
Processed xmlab537 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab537/newBB.png.
Processed xmlab530 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab530/newBB.png.
Processed xmlab11 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab11/newBB.png.
Processed xmlab539 and saved new bounding box image to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xm

In [14]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model and tokenizer once at module level;
# get_bert_embedding reads both `tokenizer` and `model` as globals.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

def get_bert_embedding(text):
    """Return the first-token (`[CLS]`) vector of ``text`` as a NumPy array.

    Uses the module-level ``tokenizer`` and ``model``; input is truncated to
    at most 128 tokens and gradients are disabled for the forward pass.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        # First sequence in the batch, then its first token position
        first_sequence = hidden_states[0]
        cls_vector = first_sequence[0]
        return cls_vector.numpy()

def find_best_match(bbox_labels, answers):
    """Pair each bounding box label with the answer whose embedding is most similar.

    Every label and every answer is embedded with ``get_bert_embedding``; for
    each label the answer with the highest cosine similarity (first one wins
    on ties) is chosen and a summary line is printed.

    Returns:
        A dict mapping each label to its best-matching answer, or to None when
        no answer scores above -1.
    """
    label_vectors = {}
    for label in bbox_labels:
        label_vectors[label] = get_bert_embedding(label)

    answer_vectors = {}
    for answer in answers:
        answer_vectors[answer] = get_bert_embedding(answer)

    matches = {}
    for label, label_vec in label_vectors.items():
        top_answer, top_score = None, -1
        for candidate, candidate_vec in answer_vectors.items():
            similarity = cosine_similarity([label_vec], [candidate_vec])[0][0]
            if similarity > top_score:
                top_answer, top_score = candidate, similarity
        matches[label] = top_answer
        print(f"Bounding Box Label: {label} -> Best Match: {top_answer} (Score: {top_score})")
    return matches

# Example labels and answers (hand-written demo inputs, not read from the dataset)
bbox_labels = ["Liver", "Left Kidney", "Right Kidney", "Spleen"]
answers = ["Liver is present", "Left Kidney is healthy", "Right Kidney is visible", "Spleen looks enlarged"]

# Find the best matches; find_best_match also prints one line per label with its score
matches = find_best_match(bbox_labels, answers)


  from .autonotebook import tqdm as notebook_tqdm


Bounding Box Label: Liver -> Best Match: Spleen looks enlarged (Score: 0.8437981605529785)
Bounding Box Label: Left Kidney -> Best Match: Left Kidney is healthy (Score: 0.9078639149665833)
Bounding Box Label: Right Kidney -> Best Match: Left Kidney is healthy (Score: 0.8932849168777466)
Bounding Box Label: Spleen -> Best Match: Liver is present (Score: 0.8500627279281616)


In [1]:
import os

# Root folder holding one subdirectory per instance
src_dir = '/Users/jeffreysherer/Desktop/DissertationCode/newBBs'

# Rename newBB.png to mask.png inside every instance subdirectory
for subdir in os.listdir(src_dir):
    subdir_path = os.path.join(src_dir, subdir)

    # Anything that is not a directory is reported and left alone
    if not os.path.isdir(subdir_path):
        print(f"Skipped {subdir_path}, not a directory")
        continue

    file_path = os.path.join(subdir_path, 'newBB.png')

    # Nothing to rename when the instance has no newBB.png
    if not os.path.exists(file_path):
        print(f"No 'newBB.png' found in {subdir_path}")
        continue

    new_file_path = os.path.join(subdir_path, 'mask.png')
    os.rename(file_path, new_file_path)
    print(f"Renamed {file_path} to {new_file_path}")

print("Renaming completed.")


Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab29/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab29/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab508/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab508/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab395/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab395/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab16/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab16/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab537/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab537/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab530/newBB.png to /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab530/mask.png
Renamed /Users/jeffreysherer/Desktop/DissertationCode/newBBs/xmlab11/newBB.png t

In [2]:
import os
import shutil

# Define the source directories
imgs_dir = '/Users/jeffreysherer/Desktop/DissertationCode/imgs'
newbbs_dir = '/Users/jeffreysherer/Desktop/DissertationCode/newBBs'

# Define the target directory for the new dataset
bb_dataset_dir = '/Users/jeffreysherer/Desktop/DissertationCode/BBDataset'

# Ensure the target directory exists (no-op when it is already there)
os.makedirs(bb_dataset_dir, exist_ok=True)

# Iterate over each folder in the imgs directory
for folder in os.listdir(imgs_dir):
    original_folder_path = os.path.join(imgs_dir, folder)
    newbbs_folder_path = os.path.join(newbbs_dir, folder)
    target_folder_path = os.path.join(bb_dataset_dir, folder)

    # Copy the original folder to the new dataset directory
    if os.path.isdir(original_folder_path):
        # dirs_exist_ok makes the cell re-runnable: without it copytree raises
        # FileExistsError as soon as a target folder exists from an earlier run.
        shutil.copytree(original_folder_path, target_folder_path, dirs_exist_ok=True)

        # If the same folder exists in newBBs, replace the mask.png
        if os.path.exists(newbbs_folder_path):
            newbbs_mask_path = os.path.join(newbbs_folder_path, 'mask.png')
            target_mask_path = os.path.join(target_folder_path, 'mask.png')

            if os.path.isfile(newbbs_mask_path):
                # Replace the mask.png in the target directory
                shutil.copy2(newbbs_mask_path, target_mask_path)
                print(f"Replaced mask.png in {folder} with the version from newBBs.")
            else:
                print(f"No mask.png to replace in {folder}.")
        else:
            print(f"No matching folder in newBBs for {folder}.")
    else:
        print(f"Skipped {folder}, not a directory.")

print("Dataset creation completed.")


Replaced mask.png in xmlab29 with the version from newBBs.
No matching folder in newBBs for xmlab198.
Replaced mask.png in xmlab508 with the version from newBBs.
Replaced mask.png in xmlab395 with the version from newBBs.
Replaced mask.png in xmlab16 with the version from newBBs.
Replaced mask.png in xmlab537 with the version from newBBs.
No matching folder in newBBs for xmlab361.
No matching folder in newBBs for xmlab153.
No matching folder in newBBs for xmlab359.
No matching folder in newBBs for xmlab154.
No matching folder in newBBs for xmlab366.
Replaced mask.png in xmlab530 with the version from newBBs.
Replaced mask.png in xmlab11 with the version from newBBs.
No matching folder in newBBs for xmlab392.
Replaced mask.png in xmlab539 with the version from newBBs.
Replaced mask.png in xmlab18 with the version from newBBs.
Replaced mask.png in xmlab506 with the version from newBBs.
No matching folder in newBBs for xmlab162.
No matching folder in newBBs for xmlab350.
Replaced mask.png

In [10]:
import os
import json
import random
from pathlib import Path

def load_data(base_path):
    """Build one VQA record per question from the instance folders under ``base_path``.

    Only subdirectories that contain both ``detection.json`` and
    ``question.json`` contribute records. Every record of a folder carries all
    of that folder's bounding boxes (the values of each mapping in
    ``detection.json``).

    Args:
        base_path: Directory holding one folder per instance (``xmlab<number>``).

    Returns:
        A list of dicts with the keys ``img_id``, ``img_name``, ``question``,
        ``answer``, ``bounding_box``, ``q_lang``, ``modality``,
        ``answer_type``, ``base_type`` and ``content_type``.

    Raises:
        KeyError: If a question entry lacks ``question`` or ``answer``.
    """
    dataset = []
    prefix = "xmlab"

    # Sorted so the record order does not depend on the filesystem's listing order
    for folder in sorted(os.listdir(base_path)):
        folder_path = Path(base_path) / folder
        if not folder_path.is_dir():
            continue

        detection_path = folder_path / "detection.json"
        question_path = folder_path / "question.json"
        if not (detection_path.exists() and question_path.exists()):
            continue

        with open(detection_path, 'r', encoding='utf-8') as file:
            detections = json.load(file)

        with open(question_path, 'r', encoding='utf-8') as file:
            questions = json.load(file)

        # Remove the literal prefix. str.strip('xmlab') strips any of the
        # characters x/m/l/a/b from BOTH ends, which is not a prefix removal.
        img_id = folder[len(prefix):] if folder.startswith(prefix) else folder

        # The drawing cell reads 'source.jpg' from these same instance folders,
        # so 'source.jpeg' pointed at a file name that is not used elsewhere.
        img_path = str(folder_path / "source.jpg")

        # Same for every question of this folder, so build it once
        bounding_boxes = [d for detection in detections for d in detection.values()]

        for question_data in questions:  # Assuming questions is a list
            entry = {
                "img_id": img_id,
                "img_name": img_path,
                "question": question_data["question"],
                "answer": question_data["answer"],
                "bounding_box": bounding_boxes,  # All bounding boxes of the image
                # Keep the question's own language tag when it has one; "en" as before otherwise
                "q_lang": question_data.get("q_lang", "en"),
                "modality": question_data.get("modality", ""),
                "answer_type": "OPEN" if question_data.get("answer_type") == "OPEN" else "CLOSED",
                "base_type": "vqa",
                "content_type": question_data.get("content_type", "")
            }
            dataset.append(entry)
    return dataset


def split_data(dataset, train_ratio=0.7, seed=None):
    """Shuffle the records and split them into a train part and a test part.

    The input list is left untouched; a shuffled copy is split instead, so the
    caller's ``dataset`` keeps its original order.

    Args:
        dataset: Sequence of records to split.
        train_ratio: Fraction of the records placed in the first (train) part;
            the cut index is ``int(len(dataset) * train_ratio)``.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` generator is used, as before.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    shuffled = list(dataset)
    if seed is None:
        random.shuffle(shuffled)
    else:
        # Private generator: reproducible without disturbing the global random state
        random.Random(seed).shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)
    return shuffled[:split_idx], shuffled[split_idx:]

def save_data(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by 4 spaces, replacing any existing file."""
    with open(filename, 'w') as out:
        out.write(json.dumps(data, indent=4))

# Paths
base_path = "/Users/jeffreysherer/Desktop/DissertationCode/BBDataset"

# Load and process data: one record per question, then a shuffled split
# (split_data's default train_ratio of 0.7 applies)
dataset = load_data(base_path)
train_data, test_data = split_data(dataset)

# Save to JSON files
# NOTE(review): this train.json path is the same file the download cell at the
# top of the notebook writes the original SLAKE train.json to, so that download
# is overwritten here - confirm this is intended.
save_data(train_data, "/Users/jeffreysherer/Desktop/DissertationCode/train.json")
save_data(test_data, "/Users/jeffreysherer/Desktop/DissertationCode/test.json")
