In [2]:
from datasets import get_dataset_loader
import os

In [26]:
from datasets.image_text_dataset import ImageTextDataset
import json
import os
from typing import Any, Callable, Dict, List

import numpy as np
from torch.utils.data import Dataset

from datasets.constants import WORDS
from models.constants import TASK_PROMPTS

class COCODataset(ImageTextDataset):
    """Captioning dataset assembled from a COCO-style annotation JSON.

    The class reads ``data_dir``, ``annotation_file``, ``split``,
    ``prompt_template``, ``dataset_size`` and ``rng`` from ``self`` but never
    assigns them; presumably ``ImageTextDataset`` sets them up -- TODO confirm
    against the base class.
    """

    def create_dataset(
        self,
    ) -> None:
        """Fill ``self.data`` with one record per image of the requested split.

        The annotation JSON must expose a top-level ``"images"`` list whose
        entries carry ``"split"``, ``"filename"`` and ``"sentences"`` (each
        sentence with a ``"raw"`` caption). Every record holds the image id,
        the instruction prompt, the first caption as ``response``, the image
        path, and all captions joined with ``"$$"`` as ``targets``.

        When ``self.dataset_size`` is positive, the records are subsampled
        without replacement through ``self.rng.choice``.

        Raises:
            NotImplementedError: If an image id contains neither ``"train"``
                nor ``"val"``, so its image folder cannot be inferred.
        """
        with open(os.path.join(self.data_dir, self.annotation_file)) as handle:
            annotations = json.load(handle)

        records = []
        for entry in annotations["images"]:
            if entry["split"] != self.split:
                continue

            file_name = entry["filename"]
            stem = file_name.split(".")[0]

            # The image sub-folder is inferred from the id; "train" is
            # checked before "val", and the first match wins.
            for marker, folder in (("train", "train2014"), ("val", "val2014")):
                if marker in stem:
                    break
            else:
                raise NotImplementedError(
                    f"Please specify the image directory for the image: {stem}"
                )

            picture = os.path.join(self.data_dir, folder, file_name)
            prompt = TASK_PROMPTS.get(self.prompt_template, {}).get(
                "ShortCaptioning", "An image of "
            )
            captions = [sentence["raw"].strip() for sentence in entry["sentences"]]

            records.append(
                {
                    "img_id": stem,
                    "instruction": prompt,
                    "response": captions[0],  # only the first caption is the response
                    "image": picture,
                    "targets": "$$".join(captions),
                }
            )

        # A non-positive dataset_size keeps every record of the split.
        self.data = (
            self.rng.choice(records, size=self.dataset_size, replace=False)
            if self.dataset_size > 0
            else records
        )

In [4]:
# Build a 500-sample COCO captioning dataset from the "train" split.
# The original `mode` argument was the constant expression
# `"val" if True else "train"`, which always evaluates to "val".
dataset_cls = COCODataset
coco_kwargs = dict(
    data_dir="/ds/images/coco_2014",
    annotation_file="/ds/images/coco_2014/dataset_coco.json",
    questions_file="annotations.json",
    split="train",
    dataset_size=500,
    seed=0,
    dataset_name="coco",
    mode="val",
    prompt_template="llava",
    token_of_interest_num_samples=-1,
)
dataset = dataset_cls(**coco_kwargs)


In [5]:
# Inspect the first sample; as the last expression of the cell its repr is displayed.
dataset[0]

{'img_id': 'COCO_train2014_000000064070',
 'instruction': '\nProvide a one-sentence caption for the provided image.',
 'response': 'The huge twin engine airliner has red, blue, and orange paint.',
 'image': '/ds/images/coco_2014/train2014/COCO_train2014_000000064070.jpg',
 'targets': 'The huge twin engine airliner has red, blue, and orange paint.$$Small air plane preparing to land over water.$$An very large airplane that has landed at an airport.$$A Southwest airplane taxis at an airport by the water.$$A Soutwest Airlines jet airplaine taxiing along a runway.',
 'text': '\nProvide a one-sentence caption for the provided image.'}

In [6]:
# Same sample through print(), which writes the record as plain text on one line.
print(dataset[0])

{'img_id': 'COCO_train2014_000000064070', 'instruction': '\nProvide a one-sentence caption for the provided image.', 'response': 'The huge twin engine airliner has red, blue, and orange paint.', 'image': '/ds/images/coco_2014/train2014/COCO_train2014_000000064070.jpg', 'targets': 'The huge twin engine airliner has red, blue, and orange paint.$$Small air plane preparing to land over water.$$An very large airplane that has landed at an airport.$$A Southwest airplane taxis at an airport by the water.$$A Soutwest Airlines jet airplaine taxiing along a runway.', 'text': '\nProvide a one-sentence caption for the provided image.'}


In [None]:
file = '/ds/images/FFA/physionet.org/files/ffa-ir-medical-report/1.0.0/ffair_annotation.json'
import json

# Load the JSON file. The handle gets its own name: the original
# `with open(file, 'r') as file:` rebound `file` from the path string to a
# (closed) file object, so any later use of `file` as a path would fail.
with open(file, 'r') as json_handle:
    data = json.load(json_handle)

# Sort the keys alphabetically (or define your custom order)
sorted_keys = sorted(data.keys())  # Replace this with your desired order if needed

# Print the keys in the sorted order
for key in sorted_keys:
    print(key)


dataset
images


In [7]:
# Requires `data` from the JSON-loading cell above; on a fresh kernel run that
# cell first, otherwise this lookup raises NameError.
data["images"]


NameError: name 'data' is not defined

In [13]:
import os
import json
from sklearn.model_selection import train_test_split


def extract_relevant_sentences(text):
    """
    Extract the FINDINGS and IMPRESSION sections of a report as one string.

    A section begins on a line that starts with ``FINDINGS`` or ``IMPRESSION``
    (the colon after the header is optional) and continues over the following
    lines until the next such header or the end of the text. Lines that come
    before the first header are ignored, as are other sections such as
    INDICATION. If a header occurs more than once, the last occurrence wins.

    Args:
        text: Raw report text.

    Returns:
        The FINDINGS text followed by the IMPRESSION text, with the pieces
        joined by single spaces. Blank lines contribute nothing, so the result
        has no leading, trailing or doubled separator spaces and is ``""``
        when neither section has any content.
    """
    headers = ("FINDINGS", "IMPRESSION")
    sections = {}
    current_section = None

    for line in text.splitlines():
        line = line.strip()
        header = next((name for name in headers if line.startswith(name)), None)
        if header is not None:
            current_section = header
            # Drop the header word and, only if present, the colon after it.
            remainder = line[len(header):]
            if remainder.startswith(":"):
                remainder = remainder[1:]
            sections[current_section] = [remainder.strip()]
        elif current_section and line:
            # Non-blank continuation line of a multi-line section.
            sections[current_section].append(line)

    # FINDINGS always precedes IMPRESSION in the output, whatever the order
    # in the report; empty pieces are dropped so no stray spaces remain.
    pieces = []
    for name in headers:
        pieces.extend(piece for piece in sections.get(name, []) if piece)
    return " ".join(pieces)


def organize_data_in_format(directory, output_file, train_ratio=0.8):
    """
    Build a COCO-style caption annotation file from a directory of reports.

    For every ``<name>.txt`` report in ``directory`` whose sibling folder
    ``<name>/`` exists, one entry is created per ``.dcm`` file in that folder.
    The report's FINDINGS/IMPRESSION text (via ``extract_relevant_sentences``)
    is split on ``". "`` into captions, and every image of the report gets the
    same captions. Entries are then split into train/val and written to
    ``output_file`` as ``{"images": [...], "type": "coco"}``.

    Args:
        directory: Folder holding the ``.txt`` reports and per-report image folders.
        output_file: Path of the JSON file to write.
        train_ratio: Fraction of entries assigned to the train split.

    Raises:
        ValueError: If no report/image pair is found, or if
            ``train_test_split`` rejects the resulting split sizes.

    Notes:
        * The split is made per image entry, so images that share a report
          can end up in different splits.
        * Sentence ids are ``imgid * 100 + i``; they stay unique only while a
          report yields at most 100 captions.
        * Splitting on ``". "`` removes the period from every caption except
          the last one.
    """
    data = []
    imgid = 0  # Unique ID for each image

    # os.listdir returns entries in arbitrary order; sorting keeps the image
    # ids and the seeded train/val split reproducible across machines.
    for entry in sorted(os.listdir(directory)):
        if not entry.endswith('.txt'):
            continue

        entry_path = os.path.join(directory, entry)
        with open(entry_path, 'r') as report_file:
            raw_text = report_file.read()

        # Extract relevant sections and split them into sentence captions,
        # dropping empty fragments so no empty sentence is emitted.
        relevant_text = extract_relevant_sentences(raw_text)
        captions = [fragment.strip() for fragment in relevant_text.split(". ")]
        captions = [caption for caption in captions if caption]
        if not captions:
            continue  # Skip if no relevant content is found

        # Images live in a folder named after the report file.
        folder_name = os.path.splitext(entry)[0]
        folder_path = os.path.join(directory, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for img_file in sorted(os.listdir(folder_path)):
            if not img_file.endswith('.dcm'):
                continue

            first_sentid = imgid * 100
            sentences = [
                {
                    'tokens': caption.split(),  # whitespace tokenization
                    'raw': caption,
                    'imgid': imgid,
                    'sentid': first_sentid + i,
                }
                for i, caption in enumerate(captions)
            ]

            data.append({
                'filepath': folder_name,
                'sentids': list(range(first_sentid, first_sentid + len(captions))),
                'filename': img_file,
                'imgid': imgid,
                'split': 'val',  # Temporary, will adjust after splitting
                'sentences': sentences,
                'cocoid': imgid
            })

            # Increment the image ID for each image
            imgid += 1

    if not data:
        # train_test_split would also raise ValueError here, but with a
        # message about sample counts rather than the missing inputs.
        raise ValueError(
            f"No report (.txt) with matching .dcm images found in {directory}"
        )

    # Split data into train and val sets
    train_data, val_data = train_test_split(data, train_size=train_ratio, random_state=42)

    # Update the split key for train and val datasets
    for item in train_data:
        item['split'] = 'train'
    for item in val_data:
        item['split'] = 'val'

    # Wrap the data under the "images" key and add "type" as "coco"
    output_data = {
        "images": train_data + val_data,
        "type": "coco"
    }

    # Save the result to a JSON file
    with open(output_file, 'w') as json_file:
        json.dump(output_data, json_file, indent=2)

    print(f"Data successfully saved to {output_file}")





In [14]:
# Example usage: convert the sample report directory into an annotation JSON.
# Point `directory` at your own data folder; `output_file` is written relative
# to the current working directory.
directory = "/netscratch/kadir/xl-vlms/sample_data"
output_file = "organized_data.json"
organize_data_in_format(directory=directory, output_file=output_file)

Data successfully saved to organized_data.json


In [15]:
from datasets.image_text_dataset import ImageTextDataset
import json
import os
from typing import Any, Callable, Dict, List

import numpy as np
from torch.utils.data import Dataset

from datasets.constants import WORDS
from models.constants import TASK_PROMPTS

class XRAYdataset(ImageTextDataset):
    """Image/report dataset assembled from a COCO-style annotation JSON.

    The class reads ``data_dir``, ``annotation_file``, ``split``,
    ``prompt_template``, ``dataset_size`` and ``rng`` from ``self`` but never
    assigns them; presumably ``ImageTextDataset`` sets them up -- TODO confirm
    against the base class.
    """

    def create_dataset(
        self,
    ) -> None:
        """Fill ``self.data`` with one record per image of the requested split.

        The annotation JSON must expose a top-level ``"images"`` list whose
        entries carry ``"split"``, ``"filepath"``, ``"filename"`` and
        ``"sentences"`` (each sentence with a ``"raw"`` text). The image path
        is ``data_dir/filepath/filename``. Every record holds the image id,
        the instruction prompt, the first sentence as ``response``, the image
        path, and all sentences joined with ``"$$"`` as ``targets``.

        When ``self.dataset_size`` is positive, the records are subsampled
        without replacement through ``self.rng.choice``.
        """
        annotation_path = os.path.join(self.data_dir, self.annotation_file)
        with open(annotation_path) as handle:
            annotations = json.load(handle)

        records = []
        for entry in annotations["images"]:
            if entry["split"] != self.split:
                continue

            file_name = entry["filename"]
            picture = os.path.join(self.data_dir, entry["filepath"], file_name)
            prompt = TASK_PROMPTS.get(self.prompt_template, {}).get(
                 "Findings", "Please provide a detailed finding of chest X-ray "
            )
            sentences = [sentence["raw"].strip() for sentence in entry["sentences"]]

            record = dict(
                img_id=file_name.split(".")[0],
                instruction=prompt,
                response=sentences[0],  # only the first sentence is the response
                image=picture,
                targets="$$".join(sentences),
            )
            records.append(record)

        # A non-positive dataset_size keeps every record of the split.
        if self.dataset_size > 0:
            records = self.rng.choice(records, size=self.dataset_size, replace=False)

        self.data = records

In [31]:

from datasets.image_text_dataset import ImageTextDataset

# Build a 10-sample X-ray dataset from the "train" split of the generated
# annotation file. The original `mode` argument was the constant expression
# `"val" if True else "train"`, which always evaluates to "val".
# NOTE(review): dataset_name is still 'coco' here -- confirm that is intended.
dataset_cls = XRAYdataset
xray_kwargs = dict(
    data_dir="/netscratch/kadir/xl-vlms/sample_data",
    annotation_file="/home/kadir/xl-vlms/playground/organized_data.json",
    questions_file="annotations.json",
    split="train",
    dataset_size=10,
    seed=0,
    dataset_name="coco",
    mode="val",
    prompt_template="chextagent",
    token_of_interest_num_samples=-1,
)
dataset = dataset_cls(**xray_kwargs)


In [23]:
# The first access is kept from the original cell; its value is discarded
# because it is not the last expression of the cell.
dataset[0]

# Collect every sample whose image path does not exist on disk.
missing_files = []
for entry in dataset:
    full_path = entry.get("image", '')
    print(full_path)
    if os.path.exists(full_path):
        continue
    missing_files.append(full_path)

# Report results
if not missing_files:
    print("All file paths exist!")
else:
    print("The following files are missing:")
    for missing in missing_files:
        print(missing)

/netscratch/kadir/xl-vlms/sample_data/s56105641/b35e1481-bc791f9a-dfafdae3-ef03967d-3c1cff7c.dcm
/netscratch/kadir/xl-vlms/sample_data/s57473907/e31b309f-72d6b511-383b9a4b-7d20b6a2-de6dfa53.dcm
/netscratch/kadir/xl-vlms/sample_data/s51820245/a7d4ea5c-3d1aa223-f8df852b-e4e86c60-a0b2036f.dcm
/netscratch/kadir/xl-vlms/sample_data/s58896631/ad9066c8-3e02858c-2e0556ed-aaded1ee-50a2d7fe.dcm
/netscratch/kadir/xl-vlms/sample_data/s50423320/0f50bb73-76e0eff4-0f30b9ef-02dc8eeb-58a4c2ec.dcm
/netscratch/kadir/xl-vlms/sample_data/s59888039/6d4403f0-08e832f3-5478e509-3050a2d3-b81b2140.dcm
/netscratch/kadir/xl-vlms/sample_data/s59910851/5dcfd4b2-d8d01a9e-8e0cdb88-59b1e9a4-8b86cc80.dcm
/netscratch/kadir/xl-vlms/sample_data/s56105641/348151d9-e2d62d79-e214f7e6-27bcf16b-cdc41317.dcm
/netscratch/kadir/xl-vlms/sample_data/s58672627/e84af51e-f53d38fb-3ab64326-73b0188f-4071f521.dcm
/netscratch/kadir/xl-vlms/sample_data/s51232822/b62a4d54-81fc729c-89ae4a70-347c7b1b-058d49a1.dcm
All file paths exist!
