In [2]:
import os
import pandas as pd
import cv2
from tqdm import tqdm

# Resize every image listed in the metadata CSV to 224x224 grayscale and write
# the result, under the same file name, into OUTPUT_DIR.
csv_file_path = "../DL_for_Hin_Chest_X_Ray/HIN_archive/Data_Entry_2017.csv"
df = pd.read_csv(csv_file_path)
IMAGE_DIR = "../DL_for_Hin_Chest_X_Ray/HIN_archive/images/"
OUTPUT_DIR = "../DL_for_Hin_Chest_X_Ray/HIN_archive/resized_images/"

# cv2.imwrite does not create missing directories; it only returns False,
# so make sure the destination exists before the loop starts.
os.makedirs(OUTPUT_DIR, exist_ok=True)

df = df[:10] # For testing, comment it out

skipped_files = []  # images that could not be read or could not be written
for file_name in tqdm(df["Image Index"], total=len(df)):
    image_path = os.path.join(IMAGE_DIR, file_name)

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None (no exception) for a missing/corrupt file;
        # passing that on to cv2.resize would abort the whole run.
        skipped_files.append(file_name)
        continue

    image = cv2.resize(image, (224, 224))
    output_path = os.path.join(OUTPUT_DIR, file_name)
    if not cv2.imwrite(output_path, image):
        skipped_files.append(file_name)

if skipped_files:
    print(f"Skipped {len(skipped_files)} image(s) that could not be read or written: {skipped_files[:10]}")
    

100%|██████████| 10/10 [00:00<00:00, 79.05it/s]


In [9]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import cv2

# Metadata used to build the training arrays (filtered subset of the data entries).
csv_file_path = '../DL_for_Hin_Chest_X_Ray/Data_Entry_2017_filtered_2.csv'
# Alternative: the unfiltered metadata file.
# csv_file_path = '../DL_for_Hin_Chest_X_Ray/HIN_archive/Data_Entry_2017.csv'
df = pd.read_csv(csv_file_path)

IMAGE_DIR = "../DL_for_Hin_Chest_X_Ray/HIN_archive/images/"

# "Finding Labels" holds one or more labels joined by "|"; explode them so each
# individual label is counted once, then keep a sorted vocabulary.
_individual_findings = df["Finding Labels"].str.split("|").explode()
ALL_LABELS = sorted(_individual_findings.unique())
# Same vocabulary minus the "No Finding" entry (order preserved).
ALL_LABELS_WITHOUT_NO = list(filter(lambda name: name != "No Finding", ALL_LABELS))
# Width of the label vectors produced by prepare_data.
NUMBER_CLASSES = len(ALL_LABELS)

def preprocess_image(file_path, image_size):
    """Load an image as grayscale and resize it to a square.

    Args:
        file_path: Path of the image file to read.
        image_size: Side length, in pixels, of the square output.

    Returns:
        The resized grayscale image of shape (image_size, image_size),
        or None when cv2.imread could not read the file.
    """
    grayscale = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports an unreadable file by returning None instead of raising.
    return None if grayscale is None else cv2.resize(grayscale, (image_size, image_size))

def prepare_data(df, image_size, image_dir=IMAGE_DIR):
    """Load all readable images listed in ``df`` and build their multi-hot labels.

    Rows whose image cannot be read (``preprocess_image`` returns None) are
    skipped entirely, so images and labels stay aligned.

    Label layout: position ``i`` corresponds to ``ALL_LABELS_WITHOUT_NO[i]``;
    the last position (``NUMBER_CLASSES - 1``) is set for rows whose
    "Finding Labels" value is exactly "No Finding".

    Args:
        df: DataFrame with "Image Index" (file name) and "Finding Labels"
            ("|"-separated label string) columns.
        image_size: Side length, in pixels, of the square images.
        image_dir: Directory that contains the image files.

    Returns:
        A tuple ``(images, labels)``:
        images has shape (n, image_size, image_size, 3), the grayscale
        channel being repeated three times; labels has shape
        (n, NUMBER_CLASSES) with integer 0/1 entries.
    """
    # Built once instead of rescanning the label list for every row.
    label_to_index = {label: i for i, label in enumerate(ALL_LABELS_WITHOUT_NO)}

    images = []
    labels = []

    rows = zip(df["Image Index"], df["Finding Labels"])
    for file_name, findings in tqdm(rows, total=len(df)):
        image = preprocess_image(os.path.join(image_dir, file_name), image_size)
        if image is None:
            continue
        images.append(image)

        current_label = np.zeros(NUMBER_CLASSES, dtype=int)
        if findings != "No Finding":
            for finding in findings.split("|"):
                idx = label_to_index.get(finding)
                # Labels outside the known vocabulary are ignored.
                if idx is not None:
                    current_label[idx] = 1
        else:
            current_label[NUMBER_CLASSES - 1] = 1
        labels.append(current_label)

    images = np.array(images).reshape(-1, image_size, image_size)
    images = np.repeat(images[..., np.newaxis], 3, axis=-1) #rgb

    # Keep a 2-D (0, NUMBER_CLASSES) shape even when no image could be loaded.
    if labels:
        labels = np.array(labels)
    else:
        labels = np.zeros((0, NUMBER_CLASSES), dtype=int)

    return images, labels

In [10]:
import pickle

# Build the full image/label arrays at 224x224 (slow: reads every image from disk).
images, labels = prepare_data(df, 224)

# Cache both arrays on disk so later cells can reload them without redoing the work.
for pickle_path, array in (
    ("../DL_for_HIN_Chest_X_Ray/all_images.pkl", images),
    ("../DL_for_HIN_Chest_X_Ray/all_labels.pkl", labels),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(array, handle)

100%|██████████| 87770/87770 [19:01<00:00, 76.90it/s] 


In [11]:
# Reload the cached arrays to verify the round trip.
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# come from a trusted source (here: the files written by the previous cell).
with open("../DL_for_HIN_Chest_X_Ray/all_images.pkl", "rb") as file:
    images_2 = pickle.load(file)
with open("../DL_for_HIN_Chest_X_Ray/all_labels.pkl", "rb") as file:
    labels_2 = pickle.load(file)

# Shapes of the reloaded image and label arrays.
print(images_2.shape)
print(labels_2.shape)

(87770, 224, 224, 3)
(87770, 4)
