In [3]:
import os
import cv2
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Set the path to the folder containing your .tif images.
# The path is relative, so it resolves against the kernel's current working
# directory; later cells list this folder with os.listdir and join file names
# onto it with os.path.join.
image_folder = '../data/reduced_images_10000/'

In [None]:
# List all image files in the folder.
# os.listdir returns entries in arbitrary order, so sort them: this makes the
# preview below (and every later cell that iterates over image_files)
# reproducible across runs and machines.
image_files = sorted(os.listdir(image_folder))

# Load and display the first few images
num_images_to_display = 5
preview_files = image_files[:num_images_to_display]

if not preview_files:
    # An empty folder would otherwise fail further down on image_files[0].
    print(f"No files found in {image_folder!r}; nothing to display.")
else:
    # One row with a fixed number of slots; squeeze=False keeps `axes` 2-D even
    # when num_images_to_display is 1, so axes[0] is always a row of Axes.
    fig, axes = plt.subplots(1, num_images_to_display, squeeze=False)
    for ax in axes[0]:
        ax.axis('off')  # also hides slots left empty when fewer files exist

    for i, (ax, image_filename) in enumerate(zip(axes[0], preview_files)):
        image_path = os.path.join(image_folder, image_filename)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)  # Load the image in color
        if image is None:
            # cv2.imread reports an unreadable file by returning None rather
            # than raising; passing None on to cvtColor would crash the cell.
            ax.set_title(f'Image {i + 1}\n(unreadable)')
            continue
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
        ax.imshow(image)
        ax.set_title(f'Image {i + 1}')

    plt.show()

    # You can also load and inspect image properties, such as dimensions
    image = cv2.imread(os.path.join(image_folder, image_files[0]), cv2.IMREAD_UNCHANGED)
    if image is None:
        print(f"Could not read {image_files[0]!r}; dimensions unavailable.")
    else:
        height, width = image.shape[:2]
        print(f"Image dimensions: Height={height}, Width={width}")

In [16]:
# Load your dataset (e.g., using pandas).
# The CSV holds the labels for the images; the recorded output of the
# inspection cell shows two columns, `id` (object) and `label` (int64).
reduced_train_labels = pd.read_csv('../data/reduced_labels_10000.csv')

In [5]:
# Inspect the loaded dataset
print("Dataset Inspection:")
print(reduced_train_labels.head())  # Display the first few rows of the dataset
print(reduced_train_labels.describe())  # Display basic statistics of the dataset
# DataFrame.info() writes its report directly to stdout and returns None, so
# it must not be wrapped in print(): doing so appends a stray "None" line
# after the report.
reduced_train_labels.info()  # Display information about the dataset

Dataset Inspection:
                                         id  label
0  b2ceaa1d67517b7a1b78b818719f4433a3954d16      1
1  aaae66f04aedbe2972560c1ca0f844d9a8c0e125      0
2  64442cb5270538cb8ab39a020506a9db8280e452      0
3  793c110499ab92f4c5c5324f55c595c5e237a968      0
4  9234d1fb4f9d39feaa9df97403e1c04dc47eae59      0
              label
count  10000.000000
mean       0.500000
std        0.500025
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None


In [6]:
# Check for corrupt or incomplete images and remove them from the dataset if necessary
def is_valid_image(image_path):
    """Report whether OpenCV can decode the file at ``image_path``.

    Args:
        image_path: Path of the image file to probe.

    Returns:
        True when ``cv2.imread`` yields a decoded image; False when it
        yields ``None`` or when the read attempt raises any exception.
    """
    try:
        decoded = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    except Exception:
        # Best-effort probe: any failure while reading counts as "not valid".
        return False
    return decoded is not None

In [14]:
# Two result lists: file names that decode cleanly, and the IDs (file name
# without extension) of files that do not.
valid_image_files, corrupt_image_ids = [], []

for img_file in image_files:
    img_path = os.path.join(image_folder, img_file)
    # Handle the failure case first, then fall through to the valid case.
    if not is_valid_image(img_path):
        corrupt_image_ids.append(os.path.splitext(img_file)[0])
        continue
    valid_image_files.append(img_file)

print("Number of corrupt Image:", len(corrupt_image_ids))

Number of corrupt Image: 0


In [24]:
# Extract the image IDs from the filenames (assuming filenames are in the format 'image_id.tif')
image_ids = [os.path.splitext(img)[0] for img in image_files]

# Build an id -> label lookup in a single pass over the label table. The
# checks below then cost one dict lookup per image instead of one full scan
# of the DataFrame per image (which made this cell quadratic in dataset size).
# setdefault keeps the label of the FIRST row for an id that appears more than
# once, matching the first-match behaviour of a boolean-mask filter.
label_by_id = {}
for known_id, known_label in zip(reduced_train_labels['id'], reduced_train_labels['label']):
    label_by_id.setdefault(known_id, known_label)

# Check if any image filenames are not in the reduced_train_labels dataset
missing_images = [image_id for image_id in image_ids if image_id not in label_by_id]

# Check if labels for the images are either 0 or 1.
# An image with no label row at all is also counted here, so every entry of
# missing_images appears in invalid_labels as well.
invalid_labels = [
    image_id
    for image_id in image_ids
    if image_id not in label_by_id or label_by_id[image_id] not in [0, 1]
]

if len(missing_images) > 0:
    print(f"Missing Images in Dataset for {len(missing_images)} IDs:")
    for image_id in missing_images:
        print(image_id)

if len(invalid_labels) > 0:
    print(f"Invalid Labels for {len(invalid_labels)} Images:")
    for image_id in invalid_labels:
        print(image_id)

if not missing_images and not invalid_labels:
    print("All Image IDs are in the dataset, and labels are either 0 or 1.")

All Image IDs are in the dataset, and labels are either 0 or 1.
