# IQ-OTH/NCCD Dataset - Data Preprocessing

In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Top-level folder of the dataset; every class folder is looked up beneath it.
# (Trailing slash is part of the configured path and is kept as written.)
root = 'The IQ-OTHNCCD lung cancer dataset/'

In [6]:
# Class labels, one per sub-folder of the dataset root.
# An ordered tuple (rather than a set) keeps the iteration order -- and hence
# the order in which images are loaded -- identical from run to run; string
# hashing is randomized per process, so a set of strings has no stable order.
# NOTE: the spellings must match the folder names on disk exactly, because each
# entry is joined onto the dataset root to build the class directory path.
classes = ('Bengin cases', 'Malignant cases', 'Normal cases')

In [7]:
# Accumulators filled while loading: `images` collects the processed image
# arrays and `labels` the matching class name for each one (kept in lockstep).
images, labels = [], []

In [8]:
# Side length, in pixels, of the square that every image is resized to.
image_size = 224

# Walk each class folder and load every readable image it contains.
for class_name in classes:
    class_directory = os.path.join(root, class_name)

    # Skip anything that is missing or is not a directory: os.listdir would
    # raise if a plain file happened to sit at this path.
    if not os.path.isdir(class_directory):
        print(f"Directory not found: {class_directory}")
        continue

    # os.listdir returns entries in arbitrary order; sorting the names makes
    # the load order reproducible across runs and machines.
    for filename in sorted(os.listdir(class_directory)):
        image_path = os.path.join(class_directory, filename)

        # cv2.imread signals an unreadable / non-image file by returning None
        # rather than raising, so check the result explicitly and move on.
        image_file = cv2.imread(image_path)
        if image_file is None:
            print(f"Could not read image: {image_path}")
            continue

        # OpenCV loads pixels in BGR channel order; convert to RGB, then
        # resize to a fixed image_size x image_size square.
        image_file = cv2.cvtColor(image_file, cv2.COLOR_BGR2RGB)
        image_file = cv2.resize(image_file, (image_size, image_size))

        # Keep images and labels in lockstep: one label per stored image.
        images.append(image_file)
        labels.append(class_name)

print(f"Loaded {len(images)} images.")

Loaded 1097 images.


In [10]:
# Map each class-folder name to an integer id, then translate the collected
# string labels into those ids, preserving their order.
labels_dictionary = {
    "Bengin cases": 0,
    "Malignant cases": 1,
    "Normal cases": 2,
}
encoded_labels = [labels_dictionary[name] for name in labels]

In [11]:
# Draw one random ordering of the sample indices (0 .. number of labels - 1).
# A fixed seed makes the shuffle -- and anything derived from it later --
# reproducible from run to run; change the seed to get a different ordering.
rng = np.random.default_rng(seed=42)
permutations = rng.permutation(len(encoded_labels))

In [12]:
# Reorder the stacked images with the shuffled indices first, then divide the
# pixel values by 255. Indexing and elementwise division commute, so the result
# is the same as scaling first and shuffling afterwards.
X = np.array(images)[permutations] / 255.

# Apply the same index order to the labels so each image keeps its own label.
Y = np.array(encoded_labels)[permutations]