In [15]:
import os
from google.colab import drive
drive.mount('/content/drive')
# List the files in the directory to find the correct file
directory_path = '/content/drive/MyDrive/archive/Data'
files = os.listdir(directory_path)
print(files)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
['valid', 'test', 'train', 'cropped_images']


In [16]:
import tensorflow as tf
from tensorflow import keras
import os
import zipfile
import matplotlib.pyplot as plt
import numpy as np
import keras
import cv2
from skimage.feature import hog
from skimage import exposure

from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [17]:
import os

# Define paths to the directories
train_dir = '/content/drive/MyDrive/archive/Data/train'
valid_dir = '/content/drive/MyDrive/archive/Data/valid'
test_dir = '/content/drive/MyDrive/archive/Data/test'

# Function to count total images in a directory
def count_total_images(directory):
    return sum([len(files) for r, d, files in os.walk(directory)])

# Count and print the total number of images in each directory
train_total = count_total_images(train_dir)
valid_total = count_total_images(valid_dir)
test_total = count_total_images(test_dir)

print(f"Total images in train directory: {train_total}")
print(f"Total images in validation directory: {valid_total}")
print(f"Total images in test directory: {test_total}")


Total images in train directory: 12289
Total images in validation directory: 72
Total images in test directory: 315


In [4]:
# List to store class names
class_names = []

# Loop through the directories in train_dir
for class_name in os.listdir(train_dir):
    class_path = os.path.join(train_dir, class_name)
    if os.path.isdir(class_path):  # Check if it is a directory
        class_names.append(class_name)

# Print the class names
print("Class names:", class_names)

Class names: ['normal', 'squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa', 'large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa', 'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib', 'augmented', 'hog']


In [5]:
import cv2
import numpy as np
import os

# Define the paths
train_dir = '/content/drive/MyDrive/archive/Data/train'  # Adjust this path if necessary
output_dir = '/content/drive/MyDrive/archive/Data/cropped_images'  # Output directory for cropped images

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Loop through all files in the training directory
for filename in os.listdir(train_dir):
    img_path = os.path.join(train_dir, filename)

    if os.path.isfile(img_path) and filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff')):
        img = cv2.imread(img_path)

        if img is None:
            print(f"Error loading image: {img_path}")
            continue

        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        _, thresh = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        if contours:
            largest_contour = max(contours, key=cv2.contourArea)
            x, y, w, h = cv2.boundingRect(largest_contour)
            roi = img[y:y+h, x:x+w]

            # Save the cropped image
            cropped_filename = f"cropped_{filename}"  # New filename for the cropped image
            cv2.imwrite(os.path.join(output_dir, cropped_filename), roi)

        else:
            print(f"No contours found for image: {filename}")

cv2.destroyAllWindows()  # Close all OpenCV windows after processing
output_total = count_total_images(output_dir)
print(f"Total images in cropped directory: {output_total}")

Total images in cropped directory: 0


In [6]:
from sklearn.model_selection import train_test_split


# Initialize lists to store features and labels
X = []
y = []

# Directory containing the original images
train_dir = '/content/drive/MyDrive/archive/Data/train'
# Directory to save augmented images
augmented_dir = '/content/drive/MyDrive/archive/Data/train/augmented'
os.makedirs(augmented_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Directory to save HoG images
hog_dir = '/content/drive/MyDrive/archive/Data/train/hog'  # Adjust this path if it doesn't exist
os.makedirs(hog_dir, exist_ok=True)  # Create the directory if it doesn't exist

# Define the augmentation strategy
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Class mapping (adjust this based on your directory structure)
class_names = ['normal', 'squamous.cell.carcinoma_left.hilum_T1_N2_M0_IIIa', 'large.cell.carcinoma_left.hilum_T2_N2_M0_IIIa', 'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib']  # Add all your classes here
class_labels = {class_name: index for index, class_name in enumerate(class_names)}

# Loop through all images in the directory
for class_name in class_names:
    class_dir = os.path.join(train_dir, class_name)  # Directory for each class
    if os.path.exists(class_dir):  # Check if the class directory exists
        for filename in os.listdir(class_dir):
            if filename.lower().endswith(('.jpg', '.png')):
                img_path = os.path.join(class_dir, filename)  # Full path to the image
                roi = cv2.imread(img_path)  # Load the image in BGR format

                # Step 2: Resizing and Normalization
                resized_img = cv2.resize(roi, (224, 224))  # Resize the image
                normalized_img = resized_img / 255.0  # Normalize the image (scale pixel values to [0, 1])

                # Check if the normalized image is 3D (for HoG)
                if normalized_img.ndim == 3:
                    # Compute HoG features
                    fd, hog_image = hog(normalized_img, orientations=8, pixels_per_cell=(16, 16),
                                        cells_per_block=(1, 1), visualize=True, channel_axis=-1)

                    # Append the extracted HoG features to X
                    X.append(fd)

                    # Append the corresponding label to y
                    y.append(class_labels[class_name])  # Get the numeric label for the class

                    # Rescale HoG image for better visualization
                    hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

                    # Save HoG image
                    hog_filename = f'hog_{class_name}_{filename.split(".")[0]}.png'
                    cv2.imwrite(os.path.join(hog_dir, hog_filename), (hog_image_rescaled * 255).astype(np.uint8))

                    # Step 3: Data Augmentation
                    normalized_img = np.expand_dims(normalized_img, axis=0)  # Expand dimensions to match the input shape for datagen

                    # Generate augmented images and save them
                    aug_iter = datagen.flow(normalized_img, batch_size=1, save_to_dir=augmented_dir,
                                            save_prefix='aug', save_format='png')  # Save format can be 'png' or 'jpg'

                    for i in range(5):  # Show and save 5 augmented images
                        aug_img = next(aug_iter)[0]  # Get the next augmented image
                        aug_img = (aug_img * 255).astype(np.uint8)  # Convert back to uint8 for saving

                        # Create a filename for the augmented image
                        augmented_filename = f'aug_{class_name}_{filename.split(".")[0]}_aug_{i + 1}.png'  # Change format if needed

                        # Save the augmented image
                        cv2.imwrite(os.path.join(augmented_dir, augmented_filename), aug_img)

# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Check if data is collected before splitting
if len(X) > 0 and len(y) > 0:
    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print("Data split into training and testing sets:")
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")
else:
    print("No data to split. Check the image directories.")


Data split into training and testing sets:
X_train shape: (490, 1568), y_train shape: (490,)
X_test shape: (123, 1568), y_test shape: (123,)


In [7]:
print("Shape of X:", X.shape if hasattr(X, 'shape') else len(X))
print("Length of y:", len(y) if isinstance(y, list) else y.shape[0])


Shape of X: (613, 1568)
Length of y: 613


In [8]:
#Step 1: SVM with HoG

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Assuming `X` is the feature matrix with HoG features and `y` contains the labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM classifier
svm = SVC(kernel='rbf')
svm.fit(X_train, y_train)

# Make predictions
y_pred = svm.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'SVM Accuracy: {accuracy * 100:.2f}%')


SVM Accuracy: 91.06%


In [9]:
# Load and preprocess image data for CNN
# (This part should be similar to how you loaded images for HoG feature extraction)

X_cnn = []
y_cnn = []
for class_name in class_names:
    class_dir = os.path.join(train_dir, class_name)
    if os.path.exists(class_dir):
        for filename in os.listdir(class_dir):
            if filename.lower().endswith(('.jpg', '.png')):
                img_path = os.path.join(class_dir, filename)
                img = cv2.imread(img_path)

                resized_img = cv2.resize(img, (224, 224))
                normalized_img = resized_img / 255.0

                X_cnn.append(normalized_img)
                y_cnn.append(class_labels[class_name])

X_cnn = np.array(X_cnn)
y_cnn = np.array(y_cnn)

# Split data for CNN
X_train_cnn, X_test_cnn, y_train_cnn, y_test_cnn = train_test_split(
    X_cnn, y_cnn, test_size=0.2, random_state=42
)

# Define the CNN architecture
cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')  # Use softmax for multi-class classification
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model using the image data
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, validation_split=0.2)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 525ms/step - accuracy: 0.2352 - loss: 2.1702 - val_accuracy: 0.4184 - val_loss: 1.2776
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 61ms/step - accuracy: 0.5330 - loss: 1.1388 - val_accuracy: 0.6735 - val_loss: 0.9759
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.6568 - loss: 0.9065 - val_accuracy: 0.7857 - val_loss: 0.5359
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - accuracy: 0.6561 - loss: 0.8892 - val_accuracy: 0.7347 - val_loss: 0.6537
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 58ms/step - accuracy: 0.7618 - loss: 0.6243 - val_accuracy: 0.8265 - val_loss: 0.4646
Epoch 6/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.8952 - loss: 0.3194 - val_accuracy: 0.8163 - val_loss: 0.5066
Epoch 7/10
[1m13/13[0m [32m━━

<keras.src.callbacks.history.History at 0x7efa30fb56c0>

In [None]:
cnn_model.summary()

In [None]:
from sklearn.metrics import roc_auc_score

# Predict probabilities for the test set using the CNN model and X_test_cnn
y_pred_prob = cnn_model.predict(X_test_cnn)

# Calculate AUC-ROC score
auc = roc_auc_score(y_test_cnn, y_pred_prob, multi_class='ovr')
print(f'AUC-ROC: {auc:.2f}')

In [None]:
from keras.utils import to_categorical

# Assuming your labels are in a list or array format and are integers
y_train = to_categorical(y_train)  # Ensure this matches the number of classes


In [13]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
from keras.applications import VGG16
from keras.models import Model
from keras.layers import Dense, Flatten, Dropout
from keras.utils import to_categorical

# Assuming class_labels is a dictionary mapping class names to numerical labels
class_names = list(class_labels.keys())  # List of class names
X_vgg = []
y_vgg = []

# Load and preprocess image data for VGG16
for class_name in class_names:
    class_dir = os.path.join(train_dir, class_name)
    if os.path.exists(class_dir):
        for filename in os.listdir(class_dir):
            if filename.lower().endswith(('.jpg', '.png')):
                img_path = os.path.join(class_dir, filename)
                img = cv2.imread(img_path)

                # Resize to (224, 224) for VGG16
                resized_img = cv2.resize(img, (224, 224))
                normalized_img = resized_img / 255.0  # Normalize the image

                X_vgg.append(normalized_img)
                y_vgg.append(class_labels[class_name])

# Convert lists to NumPy arrays
X_vgg = np.array(X_vgg)
y_vgg = np.array(y_vgg)

# Convert labels to one-hot encoding (if using categorical crossentropy)
num_classes = len(class_labels)  # Number of classes
y_vgg = to_categorical(y_vgg, num_classes)

# Split data for VGG16
X_train_vgg, X_test_vgg, y_train_vgg, y_test_vgg = train_test_split(
    X_vgg, y_vgg, test_size=0.2, random_state=42
)

# Load VGG16 model without the top layer (fully connected layers)
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add custom layers for your specific task
x = base_model.output
x = Flatten()(x)
x = Dense(128, activation='relu')(x)  # Optional dense layer for further processing
x = Dropout(0.5)(x)  # Optional dropout for regularization
predictions = Dense(num_classes, activation='softmax')(x)  # Final output layer

# Create the model
vgg_model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
vgg_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model using the image data
vgg_model.fit(X_train_vgg, y_train_vgg, epochs=10, batch_size=32, validation_split=0.2)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
Epoch 1/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 3s/step - accuracy: 0.2786 - loss: 2.8626 - val_accuracy: 0.2347 - val_loss: 1.4070
Epoch 2/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 463ms/step - accuracy: 0.3177 - loss: 1.3973 - val_accuracy: 0.2551 - val_loss: 1.3699
Epoch 3/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 463ms/step - accuracy: 0.2356 - loss: 1.3860 - val_accuracy: 0.3265 - val_loss: 1.3670
Epoch 4/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 489ms/step - accuracy: 0.2785 - loss: 1.3669 - val_accuracy: 0.3265 - val_loss: 1.3779
Epoch 5/10
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 462ms/step - accuracy: 0.3477 - loss: 1.3643 - val_a

<keras.src.callbacks.history.History at 0x7efa1c103580>

In [14]:
vgg_model.summary()