In [56]:
# Standard library: filesystem access and random sampling
import os
import random

# Image processing libraries
import cv2
from PIL import Image

# Numerical operations and array manipulation
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Deep learning related libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, GlobalAveragePooling2D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.efficientnet import EfficientNetB0
from vit_keras import vit

# Evaluation and metrics libraries
from sklearn.metrics import classification_report, confusion_matrix


In [76]:
class ImagePreprocessor:
    """Turn an OpenCV-style image into a resized, 3-channel RGB float32 array.

    Pixel values are divided by 255, so 8-bit input ends up in ``[0, 1]``.

    Args:
        target_size: Output size, handed unchanged to ``cv2.resize`` as
            ``dsize``. OpenCV reads that tuple as ``(width, height)``, which
            is the reverse of the ``(height, width)`` order Keras uses.
    """

    def __init__(self, target_size=(224, 224)):
        self.target_size = target_size

    def resize_and_normalize_image(self, image):
        """Resize ``image``, convert it to RGB and scale it by 1/255.

        Args:
            image: NumPy array in OpenCV channel order: grayscale ``(H, W)``
                or ``(H, W, 1)``, BGR ``(H, W, 3)`` or BGRA ``(H, W, 4)``.

        Returns:
            ``float32`` array with three RGB channels at ``target_size``.

        Raises:
            ValueError: If ``image`` is ``None``.
        """
        if image is None:
            # cv2.imread signals an unreadable file by returning None; fail
            # here with a readable message instead of deep inside OpenCV.
            raise ValueError(
                "image is None - the file could not be read (cv2.imread returns None on failure)"
            )

        image = cv2.resize(image, self.target_size)

        # COLOR_BGR2RGB only accepts 3-channel input, so pick the conversion
        # that matches what was actually loaded (X-rays are often grayscale).
        channels = 1 if image.ndim == 2 else image.shape[2]
        if channels == 1:
            conversion = cv2.COLOR_GRAY2RGB
        elif channels == 4:
            conversion = cv2.COLOR_BGRA2RGB
        else:
            conversion = cv2.COLOR_BGR2RGB
        image = cv2.cvtColor(image, conversion)

        return image.astype(np.float32) / 255.0

    def preprocess_image(self, image):
        """Alias of :meth:`resize_and_normalize_image` kept for existing callers."""
        return self.resize_and_normalize_image(image)

In [58]:
# def display_sample_images(directory, num_samples=5):
#     classes = os.listdir(directory)
#     for cls in classes:
#         class_dir = os.path.join(directory, cls)
#         image_files = os.listdir(class_dir)
#         sample_files = random.sample(image_files, num_samples)
        
#         fig, axes = plt.subplots(1, num_samples, figsize=(num_samples * 2, 2))
#         fig.suptitle(cls)
#         for ax, file in zip(axes, sample_files):
#             img_path = os.path.join(class_dir, file)
#             img = Image.open(img_path)
#             ax.imshow(img)
#             ax.axis('off')
#         plt.show()

In [52]:
# def plot_class_distribution(directory):
#     """Plot the distribution of classes in the dataset."""
#     classes = os.listdir(directory)
#     class_counts = {cls: len(os.listdir(os.path.join(directory, cls))) for cls in classes}
    
#     plt.figure(figsize=(10, 6))
#     plt.bar(class_counts.keys(), class_counts.values())
#     plt.xlabel('Class')
#     plt.ylabel('Number of images')
#     plt.title('Class Distribution')
#     plt.show()

In [88]:
class ChestXRayClassifier:
    """Train, evaluate and save a chest X-ray image classifier.

    Training images are read from ``<data_directory>/train/<class>/<image>``.
    When ``data_directory`` has no ``train`` sub-directory it is itself treated
    as the folder that holds the class sub-directories.

    Args:
        data_directory: Root folder of the dataset.
        csv_file: Path to a CSV file. It is only stored on the instance; no
            method of this class reads it.
        target_size: Image size as ``(height, width)`` (Keras order).
        batch_size: Batch size for every data generator.
        preprocessor: Object with a ``preprocess_image(image)`` method that
            takes an OpenCV-style BGR array and returns the array fed to the
            model (already scaled). Defaults to :class:`ImagePreprocessor`.
    """

    # Width of the final softmax layer.
    NUM_CLASSES = 3
    # Sub-directory of ``data_directory`` that holds the training classes.
    TRAIN_SUBDIR = 'train'
    # Files with these suffixes count as images when sampling and counting.
    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    def __init__(self, data_directory, csv_file, target_size=(224, 224), batch_size=32, preprocessor=None):
        self.data_directory = data_directory
        self.csv_file = csv_file
        self.target_size = target_size
        self.batch_size = batch_size
        if preprocessor is None:
            # Keras sizes are (height, width); the default preprocessor hands
            # its size to cv2.resize, which expects (width, height).
            preprocessor = ImagePreprocessor(target_size=tuple(target_size)[::-1])
        self.preprocessor = preprocessor
        self.model = None  # Set by compile_and_train()
        self.datagen = self._build_datagen(validation_split=0.2)

    # ------------------------------------------------------------------ helpers

    def _build_datagen(self, validation_split=0.0):
        """Create the generator config shared by training and evaluation.

        No ``rescale`` is passed: ImageDataGenerator applies ``rescale`` after
        ``preprocessing_function``, and the preprocessor already normalises
        the pixels, so adding it would scale every image twice.
        """
        return ImageDataGenerator(
            validation_split=validation_split,
            preprocessing_function=self._preprocess_generator_image
        )

    def _preprocess_generator_image(self, image):
        """Adapt an RGB image from the Keras generator to the BGR-based preprocessor."""
        # The generator loads RGB; the preprocessor converts BGR -> RGB, so
        # flip the channels first or the result would come out reversed.
        bgr = np.ascontiguousarray(image[..., ::-1])
        return self.preprocessor.preprocess_image(bgr)

    def _train_directory(self):
        """Return the folder whose sub-directories are the training classes."""
        candidate = os.path.join(self.data_directory, self.TRAIN_SUBDIR)
        return candidate if os.path.isdir(candidate) else self.data_directory

    def _class_image_files(self):
        """Map each class name to the sorted image file names in its folder."""
        root = self._train_directory()
        files_by_class = {}
        for entry in sorted(os.listdir(root)):
            class_dir = os.path.join(root, entry)
            if not os.path.isdir(class_dir):
                continue  # stray files (CSV, README, ...) are not classes
            files_by_class[entry] = sorted(
                name for name in os.listdir(class_dir)
                if name.lower().endswith(self.IMAGE_EXTENSIONS)
                and os.path.isfile(os.path.join(class_dir, name))
            )
        return files_by_class

    def _require_model(self):
        """Raise a clear error when a method needs a model that is not built yet."""
        if self.model is None:
            raise RuntimeError("No model available. Call compile_and_train() first.")

    # ------------------------------------------------------------- exploration

    def display_sample_images(self, num_samples=5):
        """Show ``num_samples`` randomly chosen training images per class.

        Classes with fewer than ``num_samples`` images are skipped with a
        printed warning.
        """
        root = self._train_directory()
        for cls, image_files in self._class_image_files().items():
            class_dir = os.path.join(root, cls)

            # Check if the number of samples requested is valid
            if not (0 < num_samples <= len(image_files)):
                print(f"Warning: Insufficient images in '{class_dir}' to display {num_samples} samples.")
                continue  # Skip this class if insufficient samples

            sample_files = random.sample(image_files, num_samples)

            # squeeze=False keeps `axes` two-dimensional even for one sample.
            fig, axes = plt.subplots(1, num_samples, figsize=(num_samples * 2, 2), squeeze=False)
            fig.suptitle(cls)
            for ax, file in zip(axes[0], sample_files):
                img_path = os.path.join(class_dir, file)
                with Image.open(img_path) as img:
                    # cmap only affects single-channel images (typical for X-rays).
                    ax.imshow(img, cmap='gray')
                ax.axis('off')
            plt.show()

    def plot_class_distribution(self):
        """Draw a bar chart with the number of training images per class."""
        class_counts = {cls: len(files) for cls, files in self._class_image_files().items()}

        plt.figure(figsize=(10, 6))
        plt.bar(list(class_counts.keys()), list(class_counts.values()))
        plt.xlabel('Class')
        plt.ylabel('Number of images')
        plt.title('Class Distribution')
        plt.show()

    # ------------------------------------------------------------------ models

    def build_vit_model(self):
        """Build a ViT-B/16 backbone with a small dense classification head."""
        # NOTE(review): vit_keras may expect differently scaled inputs than the
        # [0, 1] range produced here - confirm against the vit_keras docs.
        vit_model = vit.vit_b16(
            image_size=self.target_size,
            activation='softmax',
            pretrained=True,
            include_top=False,
            pretrained_top=False,
            classes=self.NUM_CLASSES
        )
        model = Sequential([
            vit_model,
            Flatten(),
            Dense(256, activation='relu'),
            Dropout(0.5),
            Dense(self.NUM_CLASSES, activation='softmax')
        ])
        return model

    def build_efficientnet_model(self):
        """Build a frozen ImageNet EfficientNetB0 with a dense classification head."""
        # NOTE(review): recent Keras EfficientNet models appear to rescale
        # internally and expect raw [0, 255] pixels - confirm for the installed
        # TensorFlow version, since this pipeline feeds [0, 1] values.
        base_model = EfficientNetB0(include_top=False, input_shape=(*self.target_size, 3), weights="imagenet")
        base_model.trainable = False  # Freeze the base_model
        model = Sequential([
            base_model,
            GlobalAveragePooling2D(),
            Dense(256, activation='relu'),
            Dropout(0.5),
            Dense(self.NUM_CLASSES, activation='softmax')
        ])
        return model

    def compile_and_train(self, model_choice='vit', epochs=30, patience=5):
        """Build the chosen model and fit it on the training directory.

        Sample images and the class distribution are displayed before training.

        Args:
            model_choice: ``'vit'`` or ``'efficientnet'``.
            epochs: Maximum number of training epochs.
            patience: Epochs without ``val_loss`` improvement before stopping.

        Returns:
            The Keras ``History`` object returned by ``fit``.

        Raises:
            ValueError: If ``model_choice`` is not a supported model name.
        """
        # Validate first so a typo fails before any plotting or model download.
        if model_choice not in ('vit', 'efficientnet'):
            raise ValueError("Invalid model choice. Please choose 'vit' or 'efficientnet'.")

        self.display_sample_images()
        self.plot_class_distribution()

        if model_choice == 'vit':
            self.model = self.build_vit_model()
        else:
            self.model = self.build_efficientnet_model()

        self.model.compile(optimizer=Adam(),
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

        train_directory = self._train_directory()
        train_generator = self.datagen.flow_from_directory(
            train_directory,
            target_size=self.target_size,
            batch_size=self.batch_size,
            class_mode='categorical',
            subset='training'
        )

        validation_generator = self.datagen.flow_from_directory(
            train_directory,
            target_size=self.target_size,
            batch_size=self.batch_size,
            class_mode='categorical',
            subset='validation'
        )

        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=patience,
            restore_best_weights=True
        )

        history = self.model.fit(
            train_generator,
            epochs=epochs,
            validation_data=validation_generator,
            callbacks=[early_stopping]
        )
        return history

    def plot_history(self, history, model_name):
        """Plot training/validation accuracy and loss curves side by side."""
        # Plotting accuracy
        plt.figure(figsize=(14, 5))
        plt.subplot(1, 2, 1)
        plt.plot(history.history['accuracy'], label='Training Accuracy')
        plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
        plt.title(f'{model_name} Training and Validation Accuracy')
        plt.xlabel('Epoch')
        plt.ylabel('Accuracy')
        plt.legend()

        # Plotting loss
        plt.subplot(1, 2, 2)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'{model_name} Training and Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

    def evaluate_model(self, test_directory):
        """Show a confusion matrix and print a classification report.

        Args:
            test_directory: Folder with one sub-directory per class.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        self._require_model()

        # Same preprocessing as training, so the model sees identical inputs.
        test_datagen = self._build_datagen()
        test_generator = test_datagen.flow_from_directory(
            test_directory,
            target_size=self.target_size,
            batch_size=self.batch_size,
            class_mode='categorical',
            shuffle=False  # keep predictions aligned with test_generator.classes
        )

        predictions = self.model.predict(test_generator)
        predicted_labels = np.argmax(predictions, axis=1)
        true_labels = test_generator.classes

        class_indices = test_generator.class_indices
        class_names = sorted(class_indices, key=class_indices.get)
        # Pin the label set so a class missing from the data still gets a
        # row/column and the report's target_names always line up.
        label_ids = list(range(len(class_names)))

        cm = confusion_matrix(true_labels, predicted_labels, labels=label_ids)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names)
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()

        print("Classification Report:\n", classification_report(
            true_labels, predicted_labels, labels=label_ids, target_names=class_names))

    def save_model(self, file_path):
        """Save the trained model to ``file_path``, creating parent folders.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        self._require_model()
        parent = os.path.dirname(file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.model.save(file_path)
        print(f"Model saved to {file_path}")

In [1]:
# data_directory = 'Data'
# csv_file = 'train.csv'  
# train_directory = os.path.join(data_directory, 'train')
# test_directory = os.path.join(data_directory, 'test')

In [2]:
# display_sample_images(train_directory)

This visual displays a sample of chest X-ray images used for training a machine learning model to detect pneumonia, including cases of COVID-19. The dataset appears to be divided into three categories:

1. `normal` - These images represent a set of normal chest X-rays where no pneumonia is present. They serve as the control group in the dataset. The images have a consistent clarity where lung fields, the heart, and the diaphragm boundaries can be clearly seen.

2. `covid` - This category includes X-rays of patients who have been diagnosed with COVID-19. The images typically show signs of pneumonia, such as hazy opacities in the lung fields, which are indicative of viral infections like COVID-19.

3. `virus` - These X-rays represent cases of viral pneumonia that are not caused by the COVID-19 virus. Features may include diffuse lung involvement and other patterns that differentiate them from the normal and COVID-19 categories.

In [3]:
# plot_class_distribution(train_directory)


The bar chart illustrates the distribution of different classes of chest X-ray images in a training dataset. This distribution is critical for understanding the dataset's composition and potential biases that could affect a machine learning model's training process.

- `normal`: Represents a large number of images showing no signs of pneumonia. This class has the highest count, indicating a substantial set of examples for the model to learn what a healthy chest X-ray looks like.

- `covid`: Contains fewer images than the `normal` class. These images are of patients diagnosed with COVID-19 and display characteristics of viral pneumonia in chest X-rays.

- `virus`: Slightly fewer images than the `normal` class but more than the `covid` class. This group includes images of viral pneumonia not attributed to COVID-19.

The disparity in the number of images across classes can lead to class imbalance issues, where a model might become biased towards the class with more examples.

In [4]:
# classifier = ChestXRayClassifier(data_directory, csv_file)

In [5]:
# history_vit = classifier.compile_and_train(model_choice='vit')

In [6]:
# classifier.plot_history(history_vit, model_name='Vision Transformer')

In [7]:
# history_efficientnet = classifier.compile_and_train(model_choice='efficientnet')

In [8]:
# classifier.plot_history(history_efficientnet, model_name='EfficientNet')

In [9]:
# classifier.evaluate_model(test_directory)

In [10]:
# classifier.save_model('Models/h5/main.h5')

In [11]:
# import tensorflow as tf


# # Specify the path to save the Keras model
# keras_model_path = 'Models/h5/'
# classifier.save_model(keras_model_path)

# # Convert the model to TFLite format
# converter = tf.lite.TFLiteConverter.from_keras_model(classifier.model)
# tflite_model = converter.convert()

# # Save the TFLite model to a binary file
# tflite_model_path = 'Models/tf/main.tflite'
# with open(tflite_model_path, 'wb') as f:
#     f.write(tflite_model)

In [12]:
# import pandas as pd
# from tensorflow.keras.models import load_model

# model = load_model("Models/h5/")

# image_preprocessor = ImagePreprocessor(target_size=(224, 224))

# submission_df = pd.read_csv("sample_submission.csv")

# test_datagen = ImageDataGenerator(rescale=1./255)
# test_generator = test_datagen.flow_from_dataframe(
#     dataframe=submission_df,
#     directory='Data/test',
#     x_col='Image',
#     y_col=None,
#     class_mode=None,
#     target_size=(224, 224),
#     shuffle=False,
#     batch_size=32
# )

# predictions = model.predict(test_generator, steps=len(test_generator))

# predicted_labels = (predictions > 0.5).astype(int).flatten()

# submission_df['Label'] = predicted_labels

# submission_df.to_csv('Sub/kaggle_submission.csv', index=False)
