In [None]:
# Project: classify retinal images into the five diabetic-retinopathy stages: No_DR, Mild, Moderate, Severe and Proliferate_DR.

In [8]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
import random
from PIL import Image
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [5]:
# Root folder that holds one sub-folder of images per class.
# NOTE(review): machine-specific absolute Windows path; it must be edited before
# the notebook can run anywhere else.
base_dir = r"C:\Users\UltraBook 3.1\Desktop\data_analysis projects\Healthcare_projects\diabetic_retinopathy\colored_images"

In [74]:
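# Open each class folder and augment it until every class holds the same number of images.
# Target per class = 8 x the largest class (see `target_count` below): 14440 per class in the recorded run (72200 images found across 5 classes).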

def count_images_per_folder(base_dir):
    """Count the ``.png`` files directly inside each sub-folder of ``base_dir``.

    Args:
        base_dir: Directory whose immediate sub-folders are inspected.

    Returns:
        dict mapping each sub-folder name to the number of regular files in it
        whose name ends with ``.png``. Entries of ``base_dir`` that are not
        directories are ignored.
    """
    counts = {}
    for entry in os.listdir(base_dir):
        class_dir = os.path.join(base_dir, entry)
        if not os.path.isdir(class_dir):
            continue
        png_files = [
            name
            for name in os.listdir(class_dir)
            if name.endswith('.png') and os.path.isfile(os.path.join(class_dir, name))
        ]
        counts[entry] = len(png_files)
    return counts

# target_count = max(class_dict.values()) * 8


def additional_augmented_img(class_dict, target_count):
    """Work out how many extra images each class needs to reach ``target_count``.

    Args:
        class_dict: dict mapping class-folder name to its current image count.
        target_count: Desired number of images per class.

    Returns:
        dict mapping each folder name to ``target_count - count``. The value is
        negative when a class already exceeds the target.
    """
    return {folder: target_count - count for folder, count in class_dict.items()}


# Augmentation settings used when synthesising extra images for each class.
augmentation_settings = dict(
    rotation_range=15,
    zoom_range=0.1,
    brightness_range=[0.8, 1.2],
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_settings)


def data_augment_process(base_dir, augmented, class_dict, datagen, batch_size):
    """Write augmented copies into each class folder until its quota is met.

    For every sub-folder of ``base_dir`` present in both ``augmented`` and
    ``class_dict``, the existing ``.png`` images are cycled through repeatedly.
    Each pass over an image produces one randomly transformed copy, saved next
    to the originals as ``<folder>_aug_<n>.png``.

    Args:
        base_dir: Directory containing one sub-folder per class.
        augmented: dict mapping folder name to the number of images to create.
            Folders with a value <= 0 are left untouched.
        class_dict: dict mapping folder name to its current image count; the
            numbering of new files starts at ``count + 1``.
        datagen: Generator exposing ``flow(x, batch_size=...)`` (an
            ``ImageDataGenerator``).
        batch_size: Forwarded to ``datagen.flow``. Each call is given a single
            image, so in practice one augmented sample comes back per call.

    Returns:
        None. The function only has the side effect of writing image files.
    """
    for folder in os.listdir(base_dir):
        full_path = os.path.join(base_dir, folder)
        if not os.path.isdir(full_path):
            continue
        if folder not in augmented or folder not in class_dict:
            continue

        count_needed = augmented[folder]
        if count_needed <= 0:
            # Class already at (or above) the target: nothing to generate.
            continue

        existing_images = [f for f in os.listdir(full_path) if f.endswith('.png')]
        if not existing_images:
            # Without source images the generation loop below could never make
            # progress and would spin forever, so skip the folder instead.
            print(f"Skipping '{folder}': no .png source images to augment.")
            continue

        img_index = class_dict[folder] + 1
        generated = 0
        while generated < count_needed:
            # Keep cycling over the source images until the quota is reached.
            for img_name in existing_images:
                img = load_img(os.path.join(full_path, img_name))
                x = img_to_array(img)
                x = x.reshape((1,) + x.shape)  # add the leading batch axis
                # flow() yields batches endlessly; only the first one is used.
                batch = next(iter(datagen.flow(x, batch_size=batch_size)))
                for j in range(batch.shape[0]):
                    if generated >= count_needed:
                        break
                    save_path = os.path.join(full_path, f"{folder}_aug_{img_index}.png")
                    array_to_img(batch[j]).save(save_path)
                    img_index += 1
                    generated += 1
                if generated >= count_needed:
                    break




In [75]:
# Balance the dataset: bring every class up to 8x the size of the largest class.
class_dict = count_images_per_folder(base_dir)
largest_class_size = max(class_dict.values())
target_count = largest_class_size * 8
augmented = additional_augmented_img(class_dict, target_count)
data_augment_process(
    base_dir=base_dir,
    augmented=augmented,
    class_dict=class_dict,
    datagen=datagen,
    batch_size=10,
)

In [76]:


# Project root; the image-to-label CSV is written here.
main_dir = r"C:\Users\UltraBook 3.1\Desktop\data_analysis projects\Healthcare_projects\diabetic_retinopathy"


# Class-folder name -> integer label, ordered by increasing severity.
data_dict = dict(
    No_DR=0,
    Mild=1,
    Moderate=2,
    Severe=3,
    Proliferate_DR=4,
)

def pairing_data(base_dir, data_dict):
    """Map every ``.png`` image name (without extension) to its class label.

    Only regular files whose name ends with ``.png`` are paired, matching the
    filter used when counting and augmenting images. Other directory entries
    (stray non-image files, nested folders) are ignored rather than labelled.

    Args:
        base_dir: Directory containing one sub-folder per class.
        data_dict: dict mapping class-folder name to its label. Folders that do
            not exist under ``base_dir`` are skipped.

    Returns:
        dict mapping image name (file name minus ``.png``) to the class label.
        If the same image name occurs in several class folders, the label of
        the folder visited last wins.
    """
    data = {}
    for class_name, class_label in data_dict.items():
        folder_path = os.path.join(base_dir, class_name)
        if not os.path.isdir(folder_path):
            continue
        for file in os.listdir(folder_path):
            if not file.endswith('.png'):
                continue
            if not os.path.isfile(os.path.join(folder_path, file)):
                continue
            image_name = os.path.splitext(file)[0]  # strip the .png extension
            data[image_name] = class_label
    return data


def save_dict_to_csv(data, output_path):
    """Write an image-name -> label mapping to ``image_labels.csv``.

    Args:
        data: dict mapping image name to label.
        output_path: Existing directory in which ``image_labels.csv`` is created.

    The CSV has the columns ``image_name`` and ``label`` and no index column.
    """
    rows = [(name, label) for name, label in data.items()]
    table = pd.DataFrame(rows, columns=['image_name', 'label'])
    destination = os.path.join(output_path, "image_labels.csv")
    table.to_csv(destination, index=False)
         

In [77]:
# Build the image-name -> label table and store it in the project root.
image_labels = pairing_data(base_dir, data_dict)
save_dict_to_csv(image_labels, main_dir)

In [84]:

# # image size
# img_size = (224, 224)

# X_train, y_train = [], []
# X_test, y_test = [], []

# for class_name, label in data_dict.items():
#     folder_path = os.path.join(base_dir, class_name)
#     image_files = [f for f in os.listdir(folder_path) if f.endswith('.png')]
#     random.shuffle(image_files)  # shuffle for randomness
    
#     split_index = int(0.8 * len(image_files))
    
#     train_files = image_files[:split_index]
#     test_files = image_files[split_index:]
    
#     # Load training images
#     for file in train_files:
#         img_path = os.path.join(folder_path, file)
#         img = Image.open(img_path).resize(img_size)
#         img_array = np.array(img, dtype=np.float32) / 255.0
#         X_train.append(img_array)
#         y_train.append(label)
    
#     # Load testing images
#     for file in test_files:
#         img_path = os.path.join(folder_path, file)
#         img = Image.open(img_path).resize(img_size)
#         img_array = np.array(img, dtype=np.float32) / 255.0
#         X_test.append(img_array)
#         y_test.append(label)

# # Convert to NumPy arrays
# X_train, y_train = np.array(X_train), np.array(y_train)
# X_test, y_test = np.array(X_test), np.array(y_test)
# print("Training set:", X_train.shape, y_train.shape)
# print("Testing set:", X_test.shape, y_test.shape)

In [10]:
img_size = (224, 224)
batch_size = 32

# A separate name is used on purpose: rebinding `datagen` here would replace the
# augmentation generator that data_augment_process() relies on, so re-running
# the augmentation cell afterwards would silently write un-augmented copies.
split_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2  # 80% train, 20% held out
)

# NOTE: flow_from_directory assigns class indices in alphanumeric folder order
# unless `classes=` is passed, so they do NOT follow the numbering in
# `data_dict`. Check `train_generator.class_indices` before mapping predictions
# back to class names.
# NOTE(review): the augmented copies were written into these same class folders
# before the split, so variants of one source image can land in both subsets.
# Treat the held-out accuracy as optimistic.
flow_options = dict(
    directory=base_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',  # one-hot encoded labels
)

train_generator = split_datagen.flow_from_directory(
    subset='training',
    shuffle=True,
    **flow_options
)

# Kept unshuffled so that predictions line up with `test_generator.classes`.
test_generator = split_datagen.flow_from_directory(
    subset='validation',
    shuffle=False,
    **flow_options
)


Found 57760 images belonging to 5 classes.
Found 14440 images belonging to 5 classes.


In [11]:
# Baseline CNN: three Conv2D + MaxPooling2D stages (32, 64 and 128 filters),
# followed by a small dense classifier head.
model = Sequential()
model.add(keras.Input(shape=(224, 224, 3)))

for n_filters in (32, 64, 128):
    model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(MaxPooling2D(2, 2))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(5, activation='softmax'))  # one output per class (5 classes)


In [12]:
# Print the model's layer-by-layer summary.
model.summary()

In [13]:
# Adam with a small learning rate; categorical cross-entropy matches the
# one-hot labels produced by class_mode='categorical'.
optimizer = Adam(learning_rate=1e-4)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train for 10 epochs; the held-out generator doubles as the validation set,
# so its metrics are reported after every epoch.
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=test_generator,
    verbose=1,
)


Epoch 1/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2270s[0m 1s/step - accuracy: 0.4803 - loss: 1.2294 - val_accuracy: 0.6015 - val_loss: 0.9696
Epoch 2/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2048s[0m 1s/step - accuracy: 0.6235 - loss: 0.9188 - val_accuracy: 0.7096 - val_loss: 0.7367
Epoch 3/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1967s[0m 1s/step - accuracy: 0.7263 - loss: 0.7024 - val_accuracy: 0.8117 - val_loss: 0.5169
Epoch 4/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1929s[0m 1s/step - accuracy: 0.8078 - loss: 0.5093 - val_accuracy: 0.8757 - val_loss: 0.3703
Epoch 5/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1929s[0m 1s/step - accuracy: 0.8564 - loss: 0.3934 - val_accuracy: 0.8925 - val_loss: 0.3015
Epoch 6/10
[1m1805/1805[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1923s[0m 1s/step - accuracy: 0.8885 - loss: 0.3044 - val_accuracy: 0.9154 - val_loss: 0.2486
Epoc

In [99]:
# Class probabilities for every held-out image (one row per image).
predictions = model.predict(test_generator, verbose=1)
print(predictions.shape)

# Most probable class index per image.
predicted_classes = predictions.argmax(axis=1)

# Loss and accuracy on the same held-out generator.
evaluation = model.evaluate(test_generator)
loss, accuracy = evaluation
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 346ms/step
(14440, 5)


In [100]:
# for key, val in data_dict.items():
#     if val in predicted_classes: 
#         print(f"{key})

array([0, 0, 0, ..., 4, 4, 4])

In [97]:
# save the model
# model.save("Diabetic_Retinopathy_CNN.h5")



In [2]:
# from tensorflow.keras.models import load_model

# model = load_model('Diabetic_Retinopathy_CNN.h5')  # or .keras



In [7]:
# Visualize training vs. validation accuracy per epoch
fig, ax = plt.subplots()
curves = (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
)
for metric_key, curve_label in curves:
    ax.plot(history.history[metric_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy')
ax.legend()
ax.grid(True)
plt.show()

  self._warn_if_super_not_called()


[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 375ms/step - accuracy: 0.9362 - loss: 0.1976
Test Accuracy: 93.82%
