<a href="https://colab.research.google.com/github/LujainAK/GP/blob/main/DataAugmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Data Preprocessing Code**

### Import Libraries

In [54]:
from google.colab import drive
import pandas as pd
import os
import cv2
from keras.preprocessing.image import ImageDataGenerator

### Mount Google Drive, then read the CSV file and specify paths

In [36]:
# Mount Google Drive at /content/drive; the dataset paths used in the following cells live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Data Augmentation

In [37]:
# Dataset root folder on the mounted Drive and the CSV file that is loaded below
ODIR_path = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset'
filtered_csv = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/filtered.csv'

# Read the CSV once; `df` is a second name for the same DataFrame object (no copy is made)
filtered_df = pd.read_csv(filtered_csv)
df = filtered_df

In [38]:
# Count the rows per class name to see how (un)balanced the classes are
class_labels = filtered_df['class_name']
class_counts = class_labels.value_counts()
print(class_counts)

Normal                              2873
Diabetes                            1608
Cataract                             293
Glaucoma                             284
Age related Macular Degeneration     266
Name: class_name, dtype: int64


In [39]:
# Integer label -> class name. The insertion order of this mapping is the order
# used when building the one-hot `target` lists later in the notebook.
classes = dict(enumerate([
    "Normal",
    "Diabetes",
    "Glaucoma",
    "Cataract",
    "Age related Macular Degeneration",
]))

In [40]:
# Build the output folder tree for augmented images:
#   <ODIR_path>/augmented_training/<class>_augmented_training
#   <ODIR_path>/augmented_testing/<class>_augmented_testing
augmented_training = os.path.join(ODIR_path, 'augmented_training')
augmented_testing = os.path.join(ODIR_path, 'augmented_testing')

# exist_ok=True keeps this cell safe to re-run when the folders already exist
for split_root in (augmented_training, augmented_testing):
  os.makedirs(split_root, exist_ok=True)

for cls in classes.values():
  disease_augmented_training = os.path.join(augmented_training, f'{cls}_augmented_training')
  disease_augmented_testing = os.path.join(augmented_testing, f'{cls}_augmented_testing')
  for class_folder in (disease_augmented_training, disease_augmented_testing):
    os.makedirs(class_folder, exist_ok=True)

print("Folders for the augmented images in augmented training and augmented testing folders have been created.")


Folders for the augmented images in augmented training and augmented testing folders have been created.


In [41]:
# Locations of the original (non-augmented) training and testing images
training_folder = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/training'
testing_folder = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/testing'

# The most frequent class in the CSV is the one every other class is balanced against
max_class = class_counts.idxmax()
max_class_training = os.path.join(training_folder, f'{max_class}_training')
max_class_testing = os.path.join(testing_folder, f'{max_class}_testing')

In [42]:
# Random transformations applied to every generated image
datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
)

In [43]:
# Get the size of the original images from one sample of the largest class.
# The (height, width) pair is later passed as `target_size` to the generators.
max_class_training_files = os.listdir(max_class_training)
if not max_class_training_files:
  # Fail with a clear message instead of an IndexError on an empty folder
  raise FileNotFoundError("No files found in " + max_class_training)

sample_image_path = os.path.join(max_class_training, max_class_training_files[0])
sample_image = cv2.imread(sample_image_path)
if sample_image is None:
  # cv2.imread returns None (it does not raise) when the file cannot be decoded
  raise ValueError("Could not read sample image: " + sample_image_path)

original_size = sample_image.shape[:2]  # Retrieve the height and width of the image

In [44]:
# Balance the training set: for every class smaller than the largest class,
# generate augmented images until the class has as many images as the largest one.
# The size of the largest class does not change inside the loop, so list it once.
max_class_training_count = len(os.listdir(max_class_training))

for cls in classes.values():
  # Augmentation in training
  disease_training = os.path.join(training_folder, (cls + '_training'))
  disease_augmented_training = os.path.join(augmented_training, (cls + '_augmented_training'))
  training_difference = max_class_training_count - len(os.listdir(disease_training))

  # Only generate what is still missing, so re-running this cell after a
  # partial run tops the folder up instead of adding the full difference again.
  training_missing = training_difference - len(os.listdir(disease_augmented_training))
  if training_missing > 0:
    training_augmented_images = datagen.flow_from_directory(
            training_folder,
            target_size= original_size,
            batch_size = 1,
            classes = [(cls+'_training')],
            class_mode = None,
            save_to_dir = disease_augmented_training,
            save_prefix = ('augmented_' + cls + '_'),
            save_format = 'jpg',
            shuffle = False
            )
    # Each batch (batch_size = 1) writes one augmented image to save_to_dir.
    # The builtin next() is used because the iterator's .next() method is not
    # available in every Keras version.
    for _ in range(training_missing):
      next(training_augmented_images)

print("Augmented images for training have been created.")

Augmented images for training have been created.


In [45]:
# Report how many augmented training images each class folder now holds
for class_folder_name in os.listdir(augmented_training):
  class_folder_path = os.path.join(augmented_training, class_folder_name)
  image_count = len(os.listdir(class_folder_path))
  print(f"{class_folder_name} = ", image_count)

Normal_augmented_training =  0
Diabetes_augmented_training =  886
Glaucoma_augmented_training =  1813
Cataract_augmented_training =  1806
Age related Macular Degeneration_augmented_training =  1825


In [46]:
# Balance the testing set: for every class smaller than the largest class,
# generate augmented images until the class has as many images as the largest one.
# The size of the largest class does not change inside the loop, so list it once.
max_class_testing_count = len(os.listdir(max_class_testing))

for cls in classes.values():
  #Augmentation in testing
  disease_testing = os.path.join(testing_folder, (cls + '_testing'))
  disease_augmented_testing = os.path.join(augmented_testing, (cls + '_augmented_testing'))
  testing_difference = max_class_testing_count - len(os.listdir(disease_testing))

  # Only generate what is still missing, so re-running this cell after a
  # partial run tops the folder up instead of adding the full difference again.
  testing_missing = testing_difference - len(os.listdir(disease_augmented_testing))
  if testing_missing > 0:
    testing_augmented_images = datagen.flow_from_directory(
            testing_folder,
            target_size = original_size,
            batch_size = 1,
            classes = [(cls+'_testing')],
            class_mode = None,
            save_to_dir = disease_augmented_testing,
            save_prefix = 'augmented_' + cls + '_',
            save_format ='jpg',
            shuffle = False
            )
    # Generate the missing number of augmented images (one per batch).
    # The builtin next() is used because the iterator's .next() method is not
    # available in every Keras version.
    for _ in range(testing_missing):
      next(testing_augmented_images)

print("Augmented images for testing have been created.")

Augmented images for testing have been created.


In [47]:
# Report how many augmented testing images each class folder now holds
for class_folder_name in os.listdir(augmented_testing):
  class_folder_path = os.path.join(augmented_testing, class_folder_name)
  image_count = len(os.listdir(class_folder_path))
  print(f"{class_folder_name} = ", image_count)

Normal_augmented_testing =  0
Glaucoma_augmented_testing =  776
Cataract_augmented_testing =  774
Age related Macular Degeneration_augmented_testing =  782
Diabetes_augmented_testing =  379


### Create a CSV file for the augmented images in training and testing

In [48]:
# Collect one row [ID, filename, class_name, target] per augmented image,
# visiting each class's augmented training folder first and then its testing folder.
augmented_data = []
id_counter = 0

for cls in classes.values():
  disease_augmented_training = os.path.join(augmented_training, (cls + '_augmented_training'))
  disease_augmented_testing = os.path.join(augmented_testing, (cls + '_augmented_testing'))

  for split_folder in (disease_augmented_training, disease_augmented_testing):
    # Skip folders that do not exist
    if not os.path.isdir(split_folder):
      continue

    for image_file in os.listdir(split_folder):
      # One-hot list over the class names, in the order of `classes`
      target = [int(class_name == cls) for class_name in classes.values()]
      augmented_data.append([id_counter, image_file, cls, target])
      id_counter += 1

# Turn the collected rows into a dataframe
augmented_df = pd.DataFrame(augmented_data, columns=["ID", "filename", "class_name", "target"])

# Write the dataframe next to the dataset as augmented.csv
augmented_csv_path = os.path.join(ODIR_path, 'augmented.csv')
augmented_df.to_csv(augmented_csv_path, index=False)

print("CSV file for the augmented images has been created.")

CSV file for the augmented images has been created.


In [49]:
# Preview the first rows of the augmented-image table
augmented_df.head()

Unnamed: 0,ID,filename,class_name,target
0,0,augmented_Diabetes__0_4126838.jpg,Diabetes,"[0, 1, 0, 0, 0]"
1,1,augmented_Diabetes__1_5142113.jpg,Diabetes,"[0, 1, 0, 0, 0]"
2,2,augmented_Diabetes__2_5013863.jpg,Diabetes,"[0, 1, 0, 0, 0]"
3,3,augmented_Diabetes__3_2848479.jpg,Diabetes,"[0, 1, 0, 0, 0]"
4,4,augmented_Diabetes__4_1365883.jpg,Diabetes,"[0, 1, 0, 0, 0]"


In [50]:
# (number of augmented images, number of columns)
augmented_df.shape

(9041, 4)

In [51]:
# For every class, report original + augmented image counts in both splits
print("Total numbers in each class after augmentation: ")

for cls in classes.values():
  disease_training = os.path.join(training_folder, (cls + '_training'))
  disease_augmented_training = os.path.join(augmented_training, (cls + '_augmented_training'))
  total_training = len(os.listdir(disease_training)) + len(os.listdir(disease_augmented_training))
  print(cls + " training = ", total_training)

  disease_testing = os.path.join(testing_folder, (cls + '_testing'))
  disease_augmented_testing = os.path.join(augmented_testing, (cls + '_augmented_testing'))
  total_testing = len(os.listdir(disease_testing)) + len(os.listdir(disease_augmented_testing))
  # Leading space added so the label reads "<class> testing" like the training line
  print(cls + " testing = ", total_testing)


Total numbers in each class after augmentation: 
Normal training =  2011
Normaltesting =  862
Diabetes training =  2011
Diabetestesting =  862
Glaucoma training =  2011
Glaucomatesting =  862
Cataract training =  2011
Cataracttesting =  862
Age related Macular Degeneration training =  2011
Age related Macular Degenerationtesting =  862


### Create a CSV file combining the original and augmented images

In [52]:
# Stack the original and augmented rows into one table, then replace the old
# per-table 'ID' values with a fresh 0..N-1 numbering so no ID is repeated.
df = (
    pd.concat([filtered_df, augmented_df], ignore_index=True)
    .drop('ID', axis=1)
    .reset_index()
    .rename(columns={'index': 'ID'})
)

# Save the combined table next to the dataset
new_csv_file_path = os.path.join(ODIR_path, '14k_data.csv')
df.to_csv(new_csv_file_path, index=False)

print("The new csv file has been created successfully.")

The new csv file has been created successfully.


In [53]:
# Preview the first rows of the combined (original + augmented) table
df.head()

Unnamed: 0,ID,filename,class_name,target
0,0,0_right.jpg,Normal,"[1, 0, 0, 0, 0]"
1,1,1_right.jpg,Normal,"[1, 0, 0, 0, 0]"
2,2,2_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
3,3,4_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
4,4,5_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
