<a href="https://colab.research.google.com/github/LujainAK/GP/blob/main/DataSplitCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Data Split Code**

### Import Libraries

In [1]:
from google.colab import drive
import pandas as pd
import os
import shutil
import json
import numpy as np
import random

### Read the csv file and specify paths

In [2]:
# Mount Google Drive at /content/drive; every dataset path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Label table of the dataset (one row per image), stored on the mounted Drive.
csv_file = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/full_df.csv'
# Directory holding the image files that the CSV's `filename` column refers to.
image_directory = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/preprocessed_images'
# Load the whole label table into a DataFrame.
df = pd.read_csv(csv_file)

### Data Exploring

In [4]:
# Show the first 5 rows of the csv file
df.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [6]:
# Columns that are not needed for the split; after dropping them only the
# identifier, the encoded target and the image file name remain.
columns_to_drop = [
    'Patient Age', 'Patient Sex',
    'Left-Fundus', 'Right-Fundus',
    'Left-Diagnostic Keywords', 'Right-Diagnostic Keywords',
    'filepath',
    'N', 'D', 'G', 'C', 'A', 'H', 'M', 'O',
    'labels',
]
raw_data = df.drop(columns=columns_to_drop)

raw_data.head()

Unnamed: 0,ID,target,filename
0,0,"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


In [7]:
# Number of distinct values in the `target` column, i.e. how many different
# encoded label vectors (disease classes) the dataset contains.
len(raw_data["target"].unique())

8

In [8]:
# (number of rows, number of columns) of the trimmed label table.
raw_data.shape

(6392, 3)

In [9]:
# The `target` column stores every label vector as JSON text (e.g. "[1, 0, ...]").
# Decode each entry into a Python list, then stack the lists into one 2-D array
# with one row per image.
decoded_targets = [json.loads(encoded) for encoded in raw_data["target"]]
targets = np.array(decoded_targets)

targets.shape

(6392, 8)

In [10]:
# Disease names listed in one-hot order: the position of a name in this tuple
# is the index of the matching entry in each `target` vector.
classes = dict(enumerate((
    "Normal",
    "Diabetes",
    "Glaucoma",
    "Cataract",
    "Age related Macular Degeneration",
    "Hypertension",
    "Pathological Myopia",
    "Other diseases/abnormalities",
)))

In [11]:
# Per-class image counts: column-wise sum over the one-hot target matrix.
data = targets.sum(axis=0)

# Parallel lists: class names (in index order) and the count for each class.
classes_names = [name for name in classes.values()]
values = [count for count in data]

In [12]:
# For every row, find the position of the largest entry of its target vector
# (the hot index) ...
hot_index_per_row = targets.argmax(axis=1).tolist()
raw_data["class_name"] = hot_index_per_row
# ... and translate that index into the human-readable disease name.
raw_data["class_name"] = raw_data["class_name"].replace(classes)

raw_data.head()

Unnamed: 0,ID,target,filename,class_name
0,0,"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg,Normal
1,1,"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg,Normal
2,2,"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg,Diabetes
3,4,"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg,Diabetes
4,5,"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg,Diabetes


In [13]:
# Columns (and their order) of the cleaned label table.
label_columns = ["ID", "filename", "class_name", "target"]
processed_labels = raw_data[label_columns]

# Save the cleaned labels on Drive; the DataFrame index is not written to the file.
labels_csv_path = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset/labels_clean.csv'
processed_labels.to_csv(labels_csv_path, index=False)

processed_labels.head()

Unnamed: 0,ID,filename,class_name,target
0,0,0_right.jpg,Normal,"[1, 0, 0, 0, 0, 0, 0, 0]"
1,1,1_right.jpg,Normal,"[1, 0, 0, 0, 0, 0, 0, 0]"
2,2,2_right.jpg,Diabetes,"[0, 1, 0, 0, 0, 0, 0, 0]"
3,4,4_right.jpg,Diabetes,"[0, 1, 0, 0, 0, 0, 0, 0]"
4,5,5_right.jpg,Diabetes,"[0, 1, 0, 0, 0, 0, 0, 0]"


In [14]:
# (number of rows, number of columns) of the cleaned label table.
processed_labels.shape

(6392, 4)

### Split data into disease folders

In [15]:
# Move every image listed in the label table from `image_directory` into a
# sub-folder named after its disease. The cell is safe to re-run: an image is
# moved only if it is still in `image_directory` and is not already present in
# the disease folder or in that folder's "training"/"testing" sub-folders.
for image_name, disease in zip(processed_labels['filename'], processed_labels['class_name']):
    # Create a folder for the disease if it doesn't exist
    disease_folder = os.path.join(image_directory, disease)
    os.makedirs(disease_folder, exist_ok=True)

    # Nothing to do when the image is no longer (or never was) in the source directory
    image_path = os.path.join(image_directory, image_name)
    if not os.path.exists(image_path):
        continue

    # Locations the image may already occupy from an earlier run.
    # `disease_folder` already contains `image_directory`, so it must not be
    # joined with it a second time (that only works for absolute paths).
    new_image_path = os.path.join(disease_folder, image_name)
    train_image_path = os.path.join(disease_folder, "training", image_name)
    test_image_path = os.path.join(disease_folder, "testing", image_name)
    already_placed = any(
        os.path.exists(candidate)
        for candidate in (new_image_path, train_image_path, test_image_path)
    )

    # Move the image to the disease folder
    if not already_placed:
        shutil.move(image_path, new_image_path)

print("Images have been organized into disease folders in Google Drive.")

Images have been organized into disease folders in Google Drive.


### Split each disease folder into 70% training and 30% testing

In [16]:
# Fraction of each disease's images that goes to training; the rest go to testing.
TRAIN_FRACTION = 0.7
# Fixed seed: together with the sorted listings below it makes the split
# reproducible, so re-running on the same files yields the same assignment.
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# os.listdir returns entries in arbitrary order; sort so the RNG is consumed
# in the same sequence on every run.
for disease_folder in sorted(os.listdir(image_directory)):
    disease_folder_path = os.path.join(image_directory, disease_folder)
    if not os.path.isdir(disease_folder_path):
        continue

    # Create training and testing folders within the disease folder
    training_folder = os.path.join(disease_folder_path, 'training')
    os.makedirs(training_folder, exist_ok=True)
    testing_folder = os.path.join(disease_folder_path, 'testing')
    os.makedirs(testing_folder, exist_ok=True)

    # Only the ".jpg" files that sit directly in the disease folder are split;
    # sorting gives the shuffle a deterministic starting order.
    images = sorted(
        file for file in os.listdir(disease_folder_path) if file.endswith(".jpg")
    )

    # Number of images for training (rounded down); all remaining images are for testing
    num_training_images = int(TRAIN_FRACTION * len(images))

    # Randomly assign images to the training and testing folders
    split_rng.shuffle(images)
    assignments = (
        (training_folder, images[:num_training_images]),
        (testing_folder, images[num_training_images:]),
    )

    # Move each image into the folder it was assigned to
    for destination_folder, assigned_images in assignments:
        for image in assigned_images:
            image_path = os.path.join(disease_folder_path, image)
            new_image_path = os.path.join(destination_folder, image)
            shutil.move(image_path, new_image_path)

print("Images have been split into 70% training and 30% testing.")

Images have been split into 70% training and 30% testing.


### Move the data into general Training and Testing folders

In [17]:
# Classes of interest
# NOTE: this rebinds the name `classes`. From this cell on it holds only these
# five diseases, so any cell that reads `classes` afterwards sees the reduced
# mapping, not the eight-class one defined earlier in the notebook.
classes = dict(zip(range(5), (
    "Normal",
    "Diabetes",
    "Glaucoma",
    "Cataract",
    "Age related Macular Degeneration",
)))

In [18]:
# Root of the dataset on Drive; the two general split folders are created
# directly beneath it (existing folders are left untouched).
ODIR_path = '/content/drive/MyDrive/Colab Notebooks/ODIR Dataset'
training_folder, testing_folder = (
    os.path.join(ODIR_path, split_name) for split_name in ('training', 'testing')
)
for split_folder in (training_folder, testing_folder):
    os.makedirs(split_folder, exist_ok=True)
print("The general training and testing folders have been created.")

The general training and testing folders have been created.


In [19]:
# NOTE(review): these two sets are created here but nothing in this cell adds
# to them; they are kept only so the names remain defined.
training_paths = set()
testing_paths = set()


def _move_split_folder(disease_folder_path, disease_folder, split_name, general_folder):
    """Rename `<disease>/<split_name>` and move it into `general_folder`.

    The split folder is first renamed to `<disease>_<split_name>` (so folders of
    different diseases do not collide) and then moved to
    `general_folder/<disease>_<split_name>`.

    Parameters:
        disease_folder_path: path of the disease folder that holds the split folder.
        disease_folder: name of the disease (used as prefix of the new folder name).
        split_name: 'training' or 'testing'.
        general_folder: destination folder that collects this split for all diseases.

    Nothing is moved when the split folder does not exist, or when the
    destination already exists (a notice is printed in the latter case).
    """
    split_path = os.path.join(disease_folder_path, split_name)
    renamed_path = os.path.join(disease_folder_path, disease_folder + '_' + split_name)
    if os.path.isdir(split_path):
        os.rename(split_path, renamed_path)

    # Move only a folder that actually exists. (The original test was inverted:
    # it skipped existing folders and tried to move missing ones.)
    if not os.path.isdir(renamed_path):
        return

    destination = os.path.join(general_folder, os.path.basename(renamed_path))
    if os.path.exists(destination):
        # shutil.move would raise here; leave the source in place instead.
        print(f"Skipping {renamed_path}: {destination} already exists.")
        return
    shutil.move(renamed_path, destination)


# Names of the diseases whose folders are collected
classes_of_interest = set(classes.values())

for disease_folder in os.listdir(image_directory):
    # Check if the disease is of our interest
    if disease_folder not in classes_of_interest:
        continue
    disease_folder_path = os.path.join(image_directory, disease_folder)

    # Move the training folder into the general training folder
    _move_split_folder(disease_folder_path, disease_folder, 'training', training_folder)
    # Move the testing folder into the general testing folder
    _move_split_folder(disease_folder_path, disease_folder, 'testing', testing_folder)

print("The training and testing folders for each disease have been moved into the general training and testing folders.")

The training and testing folders for each disease have been moved into the general training and testing folders.


### Create a csv file that contains all the images of interest

In [23]:
# Collect the names of all files found one level below the general
# training folder and the general testing folder (training entries first).
image_file_names = []

training_folder = os.path.join(ODIR_path, 'training')
testing_folder = os.path.join(ODIR_path, 'testing')

for split_folder in (training_folder, testing_folder):
    for disease_subfolder in os.listdir(split_folder):
        disease_subfolder_path = os.path.join(split_folder, disease_subfolder)
        image_file_names += os.listdir(disease_subfolder_path)

In [27]:
# Total number of file names collected from the general training and testing folders
len(image_file_names)

5324

In [28]:
# Create a csv file for the images of interest

# Keep only the label rows whose image file was found in the general
# training/testing folders. A set makes each membership test constant-time,
# and the vectorised filter keeps all columns even when no row matches.
selected_file_names = set(image_file_names)
is_selected = processed_labels['filename'].isin(selected_file_names)
# .copy() so the column assignment below modifies an independent frame
filtered_df = processed_labels[is_selected].copy()

# Modify the 'target' column to contain only the classes of interest:
# one 0/1 entry per class in `classes`, with 1 at the row's own class.
class_names_of_interest = list(classes.values())
filtered_df['target'] = filtered_df['class_name'].apply(
    lambda x: [1 if i == x else 0 for i in class_names_of_interest]
)

# Write the new DataFrame to a new CSV file
new_csv_file_path = os.path.join(ODIR_path, 'filtered.csv')
filtered_df.to_csv(new_csv_file_path, index=False)

print("The new csv file has been created successfully.")

The new csv file has been created successfully.


In [29]:
# Preview the first rows of the filtered label table (targets now have one entry per class of interest).
filtered_df.head()

Unnamed: 0,ID,filename,class_name,target
0,0,0_right.jpg,Normal,"[1, 0, 0, 0, 0]"
1,1,1_right.jpg,Normal,"[1, 0, 0, 0, 0]"
2,2,2_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
3,4,4_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
4,5,5_right.jpg,Diabetes,"[0, 1, 0, 0, 0]"
