# CSV Preprocessing

In [2]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the JustRAIGS training labels table; the cells below read its
# 'Final Label' column from this frame.
df = pd.read_csv('/kaggle/input/justraigs-with-paths/JustRAIGS_Train_labels_with_paths.csv')

In [4]:
# Tally how many rows carry each 'Final Label' value and show the raw table
label_counts = df['Final Label'].value_counts().reset_index()
print(label_counts)

# Give the two columns plot-friendly names
label_counts.columns = ['Label', 'Count']

# Draw the class distribution as a bar chart and annotate it through the
# Axes object that seaborn returns
sns.barplot(data=label_counts, x='Label', y='Count').set(
    title='Final Label Distribution',
    xlabel='Label',
    ylabel='Count',
)

# Render the figure
plt.show()

In [5]:
def downsample_majority_class(df, rg_class='RG', nrg_class='NRG', ratio=1):
    """Downsample the majority class (NRG) to a given ratio compared to RG.

    Args:
        df: DataFrame with a 'Final Label' column.
        rg_class: Label value of the class that is kept in full.
        nrg_class: Label value of the class that is randomly sub-sampled.
        ratio: Desired number of ``nrg_class`` rows per ``rg_class`` row. May
            be fractional; the resulting row count is truncated to an integer.

    Returns:
        A new DataFrame with a fresh 0..n-1 index holding every ``rg_class``
        row followed by the sampled ``nrg_class`` rows. Rows carrying any
        other label are not included. If fewer ``nrg_class`` rows exist than
        the ratio asks for, all of them are returned.

    Raises:
        ValueError: If ``ratio`` is negative.
    """
    if ratio < 0:
        raise ValueError(f"ratio must be non-negative, got {ratio!r}")

    labels = df['Final Label']
    rg_df = df[labels == rg_class]
    nrg_df = df[labels == nrg_class]

    # DataFrame.sample() only accepts an integer n that does not exceed the
    # population (when sampling without replacement), so truncate fractional
    # targets and cap at the number of rows actually available.
    target = min(int(len(rg_df) * ratio), len(nrg_df))
    nrg_downsampled = nrg_df.sample(n=target, random_state=42)

    # Combine the full RG subset with the downsampled NRG subset
    return pd.concat([rg_df, nrg_downsampled], ignore_index=True)

# Build the working dataset used by the following cells: every RG row plus
# randomly sampled NRG rows at a 2:1 NRG-to-RG ratio.
df_downsampled = downsample_majority_class(df, rg_class='RG', nrg_class='NRG', ratio=2)


In [6]:
# Re-count the labels on the downsampled frame to check the new class balance
label_counts = df_downsampled['Final Label'].value_counts().reset_index()
print(label_counts)

# Dataset processing

In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def main():
    """Relabel the downsampled frame and write train/test CSVs.

    Reads the module-level ``df_downsampled`` DataFrame, encodes 'RG' as 1 and
    'NRG' as 0, holds out 10% of the RG rows plus an equal number of NRG rows
    as a balanced test set, and writes both splits (columns 'Eye ID',
    'Final Label', 'Image Path') as CSV files under ``/kaggle/working``.
    """
    columns = ['Eye ID', 'Final Label', 'Image Path']
    df_temp = df_downsampled

    # Filter and relabel the dataset. The explicit .copy() makes each subset
    # an independent frame, so assigning the numeric label does not raise
    # pandas' SettingWithCopyWarning.
    rg_instances = df_temp[df_temp['Final Label'] == 'RG'].copy()
    rg_instances['Final Label'] = 1  # RG class label -> 1
    rg_instances.reset_index(drop=True, inplace=True)

    nrg_instances = df_temp[df_temp['Final Label'] == 'NRG'].copy()
    nrg_instances['Final Label'] = 0  # NRG class label -> 0
    nrg_instances.reset_index(drop=True, inplace=True)

    # Hold out 10% of the referable-glaucoma rows for testing
    train_rg, test_rg = train_test_split(
        rg_instances[columns], test_size=0.1, random_state=42, shuffle=True
    )

    # The test set is balanced: an integer test_size is an absolute row count,
    # so exactly as many NRG rows are held out as RG rows.
    train_nrg, test_nrg = train_test_split(
        nrg_instances[columns], test_size=len(test_rg), random_state=42, shuffle=True
    )

    # Merge RG and NRG for glaucoma classification
    train_glaucoma = pd.concat([train_rg, train_nrg], ignore_index=True)
    test_glaucoma = pd.concat([test_rg, test_nrg], ignore_index=True)

    # Shuffle so the two classes are interleaved. Seeded like the splits
    # above so that re-running the cell reproduces the same files.
    train_glaucoma = train_glaucoma.sample(frac=1, random_state=42).reset_index(drop=True)
    test_glaucoma = test_glaucoma.sample(frac=1, random_state=42).reset_index(drop=True)

    # Save to CSV (including image paths). The test file name must match the
    # name that is read back later ('glaucoma_no_mask_test.csv').
    train_glaucoma.to_csv('/kaggle/working/glaucoma_no_mask_train.csv', index=False)
    test_glaucoma.to_csv('/kaggle/working/glaucoma_no_mask_test.csv', index=False)

# Build and write the train/test CSVs only when this code runs as the
# top-level module, not when it is imported.
if __name__ == "__main__":
    main()

In [8]:
# Read the test split back from the working directory for inspection; this
# expects a file named exactly 'glaucoma_no_mask_test.csv' to exist there.
temp = pd.read_csv('/kaggle/working/glaucoma_no_mask_test.csv')

In [11]:
import os
# Placeholder Kaggle credentials: substitute a real username and API key
# before running, and do not commit the real key with the notebook.
# NOTE(review): presumably these are what KaggleApi.authenticate() picks up
# in the upload cell — confirm against the Kaggle API documentation.
os.environ['KAGGLE_USERNAME'] = 'Username'
os.environ['KAGGLE_KEY'] = 'API_key'

# Contrast Enhancement: CLAHE

In [None]:
import os
import cv2
import numpy as np
import json 

from PIL import Image
from tqdm import tqdm
from kaggle.api.kaggle_api_extended import KaggleApi

def apply_clahe(img, clip_limit=3.0, tile_grid_size=(8, 8)):
    """Run CLAHE contrast enhancement on every colour plane independently.

    The input is turned into an array and passed through ``COLOR_RGB2BGR``
    before equalisation and through ``COLOR_BGR2RGB`` afterwards, so the
    returned array has its channels in the same order as the input.

    Args:
        img: Three-channel image; anything ``np.array`` can convert.
        clip_limit: Forwarded to ``cv2.createCLAHE`` as ``clipLimit``.
        tile_grid_size: Forwarded to ``cv2.createCLAHE`` as ``tileGridSize``.

    Returns:
        The contrast-enhanced image as an array.
    """
    swapped = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    first, second, third = cv2.split(swapped)

    # One equaliser instance is reused for all three planes
    equalizer = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    enhanced = cv2.merge([equalizer.apply(plane) for plane in (first, second, third)])

    return cv2.cvtColor(enhanced, cv2.COLOR_BGR2RGB)

def trim_and_resize(im, output_size):
    """Trim dark margins, then letterbox the image into a square canvas.

    A pixel counts as foreground when its grey value exceeds 10% of the mean
    of all non-zero grey values. Rows/columns whose foreground count is at
    most 2% of the image width/height are treated as margin and cropped away.
    The crop is resized so that its longer side equals ``output_size`` (aspect
    ratio preserved) and pasted centred onto a new square "RGB" image.

    Args:
        im: Image convertible by ``np.array``. It is passed to
            ``cv2.COLOR_BGR2GRAY``, so a three-channel BGR array is expected.
        output_size: Side length in pixels of the square result.

    Returns:
        A ``PIL.Image`` in mode "RGB" of size ``(output_size, output_size)``.
        Channel data is copied as-is; no BGR/RGB swap happens here.
    """
    percentage = 0.02
    img = np.array(im)
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # An all-black image has no non-zero pixels; skip the threshold (and the
    # crop) instead of taking the mean of an empty selection, which yields
    # NaN and a RuntimeWarning.
    nonzero_values = img_gray[img_gray != 0]
    if nonzero_values.size:
        im_binary = img_gray > 0.1 * np.mean(nonzero_values)
        row_sums = np.sum(im_binary, axis=1)
        col_sums = np.sum(im_binary, axis=0)
        # A row can hold at most `width` foreground pixels and a column at
        # most `height`, hence the swapped shape indices.
        rows = np.where(row_sums > img.shape[1] * percentage)[0]
        cols = np.where(col_sums > img.shape[0] * percentage)[0]
        if rows.size and cols.size:
            min_row, min_col = np.min(rows), np.min(cols)
            max_row, max_col = np.max(rows), np.max(cols)
            img = img[min_row:max_row + 1, min_col:max_col + 1]

    im_pil = Image.fromarray(img)
    old_size = im_pil.size
    ratio = float(output_size) / max(old_size)
    # Clamp to 1 px: for very elongated crops the truncated short side would
    # become 0, which Image.resize rejects.
    new_size = tuple(max(1, int(x * ratio)) for x in old_size)
    im_resized = im_pil.resize(new_size, Image.LANCZOS)

    # Centre the resized crop on the square canvas
    new_im = Image.new("RGB", (output_size, output_size))
    offset = ((output_size - new_size[0]) // 2, (output_size - new_size[1]) // 2)
    new_im.paste(im_resized, offset)
    return new_im

def process_and_save_images(image_paths, output_path_folder, output_size):
    """Trim, resize and CLAHE-enhance each image, saving results to a folder.

    Each output file keeps the base name of its source path. Sources whose
    output file already exists are skipped, so an interrupted run can be
    resumed. Failures on individual images are reported and do not stop the
    loop.

    Args:
        image_paths: Iterable of source image file paths.
        output_path_folder: Directory for the processed images; created if
            missing.
        output_size: Side length passed on to ``trim_and_resize``.

    Returns:
        ``output_path_folder``, for later use by the caller.
    """
    os.makedirs(output_path_folder, exist_ok=True)

    for image_path in tqdm(image_paths):
        img_file = os.path.basename(image_path)
        output_image_path = os.path.join(output_path_folder, img_file)

        if os.path.exists(output_image_path):
            print(f"Skipping {output_image_path}, already exists.")
            continue

        try:
            image_original = cv2.imread(image_path)
            if image_original is None:
                # cv2.imread reports an unreadable/missing file by returning
                # None rather than raising, so surface it explicitly.
                print(f"Error processing {image_path}: could not read image")
                continue

            image_trimmed_resized = trim_and_resize(image_original, output_size)
            image_clahe = apply_clahe(image_trimmed_resized)

            # cv2.imwrite returns False when the file could not be written
            if cv2.imwrite(output_image_path, image_clahe):
                print(f"Processed and saved: {output_image_path}")
            else:
                print(f"Error processing {image_path}: could not write {output_image_path}")
        except Exception as e:
            # Best-effort batch job: report the failure and move on
            print(f"Error processing {image_path}: {e}")

    return output_path_folder  # Return the directory path for later use

def create_dataset(output_dir, dataset_title, dataset_id, owner="mahajantm"):
    """Create a new Kaggle dataset from the contents of a folder.

    Writes ``dataset-metadata.json`` into ``output_dir`` and then asks the
    Kaggle API to create the dataset from that folder. An API failure is
    printed rather than raised.

    Args:
        output_dir: Existing folder holding the files to upload; the metadata
            file is written into it.
        dataset_title: Title stored in the metadata.
        dataset_id: Dataset slug; combined with ``owner`` into the metadata
            ``id`` field as ``"<owner>/<dataset_id>"``.
        owner: Kaggle username that will own the dataset. Defaults to the
            previously hard-coded account.
    """
    # Create dataset-metadata.json file
    metadata = {
        "title": dataset_title,
        "id": f"{owner}/{dataset_id}",
        "licenses": [{"name": "CC0-1.0"}]
    }

    metadata_path = os.path.join(output_dir, 'dataset-metadata.json')
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f)

    # Authenticate and create dataset using Kaggle API
    api = KaggleApi()
    api.authenticate()

    try:
        api.dataset_create_new(
            folder=output_dir,
            convert_to_csv=False,
            dir_mode='zip'
        )
        print(f"Successfully created dataset: {dataset_title}")
    except Exception as e:
        print(f"Error creating dataset: {e}")

if __name__ == "__main__":
    # Imported locally so this cell also runs on a fresh kernel or as a
    # script: the import list of this cell does not include pandas.
    import pandas as pd

    # Configuration: collect the image paths of both splits, train rows first
    train = pd.read_csv('/kaggle/input/images-hda-before-preprocess/glaucoma_no_mask_train.csv')
    test = pd.read_csv('/kaggle/input/images-hda-before-preprocess/glaucoma_no_mask_test.csv')
    image_paths = pd.concat([train['Image Path'], test['Image Path']], ignore_index=True).tolist()

    # For a quick smoke test, override image_paths with a short list, e.g.:
    #     image_paths = [
    #         "/kaggle/input/jraigs-dataset/justRAIGS/0/0/TRAIN000237.JPG",
    #         "/kaggle/input/jraigs-dataset/justRAIGS/5/TRAIN095425.JPG"
    #     ]

    output_dir = "/kaggle/working/preprocessed_images"  # Use Kaggle's working directory
    output_size = 2000

    # Process images
    processed_folder_path = process_and_save_images(image_paths, output_dir, output_size)
    print(f"Images processed and saved to: {processed_folder_path}")

    # Create Kaggle dataset from the folder that now holds the processed images
    dataset_title = "Processed Retinal Images HDA"
    dataset_id = "processed-retinal-images"
    create_dataset(output_dir, dataset_title, dataset_id)