In [5]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import random
import sys
import cv2
import matplotlib
from subprocess import check_output
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from matplotlib import pyplot as plt
from datetime import datetime

# Label conversion functions
def classes_to_int(label):
    label = label.strip()
    if label == "No DR": return 0
    if label == "Mild": return 1
    if label == "Moderate": return 2
    if label == "Severe": return 3
    if label == "Proliferative DR": return 4
    print("Invalid Label", label)
    return 5

def int_to_classes(i):
    if i == 0: return "No DR"
    elif i == 1: return "Mild"
    elif i == 2: return "Moderate"
    elif i == 3: return "Severe"
    elif i == 4: return "Proliferative DR"
    print("Invalid class ", i)
    return "Invalid Class"

# Constants
NUM_CLASSES = 5
WIDTH = 128
HEIGHT = 128
DEPTH = 3
inputShape = (HEIGHT, WIDTH, DEPTH)
EPOCHS = 15
INIT_LR = 1e-3
BS = 32

# Global variables
ImageNameDataHash = {}
uniquePatientIDList = []

# Function to read and process training data
def readTrainData(trainDir):
    global ImageNameDataHash
    images = os.listdir(trainDir)
    print("Number of files in " + trainDir + " is " + str(len(images)))
    for imageFileName in images:
        if imageFileName == "trainLabels.csv":
            continue
        imageFullPath = os.path.join(os.path.sep, trainDir, imageFileName)
        img = load_img(imageFullPath)
        arr = img_to_array(img)  # Numpy array with shape (233,233,3)
        dim1 = arr.shape[0]
        dim2 = arr.shape[1]
        dim3 = arr.shape[2]
        if dim1 < HEIGHT or dim2 < WIDTH or dim3 < DEPTH:
            print("Error: Image dimensions are less than expected " + str(arr.shape))
        arr = cv2.resize(arr, (HEIGHT, WIDTH))  # Resize to (128,128,3)
        dim1 = arr.shape[0]
        dim2 = arr.shape[1]
        dim3 = arr.shape[2]
        if dim1 != HEIGHT or dim2 != WIDTH or dim3 != DEPTH:
            print("Error: After resize, image dimensions are not as expected " + str(arr.shape))
        arr = np.array(arr, dtype="float") / 255.0
        imageFileName = imageFileName.replace('.jpeg', '')
        ImageNameDataHash[str(imageFileName)] = np.array(arr)
    return

# Function to read CSV data
def readTrainCsv():
    raw_df = pd.read_csv('trainLabels.csv', sep=',')
    print(type(raw_df))  # <class 'pandas.core.frame.DataFrame'>
    row_count = raw_df.shape[0]  # gives number of row count
    col_count = raw_df.shape[1]  # gives number of col count
    print("row_count=" + str(row_count) + " col_count=" + str(col_count))
    raw_df["PatientID"] = ''
    header_list = list(raw_df.columns)
    print(header_list)  # ['image', 'level', 'PatientID']
    
    ImageLevelHash = {}
    patientIDList = []
    for index, row in raw_df.iterrows():
        key = row[0] + ''
        patientID = row[0] + ''
        patientID = patientID.replace('_right', '')
        patientID = patientID.replace('_left', '')
        raw_df.at[index, 'PatientID'] = patientID
        patientIDList.append(patientID)
        ImageLevelHash[key] = str(row[1])  # level
    
    global uniquePatientIDList
    uniquePatientIDList = sorted(set(patientIDList))
    count = 0
    for patientID in uniquePatientIDList:
        left_level = ImageLevelHash[str(patientID + '_left')]
        right_level = ImageLevelHash[str(patientID + '_right')]
        if left_level != right_level:
            count += 1
    print("Count of images with both left and right eye level not matching=" + str(count))
    print("Number of unique patients=" + str(len(uniquePatientIDList)))
    return raw_df

# Loading images
print("Loading images at..." + str(datetime.now()))
sys.stdout.flush()
readTrainData("")
print("Loaded " + str(len(ImageNameDataHash)) + " images at..." + str(datetime.now()))

# Reading CSV
random.seed(10)
print("Reading trainLabels.csv...")
df = readTrainCsv()
for i in range(0, 10):
    s = df.loc[df.index[i], 'PatientID']  # get patient id of patients
    print(str(i) + " patient's patientID=" + str(s))

# Filtering dataframe
keepImages = list(ImageNameDataHash.keys())
df = df[df['image'].isin(keepImages)]
print(len(df))  # 1000

# Convert hash to dataframe
imageNameArr = []
dataArr = []
for index, row in df.iterrows():
    key = str(row[0])
    if key in ImageNameDataHash:
        imageNameArr.append(key)
        dataArr.append(np.array(ImageNameDataHash[key]))  # np.array
df2 = pd.DataFrame({'image': imageNameArr, 'data': dataArr})
df2_header_list = list(df2.columns)
print(df2_header_list)  # ['image', 'data']
print(len(df2))

if len(df) != len(df2):
    print("Error: Length of df != df2")

for idx in range(0, len(df)):
    if df.loc[df.index[idx], 'image'] != df2.loc[df2.index[idx], 'image']:
        print("Error " + df.loc[df.index[idx], 'image'] + " == " + df2.loc[df2.index[idx], 'image'])

print(df2.dtypes)
print(df.dtypes)
df = pd.merge(df2, df, left_on='image', right_on='image', how='outer')
df_header_list = list(df.columns)
print(df_header_list)  # ['image', 'data', 'level', 'PatientID']
print(len(df))  # 1000
print(df.sample())

# Display sample image
sample0 = df.loc[df.index[0], 'data']
print(sample0)
print(type(sample0))  # <class 'numpy.ndarray'>
print(sample0.shape)  # (128, 128, 3)
plt.imshow(sample0, interpolation='nearest')
plt.show()
print("Sample Image")

# Prepare data for training
X = df['data']
Y = df['level']
Y = np.array(Y)
Y = to_categorical(Y, num_classes=NUM_CLASSES)

# Partition the data into training and testing splits
print("Partitioning data into 75:25...")
sys.stdout.flush()
print("Unique patients in dataframe df=" + str(df.PatientID.nunique()))  # 500
unique_ids = df.PatientID.unique()
print('unique_ids shape=' + str(len(unique_ids)))  # 500

train_ids, valid_ids = train_test_split(unique_ids, test_size=0.25, random_state=10)
trainid_list = train_ids.tolist()
print('trainid_list shape=', str(len(trainid_list)))  # 375

traindf = df[df.PatientID.isin(trainid_list)]
valSet = df[~df.PatientID.isin(trainid_list)]
traindf = traindf.reset_index(drop=True)
valSet = valSet.reset_index(drop=True)
trainX = traindf['data']
trainY = traindf['level']
valX = valSet['data']
valY = valSet['level']

print('trainX shape=', trainX.shape[0], 'valX shape=', valX.shape[0])  # 750, 250
trainY = to_categorical(trainY, num_classes=NUM_CLASSES)
valY = to_categorical(valY, num_classes=NUM_CLASSES)

# Construct the image generator for data augmentation
print("Generating images...")
sys.stdout.flush()
aug = ImageDataGenerator(rotation_range=30, width_shift_range=0.1,
                         height_shift_range=0.1, shear_range=0.2, zoom_range=0.2,
                         horizontal_flip=True, fill_mode="nearest")

# Function to create CNN model
def createModel():
    model = Sequential()
    model.add(Conv2D(32, (3, 3), padding='same', activation='relu', input_shape=inputShape))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Conv2D(64, (3, 3), padding='same', activation='relu'))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Conv2D(128, (3, 3), padding='same', activation='relu'))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))
    
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(NUM_CLASSES, activation='softmax'))
    
    opt = Adam(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy", optimizer=opt, metrics=["accuracy"])
    return model

# Initialize the model
model = createModel()

# Train the model
print("Training the model...")
sys.stdout.flush()
H = model.fit(aug.flow(np.array(list(trainX)), trainY, batch_size=BS),
              validation_data=(np.array(list(valX)), valY),
              steps_per_epoch=len(trainX) // BS,
              epochs=EPOCHS, verbose=1)

# Evaluate the model on validation data
print("Evaluating the model...")
loss, accuracy = model.evaluate(np.array(list(valX)), valY, verbose=1)
print(f"Validation Loss: {loss}")
print(f"Validation Accuracy: {accuracy}")

# Plot training loss and accuracy
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, EPOCHS), H.history["loss"], label="train_loss")
plt.plot(np.arange(0, EPOCHS), H.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, EPOCHS), H.history["accuracy"], label="train_acc")
plt.plot(np.arange(0, EPOCHS), H.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="upper left")
plt.show()


Loading images at...2024-08-12 12:18:25.657441


FileNotFoundError: [WinError 3] The system cannot find the path specified: ''