

# Training Machine Learning Model File 
## This file contains the code which is used to train the machine learning model
## After the model is trained the weight of the model get saved with its best accuracy 

# **Code Reference**
## https://github.com/debayanmitra1993-data/Blindness-Detection-Diabetic-Retinopathy-/blob/master/3_resnet50(colab).ipynb 

In [1]:
import numpy as np
import cv2
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from multiprocessing.pool import ThreadPool
import pickle
import multiprocessing
import seaborn as sns
plt.rcParams["axes.grid"] = False
from PIL import Image
from tqdm import tqdm
from prettytable import PrettyTable

# TensorFlow and tf.keras Imports
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input, decode_predictions
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import load_model, model_from_json
from tensorflow.keras.layers import Input, GlobalAveragePooling2D, Dropout, Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras import applications, optimizers, Model, Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

# Sklearn Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, cohen_kappa_score, accuracy_score

%matplotlib inline


# The Functions
the program uses various functions to deal with different activties throughout in order to successfully train the machine learning model 


In [2]:
#This Function loads the train/test data from a folder and cvs file into variables
def load_data():
    train = pd.read_csv('train.csv')
    test = pd.read_csv('test.csv')
    
    train_dir = os.path.join('./','train_images/')
    test_dir = os.path.join('./','test_images/')
    
    train['file_path'] = train['id_code'].map(lambda x: os.path.join(train_dir,'{}.png'.format(x)))
    test['file_path'] = test['id_code'].map(lambda x: os.path.join(test_dir,'{}.png'.format(x)))
    
    train['file_name'] = train["id_code"].apply(lambda x: x + ".png")
    test['file_name'] = test["id_code"].apply(lambda x: x + ".png")
    
    train['diagnosis'] = train['diagnosis'].astype(str)
    
    return train,test

These next Functions deal with Processing The Images to produce an results of higher accuracy 


In [4]:
'''Function loads an image from Folder , Resizes and saves in another directory '''

def image_resize_save(directory, output_directory, filenames):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)
    for filename in tqdm(filenames):
        output_filepath = os.path.join(output_directory, filename)
        # Check if the resized image already exists
        if not os.path.exists(output_filepath):
            input_filepath = os.path.join(directory, filename)
            img = cv2.imread(input_filepath)
            if img is not None:
                resized_img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
                cv2.imwrite(output_filepath, resized_img)
            else:
                print(f"Warning: '{input_filepath}' cannot be read.")
        else:
            print(f"Skipping existing file: {output_filepath}")


In [5]:
#Function to crop image background
def crop_image_from_gray(img,tol=7):
    if img.ndim ==2:
        mask = img>tol
        return img[np.ix_(mask.any(1),mask.any(0))]
    elif img.ndim==3:
        gray_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
        mask = gray_img>tol
        
        check_shape = img[:,:,0][np.ix_(mask.any(1),mask.any(0))].shape[0]
        if (check_shape == 0): # if image is too dark and we crop out everything,
            return img # return original image
        else:
            img1=img[:,:,0][np.ix_(mask.any(1),mask.any(0))]
            img2=img[:,:,1][np.ix_(mask.any(1),mask.any(0))]
            img3=img[:,:,2][np.ix_(mask.any(1),mask.any(0))]
            img = np.stack([img1,img2,img3],axis=-1)
        return img
    
    
#function provides circle crop on the image
def circle_crop(img, sigmaX = 30):   
    img = crop_image_from_gray(img)    
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    height, width, depth = img.shape    
    
    x = int(width/2)
    y = int(height/2)
    r = np.amin((x,y))
    
    circle_img = np.zeros((height, width), np.uint8)
    cv2.circle(circle_img, (x,y), int(r), 1, thickness=-1)
    img = cv2.bitwise_and(img, img, mask=circle_img)
    img = crop_image_from_gray(img)
    img=cv2.addWeighted(img,4, cv2.GaussianBlur( img , (0,0) , sigmaX) ,-4 ,128)
    return img 

def preprocess_image(input_directory, output_directory, filenames):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory, exist_ok=True)
    for filename in filenames: 
        input_filepath = os.path.join(input_directory, filename)
        output_filepath = os.path.join(output_directory, filename)
        # Check if the preprocessed image already exists
        if not os.path.exists(output_filepath):
            img = cv2.imread(input_filepath)
            if img is not None:
                height, width, channels = img.shape
                img = circle_crop(img)  # Assuming circle_crop is your preprocessing step
                cv2.imwrite(output_filepath, cv2.resize(img, (IMG_SIZE, IMG_SIZE)))
            else:
                print(f"Warning: '{input_filepath}' cannot be read.")
        else:
            print(f"Skipping existing file: {output_filepath}")


Next Functions are used to successfully train the model


In [10]:
def img_generator(df_train, df_test):
    train_datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2, horizontal_flip=True)
    
    train_generator = train_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory="./train_images_resized_preprocessed/",
        x_col="file_name",
        y_col="diagnosis",
        batch_size=BATCH_SIZE,
        class_mode="categorical",
        target_size=(HEIGHT, WIDTH),
        subset='training'
    )
    
    valid_generator = train_datagen.flow_from_dataframe(
        dataframe=df_train,
        directory="./train_images_resized_preprocessed/",
        x_col="file_name",
        y_col="diagnosis",
        batch_size=BATCH_SIZE,
        class_mode="categorical",
        target_size=(HEIGHT, WIDTH),
        subset='validation'
    )
    
    test_datagen = ImageDataGenerator(rescale=1./255)
    
    test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory="D:\\DR_ML\\backend\\test_images_resized_preprocessed\\",
    x_col="file_name",
    target_size=(HEIGHT, WIDTH),
    batch_size=1,
    shuffle=False,
    class_mode=None
)


    return train_generator, valid_generator, test_generator

print("Calling img_generator")
train_generator, valid_generator, test_generator = img_generator(df_train, df_test)
print("img_generator called successfully")


Calling img_generator
Found 2930 validated image filenames belonging to 5 classes.
Found 732 validated image filenames belonging to 5 classes.
Found 1928 validated image filenames.
img_generator called successfully


In [12]:
#this function create the resnet50 model with iniall imagenet weights without the top layer 
def create_model(input_shape, n_out):
    input_tensor = Input(shape=input_shape)
    base_model = applications.ResNet50(weights="imagenet", include_top=False,input_tensor=input_tensor)

    x = GlobalAveragePooling2D()(base_model.output)
    x = Dropout(0.5)(x)
    x = Dense(2048, activation='relu')(x)
    x = Dropout(0.5)(x)
    final_output = Dense(n_out, activation='softmax', name='final_output')(x)
    model = Model(input_tensor, final_output)
    return model

Main code of the program underneath declaring variables and using the functions to process the images and train the model 

In [3]:
#Declearing important variables 
IMG_SIZE = 512
BATCH_SIZE = 8
EPOCHS = 25
WARMUP_EPOCHS = 2
LEARNING_RATE = 1e-4
WARMUP_LEARNING_RATE = 1e-3
HEIGHT = 320
WIDTH = 320
CANAL = 3
#variables used for callbacks
ES_PATIENCE = 5
RLROP_PATIENCE = 3
DECAY_DROP = 0.5

In [13]:
#load in the directories 
train_dir = "train_images/"
val_dir   = "backend/test_images/" #directories for training

test_dir  = "backend/val/" #directory for final model scoring

In [6]:
# Assuming you've defined preprocess_image similarly to image_resize_save

# Load data into dataframes
df_train, df_test = load_data()
print(df_train.shape, df_test.shape, '\n')

# Resize training images
image_resize_save('./train_images', './train_images_resized', df_train['file_name'].values)
# Preprocess training images - Ensure this function processes images from './train_images_resized' and saves to './train_images_resized_preprocessed'
preprocess_image('./train_images_resized', './train_images_resized_preprocessed', df_train['file_name'].values)

# Optionally, resize test images if not done previously, then preprocess
image_resize_save('./test_images', './test_images_resized', df_test['file_name'].values)
preprocess_image('./test_images_resized', './test_images_resized_preprocessed', df_test['file_name'].values)

# Display first few rows of test DataFrame to verify
df_test.head(6)


(3662, 4) (1928, 3) 



 67%|██████▋   | 2456/3662 [00:00<00:00, 12310.66it/s]

Skipping existing file: ./train_images_resized\000c1434d8d7.png
Skipping existing file: ./train_images_resized\001639a390f0.png
Skipping existing file: ./train_images_resized\0024cdab0c1e.png
Skipping existing file: ./train_images_resized\002c21358ce6.png
Skipping existing file: ./train_images_resized\005b95c28852.png
Skipping existing file: ./train_images_resized\0083ee8054ee.png
Skipping existing file: ./train_images_resized\0097f532ac9f.png
Skipping existing file: ./train_images_resized\00a8624548a9.png
Skipping existing file: ./train_images_resized\00b74780d31d.png
Skipping existing file: ./train_images_resized\00cb6555d108.png
Skipping existing file: ./train_images_resized\00cc2b75cddd.png
Skipping existing file: ./train_images_resized\00e4ddff966a.png
Skipping existing file: ./train_images_resized\00f6c1be5a33.png
Skipping existing file: ./train_images_resized\0104b032c141.png
Skipping existing file: ./train_images_resized\0124dffecf29.png
Skipping existing file: ./train_images_r

100%|██████████| 3662/3662 [00:00<00:00, 11297.83it/s]


Skipping existing file: ./train_images_resized\aaaadb174012.png
Skipping existing file: ./train_images_resized\aabd867043cf.png
Skipping existing file: ./train_images_resized\aad0c0ee9268.png
Skipping existing file: ./train_images_resized\aae8f9f3ef8c.png
Skipping existing file: ./train_images_resized\aafb0c944f14.png
Skipping existing file: ./train_images_resized\aafe980edd0c.png
Skipping existing file: ./train_images_resized\ab03d50bba2f.png
Skipping existing file: ./train_images_resized\ab1c20a94f3f.png
Skipping existing file: ./train_images_resized\ab32db41c409.png
Skipping existing file: ./train_images_resized\ab3c505b624f.png
Skipping existing file: ./train_images_resized\ab50123abadb.png
Skipping existing file: ./train_images_resized\ab653b8554c0.png
Skipping existing file: ./train_images_resized\ab686895533e.png
Skipping existing file: ./train_images_resized\ab724603ee93.png
Skipping existing file: ./train_images_resized\ab78a66dee6a.png
Skipping existing file: ./train_images_r

 60%|█████▉    | 1152/1928 [00:00<00:00, 11508.65it/s]

Skipping existing file: ./test_images_resized\0005cfc8afb6.png
Skipping existing file: ./test_images_resized\003f0afdcd15.png
Skipping existing file: ./test_images_resized\006efc72b638.png
Skipping existing file: ./test_images_resized\00836aaacf06.png
Skipping existing file: ./test_images_resized\009245722fa4.png
Skipping existing file: ./test_images_resized\009c019a7309.png
Skipping existing file: ./test_images_resized\010d915e229a.png
Skipping existing file: ./test_images_resized\0111b949947e.png
Skipping existing file: ./test_images_resized\01499815e469.png
Skipping existing file: ./test_images_resized\0167076e7089.png
Skipping existing file: ./test_images_resized\01c31b10ab99.png
Skipping existing file: ./test_images_resized\01c5ba195207.png
Skipping existing file: ./test_images_resized\01e4d86b3a30.png
Skipping existing file: ./test_images_resized\020921b796d5.png
Skipping existing file: ./test_images_resized\020f6983114d.png
Skipping existing file: ./test_images_resized\021c20761

100%|██████████| 1928/1928 [00:00<00:00, 8791.95it/s] 


Skipping existing file: ./test_images_resized\f893813ed0d4.png
Skipping existing file: ./test_images_resized\f898d3e8ed90.png
Skipping existing file: ./test_images_resized\f8aa9af6ca63.png
Skipping existing file: ./test_images_resized\f8b184ad0701.png
Skipping existing file: ./test_images_resized\f8b9c9f5235f.png
Skipping existing file: ./test_images_resized\f8f530fa573e.png
Skipping existing file: ./test_images_resized\f9642cf6a5bb.png
Skipping existing file: ./test_images_resized\f96f06803471.png
Skipping existing file: ./test_images_resized\f96fecb16957.png
Skipping existing file: ./test_images_resized\f99a9ad63dda.png
Skipping existing file: ./test_images_resized\f9e7f614c91d.png
Skipping existing file: ./test_images_resized\f9f1cd9ed16c.png
Skipping existing file: ./test_images_resized\f9f3d97f3269.png
Skipping existing file: ./test_images_resized\fa007743976a.png
Skipping existing file: ./test_images_resized\fa09c97a2887.png
Skipping existing file: ./test_images_resized\fa7a97f4d

Unnamed: 0,id_code,file_path,file_name
0,0005cfc8afb6,./test_images/0005cfc8afb6.png,0005cfc8afb6.png
1,003f0afdcd15,./test_images/003f0afdcd15.png,003f0afdcd15.png
2,006efc72b638,./test_images/006efc72b638.png,006efc72b638.png
3,00836aaacf06,./test_images/00836aaacf06.png,00836aaacf06.png
4,009245722fa4,./test_images/009245722fa4.png,009245722fa4.png
5,009c019a7309,./test_images/009c019a7309.png,009c019a7309.png


In [7]:
df_train_train,df_train_valid = train_test_split(df_train,test_size = 0.2)
print(df_train_train.shape,df_train_valid.shape)
df_train_train.head(6)

(2929, 4) (733, 4)


Unnamed: 0,id_code,diagnosis,file_path,file_name
1431,64b9206afb3f,0,./train_images/64b9206afb3f.png,64b9206afb3f.png
3465,f080a22008be,3,./train_images/f080a22008be.png,f080a22008be.png
2373,a64273801bde,2,./train_images/a64273801bde.png,a64273801bde.png
3116,d871895742b1,0,./train_images/d871895742b1.png,d871895742b1.png
139,0a61bddab956,1,./train_images/0a61bddab956.png,0a61bddab956.png
2495,ad1aa75d5630,0,./train_images/ad1aa75d5630.png,ad1aa75d5630.png


In [8]:
N_CLASSES = df_train_train['diagnosis'].nunique()

In [11]:
#initalise the generators 
train_generator, valid_generator, test_generator = img_generator(df_train, df_test)



Found 2930 validated image filenames belonging to 5 classes.
Found 732 validated image filenames belonging to 5 classes.
Found 1928 validated image filenames.


In [14]:
#create the model
model = create_model(input_shape=(HEIGHT, WIDTH, CANAL), n_out=N_CLASSES)

for layer in model.layers:
    layer.trainable = False

for i in range(-5, 0):
    model.layers[i].trainable = True
model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 0us/step


In [15]:
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size
print(STEP_SIZE_TRAIN,STEP_SIZE_VALID)

366 91


In [18]:
# Run some warm up epochs 
model.compile(optimizer=optimizers.Adam(learning_rate=WARMUP_LEARNING_RATE), loss='categorical_crossentropy', metrics=['accuracy'])

history_warmup = model.fit(train_generator,
                           steps_per_epoch=STEP_SIZE_TRAIN,
                           validation_data=valid_generator, validation_steps=STEP_SIZE_VALID,
                           epochs=WARMUP_EPOCHS,
                           verbose=1).history


Epoch 1/2


  self._warn_if_super_not_called()


[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 1s/step - accuracy: 0.3668 - loss: 2.3433 - val_accuracy: 0.4629 - val_loss: 1.3289
Epoch 2/2
[1m  1/366[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:12[0m 693ms/step - accuracy: 0.2500 - loss: 1.5137

  self.gen.throw(typ, value, traceback)


[1m366/366[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2500 - loss: 0.7589 - val_accuracy: 0.0000e+00 - val_loss: 1.0466


In [21]:
for layer in model.layers:
    layer.trainable = True

# Create callback which will make sure the training will stop when the best accuracy is achieved through the training
es = EarlyStopping(monitor='val_loss', mode='min', patience=ES_PATIENCE, restore_best_weights=True, verbose=1)
rlrop = ReduceLROnPlateau(monitor='val_loss', mode='min', patience=RLROP_PATIENCE, factor=DECAY_DROP, min_lr=1e-6, verbose=1)
# This specific callback will save the model's best accuracy
savepoint = ModelCheckpoint("test_best_model.keras", monitor='loss', verbose=1, save_best_only=True, mode='auto')

callback_list = [es, rlrop, savepoint]
optimizer = optimizers.Adam(learning_rate=LEARNING_RATE)
# Initialize the model before running the training
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=['accuracy'])
model.summary()


In [None]:
#run the training of the model using train_generator , valid_generator and callback_list
history_finetunning = model.fit(train_generator,
                                steps_per_epoch=STEP_SIZE_TRAIN,
                                validation_data=valid_generator,
                                validation_steps=STEP_SIZE_VALID,
                                epochs=EPOCHS,
                                callbacks=callback_list,
                                verbose=1).history

In [None]:
#the image below shows the progress of accuracy of the model with the training data as well as with the validation data that have never been seen before
plt.figure(figsize=(8,5))

plt.plot(history_finetunning['accuracy'])
plt.plot(history_finetunning['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.gca().ticklabel_format(axis='both', style='plain', useOffset=False)
plt.show()

In [None]:
#complete generator meaning run predictions of train data to determine accuracy with data familiar to the model and to determine Cohen Kappa Score
complete_datagen = ImageDataGenerator(rescale=1./255)
complete_generator = complete_datagen.flow_from_dataframe(dataframe=df_train_train,
                                                          directory = "./train_images_resized_preprocessed/",
                                                          x_col="file_name",
                                                          target_size=(HEIGHT, WIDTH),
                                                          batch_size=1,
                                                          shuffle=False,
                                                          class_mode=None)

STEP_SIZE_COMPLETE = complete_generator.n//complete_generator.batch_size
train_preds = model.predict_generator(complete_generator, steps=STEP_SIZE_COMPLETE,verbose = 1)
train_preds = [np.argmax(pred) for pred in train_preds]

In [None]:
#Print the calculated accuracy and the Cohen Kappa score 
print("Train Cohen Kappa score: %.3f" % cohen_kappa_score(train_preds, df_train_train['diagnosis'].astype('int'), weights='quadratic'))
print("Train Accuracy score : %.3f" % accuracy_score(df_train_train['diagnosis'].astype('int'),train_preds))

In [None]:
#save the model structure 
model.save('model')