<a href="https://colab.research.google.com/github/probalkar/Diabetic-Retinopathy-Detection/blob/main/Diabetic_Retinopathy_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'diabetic-retinopathy-resized:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F131128%2F418031%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240412%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240412T030653Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D78207c43b86d08203bd93d354ae79ba42a732a121ba1064ff9ee0b829cf78ba57959e3a18de8af97ad30932c16bdd3487f0ce7111d2a3b6f416bc23a075c39075b9c8a6c1bd1a93c936eba1ea53ddea141fa2a92afc5c61764512bbcbb17f6715d0d4b89a5377451ebc9399f3f82f07b41e4ed2d7839e952e950c92ad2d2c5fa7dd93b2f8836ffaaef0e28af2eba444fe6a4cd1eb889869fe8caa327a09c2c4d1353ef3ffdc749c2c2c3205ab3ad7c2e3cf679b533c1cb2b44dd1a1ebf15257a42240416008491348b293ddf054ba479c9ae137490e0688374f878ce2015a4eb8edcdbe7aabdb1c27425ec1c7fb185cf337985511ba08962fb8cd0d7899f48e8'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading diabetic-retinopathy-resized, 7785957896 bytes compressed
Downloaded and uncompressed: diabetic-retinopathy-resized
Data source import complete.


In [2]:
!pip show tensorflow

Name: tensorflow
Version: 2.16.1
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: packages@tensorflow.org
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: absl-py, astunparse, flatbuffers, gast, google-pasta, grpcio, h5py, keras, libclang, ml-dtypes, numpy, opt-einsum, packaging, protobuf, requests, setuptools, six, tensorboard, tensorflow-io-gcs-filesystem, termcolor, typing-extensions, wrapt
Required-by: dopamine-rl, tf_keras


In [3]:
!pip show keras

Name: keras
Version: 3.2.1
Summary: Multi-backend Keras.
Home-page: https://github.com/keras-team/keras
Author: Keras team
Author-email: keras-users@googlegroups.com
License: Apache License 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: absl-py, h5py, ml-dtypes, namex, numpy, optree, rich
Required-by: tensorflow


## Importing dependencies

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout, Flatten
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample

## Loading dataset

In [5]:
data = pd.read_csv("/kaggle/input/diabetic-retinopathy-resized/trainLabels_cropped.csv")
print(data.value_counts().sum())
print(data['level'].unique())
print(data['level'].value_counts().unique())
data.head()

35108
[0 1 2 4 3]
[25802  5288  2438   872   708]


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,image,level
0,0,0,10_left,0
1,1,1,10_right,0
2,2,2,13_left,0
3,3,3,13_right,0
4,4,4,15_left,1


In [6]:
data.drop(columns=['Unnamed: 0.1','Unnamed: 0'], inplace=True)
data.head()

Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1


In [7]:
from imblearn.under_sampling import RandomUnderSampler
import numpy as np

# Define your dataset
X = data['image']  # Your feature data
y = data['level']  # Your class labels

# Convert DataFrame or Series to a numpy array
X_array = X.values.reshape(-1,1)
y_array = y.values.reshape(-1,1)

# Count the number of datapoints for each class
class_counts = {0: 25802, 1: 5288, 2: 2438, 3: 708, 4: 872}

# Find the target number of datapoints
target_count = class_counts[3]

# Undersample the majority classes
undersampler = RandomUnderSampler(sampling_strategy={label: target_count for label in class_counts.keys() if label != 3})
X_resampled, y_resampled = undersampler.fit_resample(X_array, y_array)

# Check the new class counts
unique, counts = np.unique(y_resampled, return_counts=True)
resampled_class_counts = dict(zip(unique, counts))
print("Resampled class counts:", resampled_class_counts)

Resampled class counts: {0: 708, 1: 708, 2: 708, 3: 872, 4: 708}


In [8]:
new_X = []
for i in X_resampled:
  for j in i:
    new_X.append(j)

In [9]:
y_resampled

array([0, 0, 0, ..., 4, 4, 4])

In [10]:
# Create a new dataframe with resampled data
resampled_data = {'image': new_X,
        'level': y_resampled}

balanced_data = pd.DataFrame(resampled_data)

# Display the first few rows of the new dataframe
balanced_data.sample(10)

Unnamed: 0,image,level
1786,40001_right,2
2607,24317_right,3
1733,42633_right,2
803,227_left,1
904,1932_left,1
1427,36576_right,2
1,22549_right,0
1612,35863_left,2
2085,36938_right,2
3542,33812_right,4


In [11]:
print(balanced_data.value_counts().sum())
print(balanced_data['level'].unique())
print(balanced_data['level'].value_counts().unique())

3704
[0 1 2 3 4]
[872 708]


In [12]:
balanced_data['image'] = balanced_data['image']+'.jpeg'
balanced_data.head()

Unnamed: 0,image,level
0,43959_right.jpeg,0
1,22549_right.jpeg,0
2,9742_left.jpeg,0
3,11158_left.jpeg,0
4,40726_left.jpeg,0


### Splitting data

In [13]:
train, val = train_test_split(balanced_data, test_size=0.2, random_state=42)

In [14]:
IMAGE_SIZE = (224, 224)
BATCH_SIZE = 32
NUM_CLASSES = 5

### Rescaling image data

In [15]:
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)

In [16]:
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train,
    directory='../input/diabetic-retinopathy-resized/resized_train_cropped/resized_train_cropped/',
    x_col='image',
    y_col='level',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='raw')

val_generator = val_datagen.flow_from_dataframe(
    dataframe=val,
    directory='../input/diabetic-retinopathy-resized/resized_train_cropped/resized_train_cropped/',
    x_col='image',
    y_col='level',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='raw')

Found 2963 validated image filenames.
Found 741 validated image filenames.


### Loading base models

In [17]:
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

In [18]:
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.2)(x)
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

In [19]:
model = Model(inputs=base_model.input, outputs=predictions)

In [20]:
for layer in base_model.layers:
    layer.trainable = False

In [21]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [22]:
model.fit(train_generator, validation_data=val_generator, epochs=40)

Epoch 1/40


  self._warn_if_super_not_called()


[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 442ms/step - accuracy: 0.3036 - loss: 1.8782 - val_accuracy: 0.3968 - val_loss: 1.3846
Epoch 2/40
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 269ms/step - accuracy: 0.4435 - loss: 1.2767 - val_accuracy: 0.3995 - val_loss: 1.3406
Epoch 3/40
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 259ms/step - accuracy: 0.4590 - loss: 1.2514 - val_accuracy: 0.3941 - val_loss: 1.3377
Epoch 4/40
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 269ms/step - accuracy: 0.4838 - loss: 1.2121 - val_accuracy: 0.4332 - val_loss: 1.2879
Epoch 5/40
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 257ms/step - accuracy: 0.5230 - loss: 1.1373 - val_accuracy: 0.4170 - val_loss: 1.3087
Epoch 6/40
[1m93/93[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 256ms/step - accuracy: 0.5432 - loss: 1.1039 - val_accuracy: 0.4076 - val_loss: 1.3512
Epoch 7/40
[1m93/93[0m [32m━━━

<keras.src.callbacks.history.History at 0x7a995d743550>

In [23]:
model.save('drdMbNetV2_40epochs_97.h5')

