<a href="https://colab.research.google.com/github/Jorgecardetegit/DiseaseClassifier/blob/main/Simple_CNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
import os
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from sklearn.model_selection import train_test_split

from glob import glob
import random

In [65]:
gen_dir = "/content/drive/MyDrive/Apple disease proyect/dataset"
categories = os.listdir(gen_dir)
categories = [category.split('___')[1].lower() for category in categories]

apple_scab = glob('/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Apple_scab/*.JPG')
black_rot = glob("/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Black_rot/*.JPG")
cedar_apple_rust = glob("/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Cedar_apple_rust/*.JPG")
healthy = glob("/content/drive/MyDrive/Apple disease proyect/dataset/Apple___healthy/*.JPG")

# 1. Preprocessing

### Normalizing or Standardizing and rescaling

All the images in the dataset have the same size so it is not strictly necessary to Standardize or even Normalize the dataset, nevertheless I will normalize the images in order to converge faster during training and improve generalization. I won´t standardize it.

In addition the images have a size of 255*255 which is quite standard and suitable for the models I am going to fit so I won´t rescale the images.

### Data Augmentation
As stated in the EDA anlaysis the main problem in the dataset is the imbalance of data amount presented in each category. In order to deal with this issue I will use the method of data augmentation to artificially incraese the number of images in the categories with less images.

Is important to be carefull with this method, since it can provoke overfitting. To start with I will use the following technique with each variable:

- apple_scab: Apply mild to moderate augmentation. An increase of 20-50% in the number of images will be the starting point.

- black_rot: Will also apply mild to moderate augmentation. An increase of 20-50% in the number of images will be the starting point.

- cedar_apple_rust: Apply moderate to aggressive data augmentation to this class to increase the number of samples. I aim to at least double the number of images in this class

- healthy: No augmentation will be applied for now.

### Splitting ratios
- Train: 70%
- Validation: 15%
- Test: 15%

### Parameters definition

- IMAGE_WIDTH = 255
- IMAGE_HEIGHT = 255
- NUM_CHANNELS = 3
- BATCH_SIZE = 32
- EPOCHS = 50

### Data Augmentation

In [79]:
class DataAugmentation():

  def shuffleBatchCategory(self,category):
      shuffled_dataset = random.sample(category, len(category))

      return shuffled_dataset

  def transformations(self, image_path):
      image = tf.io.read_file(image_path)
      image = tf.image.decode_image(image, channels=3)

      image = tf.image.random_flip_left_right(image)
      image = tf.image.rot90(image, k=tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32))
      # Add the transoformations here

      return image

  def augmentation(self,image_list, num_image=None):
      if num_image is None:
          num_image = len(image_list)

      augmented_images = [self.transformations(image) for image in image_list]
      combined_images = image_list + augmented_images

      return combined_images

  def save_images(self, image_list, output_dir):
      os.makedirs(output_dir, exist_ok=True)

      for i, image in enumerate(image_list):
          transformed_filename = f"transformed_image_{i}.jpg"            # Genera un nombre de archivo único con sufijo de transformación y extensión .jpg
          file_path = os.path.join(output_dir, transformed_filename)

          image = tf.cast(image, tf.uint8)                               # Convierte la imagen a formato uint8 (0-255) y la guarda como JPEG
          encoded_image = tf.image.encode_jpeg(image)

          with open(file_path, "wb") as f:                               # Escribe el archivo JPEG en el directorio de salida
              f.write(encoded_image.numpy())

  def concatenate(category1, category2, category3):
      general_dataset = category1 + category2 + category3

      return general_dataset


In [80]:
DataAugmentor = DataAugmentation()

# Apple_scab_augmentation----------------------------------------------------------------------------------------------------------------------------------
shuffled_dataset_apple_scab = DataAugmentor.shuffleBatchCategory(category = apple_scab)

augmented_dataset_apple_scab = DataAugmentor.augmentation(image_list = shuffled_dataset_apple_scab,num_image = round(len(shuffled_dataset_apple_scab)/2))

DataAugmentor.save_images(image_list = augmented_dataset_apple_scab,
                          output_dir ="/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Apple_scab" )

# Black_rot augmentation----------------------------------------------------------------------------------------------------------------------------------
shuffled_dataset_black_rot = DataAugmentor.shuffleBatchCategory(category = black_rot)

augmented_dataset_black_rot = DataAugmentor.augmentation(image_list = shuffled_dataset_black_rot, num_image = round(len(shuffled_dataset_black_rot)/2))

DataAugmentor.save_images(image_list = augmented_dataset_black_rot,
                         output_dir ="/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Black_rot" )

# Cedar_apple_rust augmentation---------------------------------------------------------------------------------------------------------------------------
shuffled_dataset_cedar_apple_rust = DataAugmentor.shuffleBatchCategory(category = cedar_apple_rust)

augmented_dataset_cedar_apple_rust = DataAugmentor.augmentation(image_list = shuffled_dataset_cedar_apple_rust)

DataAugmentor.save_images(image_list = augmented_dataset_cedar_apple_rust,
                          output_dir ="/content/drive/MyDrive/Apple disease proyect/dataset/Apple___Cedar_apple_rust" )

UnimplementedError: ignored

## Parameters

In [None]:
IMAGE_HEIGHT = 255
IMAGE_WIDTH = 255
NUM_CHANNELS = 3
BATCH_SIZE = 32
EPOCHS = 50

In [None]:
X_train, X_temp, y_train, y_temp = train_test_split(augmented_images, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.shuffle(len(X_train)).batch(BATCH_SIZE)

# Conjunto de validación
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_dataset = val_dataset.batch(BATCH_SIZE)

# Conjunto de prueba
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE)

# Entrenar el modelo

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')  # 4 clases de salida
])
