# Deep Learning Major Task
## CNN Leaf Classification

<ol>
  <li><a href="#description">description</a></li>
  <li><a href="#part-i">Part I: Data Preparation</a>
    <ol>
      <li><a href="#describe-data">Describe the Data</a></li>
      <li><a href="#clean-data">Clean the Data</a></li>
      <li><a href="#check-values">Check for Missing Values and Duplicates</a></li>
      <li><a href="#visualize-data">Visualize the Data</a></li>
      <li><a href="#draw-images">Draw Images</a></li>
      <li><a href="#correlation-analysis">Correlation Analysis</a></li>
      <li><a href="#divide-data">Divide the Data</a></li>
      <li><a href="#standardize-data">Standardize the Data</a></li>
      <li><a href="#encode-labels">Encode the Labels</a></li>
    </ol>
  </li>
  <li><a href="#part-ii">Part II: Training a Neural Network (CNN)</a>
    <ol>
      <li><a href="#implement-a-cnn-model">Implement a CNN Model</a></li>
      <li><a href="#write-training-function">Write Training Function</a></li>
      <li><a href="#explore-hyperparameter-settings">Explore Hyperparameter Settings</a></li>
      <li><a href="#tensorboard-monitoring">TensorBoard Monitoring</a></li>
      <li><a href="#evaluation-function">Evaluation Function</a></li>
    </ol>
  </li>
</ol>

<h3>Description</h3>
<a id="description"></a>

### First lets write our imports

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


# Part I: Data Preparation
<a id="part-i"></a>

<h2>Taking a look and Describing the data</h2>
<a id="describe-data"></a>

### Training dataset

In [None]:
# Load the training set
train_df = pd.read_csv(r'.\data_files\train.csv')

print("#-----> First 5 rows of the training set:\n")
train_df.head(5)

In [None]:
print("-----> training set description:")
train_df.describe()

In [None]:
print("-----> training set information")
train_df.info()

In [None]:
print("-----> training set value types")
train_df.dtypes

### Testing dataset

In [None]:
# Load the testing set
test_df = pd.read_csv(r'.\data_files\test.csv')

print("#-----> First 5 rows of the testing set:")
test_df.head(5)

In [None]:
print("-----> testing set description:")
test_df.describe()

In [None]:
print("-----> testing set information")
test_df.info()

In [None]:
print("-----> testing set value types")
test_df.dtypes

<h2>Cleaning the data</h2>
<a id="clean-data"></a>

### Checking the data for missing values or duplicates and carrying out proper correction methods
<a id="check-values"></a>

In [None]:
# Check for missing values
print("Missing values:\n", train_df.isnull().sum(), "\n")

# Check for duplicates
print("Duplicate values:\n", train_df.duplicated().sum())


### ----> Looks like we don't have any missing or duplicate values

Before we continue lets setup our data by dropping the the id and species from the features and set the target on species

In [None]:
# Exclude 'id' and 'species' columns
X_features = train_df.drop(['id', 'species'], axis=1)
y_target = train_df['species']

## Visualizing the data
<a id="viualize-data"></a>

In [None]:
### Feature Distributions

features = train_df.iloc[:, 1:]  # Assuming features start from column 2
# plt.figure(figsize=(12, 8))
# for i, feature in enumerate(features.columns, 1):
#     plt.subplot(3, 3, i)
#     sns.histplot(train_df[feature], kde=True)
#     plt.title(f'Distribution of {feature}')
# plt.tight_layout()
# plt.show()


# features = train_df.iloc[:, 1:]
# plt.figure(figsize=(12, 3 * features.shape[1]))  # Adjust the figure height based on the number of features
# for i, feature in enumerate(features.columns, 1):
#     plt.subplot(features.shape[1], 1, i)
#     sns.histplot(train_df[feature], kde=True)
#     plt.title(f'Distribution of {feature}')

# plt.tight_layout()
# plt.show()

In [None]:
# # Visualization 3: Pairwise Feature Scatter Plots
# sns.pairplot(train_df.sample(IMAGE_SIZE), hue='species', diag_kind='kde')
# plt.suptitle('Pairwise Scatter Plots for Features', y=1.02)
# plt.show()

In [None]:
# Dimensionality Reduction Visualization (using PCA)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(X_features)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=pca_result[:, 0], y=pca_result[:, 1], hue=train_df['species'])
plt.title('PCA Visualization')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()



In [None]:
from PIL import Image
import os

image_dir = '.\data_files\images'
image_ids = train_df['id'].head(5).tolist() 

plt.figure(figsize=(15, 8))

for layer_number, image_id in enumerate(image_ids, 1):
    image_path = os.path.join(image_dir, f"{image_id}.jpg")
    image = Image.open(image_path).convert('RGB')

    plt.subplot(1, 5, layer_number)
    plt.imshow(image)
    plt.title(f"Image {layer_number}")
    plt.axis('off')

plt.show()


<h2>Correlation Analysis </h2>
<a id="correlation-analysis"></a>

we are going to calculate the correlation matrix for shape features<br>
we will use heatmap


## Correlation Matrix and Distribution for Each Feature Group

In [None]:
# Assuming features are grouped into margin, shape, and texture
margin_features = X_features.iloc[:, :64]
shape_features = X_features.iloc[:, 64:128]
texture_features = X_features.iloc[:, 128:]

feature_groups = [margin_features, shape_features, texture_features]
group_names = ['Margin Features', 'Shape Features', 'Texture Features']

# Create subplots
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 16))
fig.suptitle('Analysis of Feature Groups')


# Flatten the 2D array of subplots for easier indexing
axes = axes.flatten()

for layer_number, features in enumerate(feature_groups):
    # Calculate correlation matrix
    correlation_matrix = features.corr()

    # Plot correlation heatmap
    sns.heatmap(correlation_matrix, cmap='coolwarm', ax=axes[layer_number * 2])
    axes[layer_number * 2].set_title(f'Correlation Matrix - {group_names[layer_number]}')

    # Plot distribution for the first feature in the group
    sns.histplot(data=features, x=features.columns[0], kde=True, ax=axes[layer_number * 2 + 1])
    axes[layer_number * 2 + 1].set_title(f'Distribution - {group_names[layer_number]}')

# Adjust layout to prevent overlapping
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


## Deciding which split method to use
<a id="divide-data"></a>

#### We got two methods for splitting:
<ol>
<li>train_test_split</li>
<li>StratifiedShuffleSplit (sss)</li>
</ol>

<b>train_test_split:</b></br>
Usage: Commonly used for general train-test splitting, especially when the class distribution is not a significant concern.<br>
How it works: Randomly shuffles and splits the data into training and test sets.<br>
Advantage: Simplicity and ease of use. Suitable for well-balanced datasets.<br>

<b>StratifiedShuffleSplit:</b></br>
Usage: Typically used when you want to ensure that the distribution of classes in both the training and validation sets is representative of the overall distribution in the dataset.<br>
How it works: StratifiedShuffleSplit maintains the class distribution when creating random splits. It shuffles the data and then creates splits, ensuring that each split has a similar class distribution.<br>
Advantage: Useful when dealing with imbalanced datasets where certain classes have significantly fewer samples than others.<br>

If the dataset has a <b>balanced</b> class distribution, and just need a simple split, train_test_split is often sufficient and easier to use.<br>

If the dataset has <b>imbalanced</b> classes, and want to ensure that the class distribution is maintained in both training and validation sets, then StratifiedShuffleSplit is a good choice.<br>

To decide which approach is better the dataset, we can can check the distribution of the 'species' column in our dataset.

In [None]:
plt.figure(figsize=(14, 6))
sns.countplot(x='species', data=train_df)
plt.title('Distribution of Leaf Classes')
plt.xticks(rotation=90)
plt.xticks(fontsize=8)
plt.show()

-----> since all the bars are the same height that means its balanced and we can use the regular train_test_split method

In [None]:
# Exclude 'id' and 'species' columns
X_features = train_df.drop(['species'], axis=1)
# y_target = train_df['species']

<h2>Train/Test split</h2>
Divide the data into a training and testing set using approximately 80% for training

In [None]:
# test_size = 0.2 meaning that the training set will be 0.8 (80%)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.2, random_state=42)

In [None]:
X_train

In [None]:
y_train

<h2>Data Standardization</h2>
<a id="standardize-data"></a>

In [None]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h2>Label Encoding</h2>
<a id="encode-labels"></a>

In [None]:
label_encoder = LabelEncoder()

y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Image Preprocessing

In [None]:
import cv2
import numpy as np

# read image
img = cv2.imread('data_files/images/1.jpg')
color = (0,0,0)
result = img.copy()
result = cv2.copyMakeBorder(result, 0,0,90,90, cv2.BORDER_CONSTANT, value=color)

plt.figure(figsize=(24, 16))

plt.subplot(2,1,1)
plt.imshow(img)
plt.subplot(2,1,2)
plt.imshow(result)



In [43]:
IMAGE_SIZE = 128

In [44]:
# import zipfile
# with zipfile.ZipFile('/data_files/images/leaf-classification/images.zip') as z_img:
#     z_img.extractall()
from PIL import Image, ImageOps
import glob
# image_list = []

def resize_img(img):
     # height, width, number of channels in image
    height = img.shape[0]
    width = img.shape[1]
    diff = int(abs(width-height)/2)
    color = (0,0,0)
    result = img.copy()
    if width<height:
        result = cv2.copyMakeBorder(result, 0,0,diff,diff, cv2.BORDER_CONSTANT, value=color)
    elif height>width:
        result = cv2.copyMakeBorder(result, diff,diff,0,0, cv2.BORDER_CONSTANT, value=color)
    
    # resize images
    result = cv2.resize(result, (IMAGE_SIZE,IMAGE_SIZE))
    
    return result
    
    # resize images
#     result = cv2.resize(result, (IMAGE_SIZE,IMAGE_SIZE))
# for filename in glob.glob('data_files/images/*.jpg'): #assuming jpg
#     # im=Image.open(filename)
#     img = cv2.imread(filename)
#     dimensions = img.shape
 
#     # height, width, number of channels in image
#     height = img.shape[0]
#     width = img.shape[1]
#     diff = int(abs(width-height)/2)
#     color = (0,0,0)
#     result = img.copy()
#     if width<height:
#         result = cv2.copyMakeBorder(result, 0,0,diff,diff, cv2.BORDER_CONSTANT, value=color)
#     elif height>width:
#         result = cv2.copyMakeBorder(result, diff,diff,0,0, cv2.BORDER_CONSTANT, value=color)
    
#     # resize images
#     result = cv2.resize(result, (IMAGE_SIZE,IMAGE_SIZE))
#     image_list.append(result)
    
# plt.figure(figsize=(24, 16))
# for i in range(25):
#     # j=np.random.choice((os.listdir('images')))
#     plt.subplot(5,5,i+1)
#     # img=load_img(os.path.join('/kaggle/working/images',j))
#     img = image_list[i]
#     plt.imshow(img)

<h1>Part II: Training the Neural Network</h1>

In [63]:
from keras.preprocessing import image

def load_img_data(data):
    data_ID = data['id']

    X = np.empty((len(data_ID), IMAGE_SIZE, IMAGE_SIZE, 1))
    for i, idnum in enumerate(data_ID):
        x = cv2.imread(("data_files/images/" + str(idnum) + '.jpg'), cv2.IMREAD_GRAYSCALE)
        # x = image.load_img(
        #     ("data_files/images/" + str(idnum) + '.jpg'), grayscale=True)
        x = image.img_to_array(resize_img(x))
        X[i] = x

    return np.around(X / 255.0)


In [65]:
import os
import sys
import numpy as np
import pandas as pd
import keras
from keras.preprocessing import image
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPooling2D, Flatten, Input

In [126]:
def NaiveCovNet(input_layer):
    conv_layers = 4
    BASE_CONV_FILTERS = 8
    x = input_layer
    for _ in range(conv_layers):
        x = Conv2D(BASE_CONV_FILTERS, 5, padding='same')(x)
        x = (Activation('relu'))(x)
        x = (MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))(x)
        BASE_CONV_FILTERS*=2

    # Flatten our array
    x = Flatten()(x)
    x = Dense(2048, kernel_initializer='glorot_normal', activation='relu')(x)
    x = Dropout(0.2)(x)

    x = Dense(99, kernel_initializer='glorot_normal', activation='relu')(x)
    x = Dropout(0.2)(x)

    output_layer = Dense(99, activation='softmax')(x)
    model = Model(input_layer, output_layer)
    return model

In [127]:
trian_X = load_img_data(X_train)
train_y = y_train_encoded
print(trian_X.shape)
print(train_y.shape)

(792, 128, 128, 1)
(792,)


In [140]:
input_layer = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 1), name='image')
model = NaiveCovNet(input_layer)
optimizer = keras.optimizers.Adam()
# print(optimizer.learning_rate)
# optimizer.learning_rate = 0.9
# print(optimizer.learning_rate)
model.compile(optimizer="Adam", loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=0.001>
<tf.Variable 'learning_rate:0' shape=() dtype=float32, numpy=0.9>
Model: "model_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 image (InputLayer)          [(None, 128, 128, 1)]     0         
                                                                 
 conv2d_58 (Conv2D)          (None, 128, 128, 8)       208       
                                                                 
 activation_58 (Activation)  (None, 128, 128, 8)       0         
                                                                 
 max_pooling2d_51 (MaxPooli  (None, 64, 64, 8)         0         
 ng2D)                                                           
                                                                 
 conv2d_59 (Conv2D)          (None, 64, 64, 16)        3216      
                                                        

In [137]:
import time
tensorboard_callback = keras.callbacks.TensorBoard(log_dir="logs/{}".format(time.time()))
history = model.fit(trian_X, train_y, epochs=40, batch_size=128, callbacks=[tensorboard_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [246]:
batch_sizes = [64,128,256]
number_of_layers = [1,2,3,4,5]
dropout_rates = [0.2,0.3,0.4,0.5,0.6]
optimizers = [keras.optimizers.Adam,keras.optimizers.SGD,keras.optimizers.RMSprop,keras.optimizers.Adagrad]
weight_decays = [0.0001,0.001,0.01,0.05,0.1]
learning_rates = [0.0005,0.001,0.005,0.01]
learning_rate_schedulers = [
    None,
    keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=0.01,decay_steps=1000,decay_rate=0.9),
    keras.optimizers.schedules.InverseTimeDecay(initial_learning_rate=0.01,decay_steps=1000,decay_rate=0.9),
    keras.optimizers.schedules.CosineDecay(initial_learning_rate=0.01,decay_steps=1000),
    keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=0.01,decay_steps=1000)
    ]

# default
default_batch_size = 128
default_number_of_layers = 3
default_dropout_rate = 0.5
default_optimizer = keras.optimizers.Adam
default_weight_decay = 0.01
default_learning_rate = 0.001
default_learning_rate_scheduler = None

In [247]:
def generateModel(batch_size,number_of_layers,dropout_rate,optimizer,weight_decay,learning_rate,learning_rate_scheduler):
    input_layer = Input(shape=(IMAGE_SIZE, IMAGE_SIZE, 1), name='image')
    kernel_regularizer=keras.regularizers.L2(l2=weight_decay)
    def NaiveCovNet():
        BASE_CONV_FILTERS = 8
        x = input_layer
        for _ in range(number_of_layers):
            x = Conv2D(BASE_CONV_FILTERS, 5, padding='same')(x)
            x = (Activation('relu'))(x)
            x = (MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))(x)
            BASE_CONV_FILTERS*=2

        # Flatten our array
        x = Flatten()(x)
        x = Dense(2048, kernel_initializer='glorot_normal', activation='relu')(x)
        x = Dropout(dropout_rate)(x)

        x = Dense(99, kernel_initializer='glorot_normal', activation='relu')(x)
        x = Dropout(dropout_rate)(x)

        output_layer = Dense(99, activation='softmax')(x)
        model = Model(input_layer, output_layer)
        return model


    model = NaiveCovNet()
    # print(optimizer.learning_rate)
    # optimizer.learning_rate = 0.9
    # print(optimizer.learning_rate)
    optimizer = optimizer()
    learning_rate_scheduler_name = str(learning_rate_scheduler.__class__).split(".")[-1]
    if learning_rate_scheduler is None:
        optimizer.learning_rate = learning_rate
        learning_rate_scheduler_name = "fixed"
    
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    optimizer_name = str(optimizer.__class__).split(".")[-1][:-2]
    
    hyperparameters = [batch_size,number_of_layers,dropout_rate,optimizer_name,weight_decay,learning_rate,learning_rate_scheduler_name]

    def getLogName():
        return " ".join([str(x) for x in hyperparameters])
    
    print(getLogName())
        
    tensorboard_callback = keras.callbacks.TensorBoard(log_dir="logs/{}/{}".format(getLogName(),time.time()))
    history = model.fit(trian_X, train_y, epochs=40, batch_size=batch_size, callbacks=[tensorboard_callback])
    return model

In [248]:
test_X = load_img_data(X_test)
test_y = y_test_encoded
print(test_X.shape)
print(test_y.shape)
model.evaluate(test_X, test_y)

(198, 128, 128, 1)
(198,)


[1.147599220275879, 0.7323232293128967]

In [249]:
model = generateModel(default_batch_size,default_number_of_layers,default_dropout_rate,default_optimizer,default_weight_decay,default_learning_rate,default_learning_rate_scheduler)

Model: "model_47"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 image (InputLayer)          [(None, 128, 128, 1)]     0         
                                                                 
 conv2d_137 (Conv2D)         (None, 128, 128, 8)       208       
                                                                 
 activation_137 (Activation  (None, 128, 128, 8)       0         
 )                                                               
                                                                 
 max_pooling2d_130 (MaxPool  (None, 64, 64, 8)         0         
 ing2D)                                                          
                                                                 
 conv2d_138 (Conv2D)         (None, 64, 64, 16)        3216      
                                                                 
 activation_138 (Activation  (None, 64, 64, 16)        0  

In [250]:
test_X = load_img_data(X_test)
test_y = y_test_encoded
print(test_X.shape)
print(test_y.shape)
model.evaluate(test_X, test_y)

(198, 128, 128, 1)
(198,)


[1.3209306001663208, 0.7070707082748413]