In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing import image
import matplotlib.pyplot as plt
import os

class PatchExtractor(layers.Layer):
    """Split a batch of images into non-overlapping square patches.

    Each image is cut into ``patch_size`` x ``patch_size`` tiles (stride equal
    to the patch size, ``VALID`` padding so partial border tiles are dropped)
    and every tile is flattened into a single vector.

    Args:
        patch_size: Side length, in pixels, of each square patch.
        **kwargs: Standard ``keras.layers.Layer`` arguments (``name``,
            ``dtype``, ``trainable``, ...). Accepting them is required for
            the layer to be re-created from its config when a saved model
            is loaded.
    """

    def __init__(self, patch_size, **kwargs):
        super().__init__(**kwargs)
        self.patch_size = patch_size

    def call(self, images):
        """Return flattened patches for ``images``.

        Args:
            images: 4-D image batch as expected by
                ``tf.image.extract_patches`` (batch, rows, cols, depth).

        Returns:
            Tensor reshaped to ``(batch, num_patches, patch_dims)`` where
            ``patch_dims`` is the last dimension produced by
            ``tf.image.extract_patches``.
        """
        # Dynamic batch size: the static one is None while building the model.
        batch_size = tf.shape(images)[0]
        patches = tf.image.extract_patches(
            images=images,
            sizes=[1, self.patch_size, self.patch_size, 1],
            strides=[1, self.patch_size, self.patch_size, 1],
            rates=[1, 1, 1, 1],
            padding="VALID"
        )
        # Static last dimension so downstream Dense layers know their input size.
        patch_dims = patches.shape[-1]
        # Collapse the 2-D grid of patches into a single sequence axis.
        patches = tf.reshape(patches, [batch_size, -1, patch_dims])
        return patches

    def get_config(self):
        """Return the layer config, including ``patch_size``, for serialization."""
        config = super().get_config()
        config.update({"patch_size": self.patch_size})
        return config

def mlp(x, hidden_units, dropout_rate):
    """Apply a stack of GELU Dense layers, each followed by Dropout.

    Args:
        x: Input passed to the first Dense layer.
        hidden_units: Iterable of layer widths; one Dense + Dropout pair is
            created per entry, in order.
        dropout_rate: Rate given to every Dropout layer.

    Returns:
        The output of the last Dropout layer, or ``x`` unchanged when
        ``hidden_units`` is empty.
    """
    output = x
    for width in hidden_units:
        dense = layers.Dense(width, activation=tf.nn.gelu)
        output = dense(output)
        dropout = layers.Dropout(dropout_rate)
        output = dropout(output)
    return output

class _AddPositionEmbedding(layers.Layer):
    """Add a learned per-position embedding to a sequence of patch embeddings.

    The Embedding lives inside a layer so that its weights are tracked by the
    enclosing model and updated during training.
    """

    def __init__(self, num_patches, projection_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_patches = num_patches
        self.projection_dim = projection_dim
        self.position_embedding = layers.Embedding(
            input_dim=num_patches, output_dim=projection_dim
        )

    def call(self, patch_embeddings):
        positions = tf.range(start=0, limit=self.num_patches, delta=1)
        # (num_patches, projection_dim) broadcasts over the leading batch axis.
        return patch_embeddings + self.position_embedding(positions)

    def get_config(self):
        config = super().get_config()
        config.update({
            "num_patches": self.num_patches,
            "projection_dim": self.projection_dim,
        })
        return config


def build_vit_model(input_shape, patch_size, num_patches, projection_dim, 
                     transformer_layers, num_heads, transformer_units, 
                     mlp_head_units, num_classes):
    """Build a Vision Transformer classifier as a Keras functional ``Model``.

    Pipeline: patch extraction -> linear projection -> learned position
    embedding -> ``transformer_layers`` pre-norm encoder blocks
    (self-attention + MLP, each with a residual connection) -> layer norm ->
    global average pooling -> MLP head -> softmax.

    Args:
        input_shape: Shape of one input image, ``(height, width, channels)``.
        patch_size: Side length of the square patches.
        num_patches: Number of patches per image; must equal
            ``(height // patch_size) * (width // patch_size)``.
        projection_dim: Width of the patch embeddings.
        transformer_layers: Number of encoder blocks.
        num_heads: Number of attention heads per block.
        transformer_units: Widths of the MLP inside each block. The last
            entry has to equal ``projection_dim`` for the residual ``Add``.
        mlp_head_units: Widths of the classification-head MLP.
        num_classes: Number of output classes.

    Returns:
        An uncompiled ``Model`` mapping images to per-class softmax
        probabilities.

    Raises:
        ValueError: If ``num_patches`` does not match the patch grid implied
            by ``input_shape`` and ``patch_size``.
    """
    height, width = input_shape[0], input_shape[1]
    if height is not None and width is not None:
        expected_patches = (height // patch_size) * (width // patch_size)
        if expected_patches != num_patches:
            raise ValueError(
                f"num_patches={num_patches} does not match the "
                f"{expected_patches} patches produced by input_shape="
                f"{tuple(input_shape)} with patch_size={patch_size}."
            )

    inputs = layers.Input(shape=input_shape)
    patches = PatchExtractor(patch_size)(inputs)
    patch_projection = layers.Dense(projection_dim)(patches)

    # Calling Embedding directly on an eager tf.range tensor would bake a
    # constant into the graph and leave the embedding untrained; wrapping it
    # in a layer keeps its weights part of the model.
    encoded_patches = _AddPositionEmbedding(num_patches, projection_dim)(patch_projection)

    for _ in range(transformer_layers):
        x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
        attention_output = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=projection_dim, dropout=0.1
        )(x1, x1)
        # Residual connection around the attention sub-block.
        x2 = layers.Add()([attention_output, encoded_patches])
        x3 = layers.LayerNormalization(epsilon=1e-6)(x2)
        x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1)
        # Residual connection around the MLP sub-block.
        encoded_patches = layers.Add()([x3, x2])

    representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches)
    # Average over the patch axis to get one feature vector per image.
    representation = layers.GlobalAveragePooling1D()(representation)
    features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.3)
    # Softmax output: these are probabilities, not raw logits.
    class_probabilities = layers.Dense(num_classes, activation="softmax")(features)

    return Model(inputs=inputs, outputs=class_probabilities)

def train_vit(training, validing, testing, num_classes, epochs=30):
    """Build, train, save and evaluate the Vision Transformer.

    Args:
        training: Dataset/generator passed to ``fit`` as training data.
        validing: Dataset/generator passed to ``fit`` as validation data.
        testing: Dataset/generator passed to ``evaluate``.
        num_classes: Number of output classes for the classifier head.
        epochs: Maximum number of training epochs.

    Returns:
        Tuple ``(vit_model, history)`` with the trained model and the object
        returned by ``fit``.

    Side effects:
        Writes the model to ``vision_transformer_model.h5`` and prints the
        test accuracy.
    """
    vit_model = build_vit_model(
        input_shape=(224, 224, 3),
        patch_size=16,
        num_patches=196,
        projection_dim=128,
        transformer_layers=6,
        num_heads=8,
        transformer_units=[256, 128],
        mlp_head_units=[512, 256],
        num_classes=num_classes,
    )
    vit_model.compile(
        optimizer=Adam(learning_rate=1e-4),
        loss="categorical_crossentropy",
        metrics=["accuracy"],
    )

    # Both callbacks watch validation loss: halve the learning rate after 5
    # stagnant epochs, stop (restoring the best weights) after 10.
    callbacks = [
        EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6),
    ]

    history = vit_model.fit(
        training,
        epochs=epochs,
        validation_data=validing,
        callbacks=callbacks,
    )
    vit_model.save("vision_transformer_model.h5")

    _, test_accuracy = vit_model.evaluate(testing)
    print(f"Test Accuracy: {test_accuracy:.4f}")
    return vit_model, history

def predict_vit(model, img_path, class_labels):
    """Classify a single image file and print the predicted label.

    Args:
        model: Object exposing ``predict`` that returns per-class scores for
            a batch of images.
        img_path: Path of the image to classify.
        class_labels: Sequence mapping class index to label.

    Returns:
        None. Prints either the prediction with its score, or a message when
        ``img_path`` does not exist.
    """
    if os.path.exists(img_path):
        loaded = image.load_img(img_path, target_size=(224, 224))
        # Scale pixel values to [0, 1] and add a leading batch axis.
        pixels = image.img_to_array(loaded) / 255.0
        batch = np.expand_dims(pixels, axis=0)
        scores = model.predict(batch)
        best = np.argmax(scores, axis=1)[0]
        print(f"Predicted: {class_labels[best]} ({scores[0][best]:.4f})")
    else:
        print(f"Image {img_path} not found.")

def main():
    """Load the pest dataset, train the ViT, and classify one sample image.

    Images are rescaled by 1/255 and resized to 224x224. The training
    directory is split 90/10 into training and validation subsets; the test
    directory is used for the final evaluation.
    """
    train_dir = 'Dataset/pest/train'
    test_dir = 'Dataset/pest/test'
    image_size = (224, 224)
    make_generator = tf.keras.preprocessing.image.ImageDataGenerator

    train_datagen = make_generator(rescale=1./255, validation_split=0.1)
    training = train_datagen.flow_from_directory(
        train_dir, batch_size=32, target_size=image_size, subset="training"
    )
    validing = train_datagen.flow_from_directory(
        train_dir, batch_size=32, target_size=image_size, subset='validation'
    )
    test_datagen = make_generator(rescale=1./255)
    testing = test_datagen.flow_from_directory(
        test_dir, batch_size=32, target_size=image_size
    )

    class_indices = training.class_indices
    vit_model, _history = train_vit(training, validing, testing, len(class_indices))

    img_test_path = 'Dataset/pest/test/beetle/jpg_33.jpg'
    predict_vit(vit_model, img_test_path, list(class_indices.keys()))

# Run the training/prediction pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()


Found 2430 images belonging to 9 classes.
Found 270 images belonging to 9 classes.
Found 450 images belonging to 9 classes.



  self._warn_if_super_not_called()


Epoch 1/30
76/76 ━━━━━━━━━━━━━━━━━━━━ 123s 1s/step - accuracy: 0.1134 - loss: 2.2269 - val_accuracy: 0.1852 - val_loss: 2.1696 - learning_rate: 1.0000e-04
Epoch 2/30
76/76 ━━━━━━━━━━━━━━━━━━━━ 563s 7s/step - accuracy: 0.1595 - loss: 2.1696 - val_accuracy: 0.1815 - val_loss: 2.1363 - learning_rate: 1.0000e-04
Epoch 3/30
76/76 ━━━━━━━━━━━━━━━━━━━━ 431s 6s/step - accuracy: 0.1845 - loss: 2.1333 - val_accuracy: 0.2074 - val_loss: 2.0700 - learning_rate: 1.0000e-04
Epoch 4/30
76/76 ━━━━━━━━━━━━━━━━━━━━ 116s 2s/step - accuracy: 0.2314 - loss: 2.0071 - val_accuracy: 0.2000 - val_loss: 1.9520 - learning_rate: 1.0000e-04
Epoch 5/30
76/76 ━━━━━━━━━━━━━━━━━━━━ 105s 1s/step - accuracy: 0.2910 - loss: 1.9119 - val_accuracy: 0.3037 - val_loss: 1.8046 - learning_rate: 1.0000e-04
Epoch 6/30
76/76 ━━━━━━━━━━━━━━━━━━━━



15/15 ━━━━━━━━━━━━━━━━━━━━ 7s 446ms/step - accuracy: 0.5484 - loss: 1.3485
Test Accuracy: 0.5222
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 650ms/step
Predicted: beetle (0.4742)
