# Libraries / Dependencies

In [None]:
%pip install opencv

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import os
import cv2  # For image processing
import xml.etree.ElementTree as ET  # For parsing PASCAL VOC annotation files


2023-11-20 16:40:29.502762: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Variables

In [2]:
# Some constants
RGB_MAX = 255.0
base_dir = '/tf/workspace/widget-classification/'

In [3]:
# Paths for data
base_data_dir = os.path.join(base_dir, 'data')
image_base_data_dir = os.path.join(base_data_dir, 'raw')
annotation_base_data_dir = os.path.join(base_data_dir, 'labels')
extensions = ['.jpg', '.xml']
class_names = ['button', 'checkbox', 'label', 'slider', 'switch']
data_dicts = {class_name:{'img': os.path.join(image_base_data_dir, class_name), 'label':os.path.join(annotation_base_data_dir, class_name)} for class_name in class_names}
data_dicts

{'button': {'img': '/tf/workspace/widget-classification/data/raw/button',
  'label': '/tf/workspace/widget-classification/data/labels/button'},
 'checkbox': {'img': '/tf/workspace/widget-classification/data/raw/checkbox',
  'label': '/tf/workspace/widget-classification/data/labels/checkbox'},
 'label': {'img': '/tf/workspace/widget-classification/data/raw/label',
  'label': '/tf/workspace/widget-classification/data/labels/label'},
 'slider': {'img': '/tf/workspace/widget-classification/data/raw/slider',
  'label': '/tf/workspace/widget-classification/data/labels/slider'},
 'switch': {'img': '/tf/workspace/widget-classification/data/raw/switch',
  'label': '/tf/workspace/widget-classification/data/labels/switch'}}

In [4]:
# Paths for the training and validation images
base_test_dir = os.path.join(base_dir, 'test')
image_base_test_dir = os.path.join(base_test_dir, 'raw')
annotation_base_test_dir = os.path.join(base_test_dir, 'labels')
extensions = ['.jpg', '.xml']
class_names = ['button', 'checkbox', 'label', 'slider', 'switch']
test_dicts = {class_name:{'img': os.path.join(image_base_test_dir, class_name), 'label':os.path.join(annotation_base_test_dir, class_name)} for class_name in class_names}
test_dicts

{'button': {'img': '/tf/workspace/widget-classification/test/raw/button',
  'label': '/tf/workspace/widget-classification/test/labels/button'},
 'checkbox': {'img': '/tf/workspace/widget-classification/test/raw/checkbox',
  'label': '/tf/workspace/widget-classification/test/labels/checkbox'},
 'label': {'img': '/tf/workspace/widget-classification/test/raw/label',
  'label': '/tf/workspace/widget-classification/test/labels/label'},
 'slider': {'img': '/tf/workspace/widget-classification/test/raw/slider',
  'label': '/tf/workspace/widget-classification/test/labels/slider'},
 'switch': {'img': '/tf/workspace/widget-classification/test/raw/switch',
  'label': '/tf/workspace/widget-classification/test/labels/switch'}}

In [5]:
data_choice = data_dicts['button']

In [6]:
test_choice = test_dicts['button']

# Load & Preprocess

## Load images

In [7]:
def load_images_from_folder(folder, size=(224, 224)):
    images = []
    for filename in os.listdir(folder):
        img = cv2.imread(os.path.join(folder, filename))
        if img is not None:
            img = cv2.resize(img, size)  # Resize the image
            images.append(img)
    return images


In [8]:
images = load_images_from_folder(data_choice['img'])
x_train = np.array(images, dtype='float32') / RGB_MAX  # Normalize the images

## Load annotations

In [9]:
def load_annotations(folder):
    annotations = []
    for filename in os.listdir(folder):
        if filename.endswith(extensions[1]):
            tree = ET.parse(os.path.join(folder, filename))
            root = tree.getroot()
            for obj in root.iter('object'):
                bbox = obj.find('bndbox')
                xmin = int(bbox.find('xmin').text)
                ymin = int(bbox.find('ymin').text)
                xmax = int(bbox.find('xmax').text)
                ymax = int(bbox.find('ymax').text)
                annotations.append([xmin, ymin, xmax, ymax])
    return annotations

In [10]:
y_train = np.array(load_annotations(data_choice['label']))

----
# State

1. Importing Libraries: You've imported TensorFlow, Keras, NumPy, os, cv2 (for image processing), and ElementTree (for parsing XML).

2. Setting Up Directories and Class Names: You've defined the base directories for images and annotations and identified the widget classes.

3. Loading Images: You've created a function load_images_from_folder to load and resize images from a specified folder.

4. Image Normalization: The images are normalized by dividing pixel values by 255, bringing them into the range [0, 1].

5. Loading Annotations: You've also written a function load_annotations to load bounding box annotations from XML files.

6. Preparing Data: You've loaded and prepared your training data (x_train for images, y_train for annotations).

7. Model Initialization: You've initialized a Keras Sequential model but haven't added any layers yet.
----

# Define the CNN model

- **Conv2D Layers:** These layers extract features from the images using filters. The first layer specifies the input shape (224x224 with 3 channels for color images).
- **MaxPooling2D:** These layers reduce the spatial size to decrease the number of parameters, reducing computation and overfitting.
- **Flatten:** Converts 2D feature maps to a 1D vector for the dense layers.
- **Dense:** Fully connected layers that perform classification based on the features extracted by the Conv2D layers.
- **Output Layer:** Outputs the coordinates of the bounding box. We use a linear activation function since this is a regression problem (predicting coordinates).

In [11]:
model = keras.Sequential([
    # Add CNN layers
     # First convolutional layer, with 32 filters and a kernel size of 3x3
    layers.Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(224, 224, 3)),
    # Max pooling layer to reduce the spatial dimensions
    layers.MaxPooling2D(pool_size=(2, 2)),
    
    # Second convolutional layer, increasing the depth of features
    layers.Conv2D(64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Flatten the feature maps to a 1D vector
    layers.Flatten(),

    # Dense (fully connected) layer for classification
    layers.Dense(128, activation='relu'),

    # Output layer - 4 units for (xmin, ymin, xmax, ymax)
    layers.Dense(4, activation='linear')  # 'linear' for regression-type output
])

  super().__init__(
2023-11-20 16:40:33.200748: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:280] failed call to cuInit: UNKNOWN ERROR (34)


## Compile model

The model is compiled with the Adam optimizer and Mean Squared Error (MSE) loss, which is appropriate for regression problems like predicting bounding box coordinates.

In [17]:
model.compile(optimizer='adam', loss='mse')  # Mean Squared Error for regression
model.summary()

# Train the model

In [13]:
model.fit(x_train, y_train, batch_size=32, epochs=10)

Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 22011.8184
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 739ms/step - loss: 12470.1729
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 830ms/step - loss: 4650.3569
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 762ms/step - loss: 4171.0596
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 762ms/step - loss: 7369.3755
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 765ms/step - loss: 5560.3369
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 801ms/step - loss: 3313.7444
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 778ms/step - loss: 2725.6721
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 728ms/step - loss: 3283.6655
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 803ms/ste

<keras.src.callbacks.history.History at 0x7f3177c11c10>

# Evaluate

## Load test data

In [14]:
images = load_images_from_folder(test_choice['img'])
x_test = np.array(images, dtype='float32') / RGB_MAX  # Normalize the images

In [15]:
y_test = np.array(load_annotations(test_choice['label']))

## Evaluate model on test data

In [16]:
# x_test: array of test images, y_test: corresponding bounding box annotations

loss = model.evaluate(x_test, y_test)
print("Test loss:", loss)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 20
'y' sizes: 30


# Analyze model performance


# Visualization

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

def plot_image_with_bbox(image, bbox):
    """ Plot an image with a bounding box. """
    fig, ax = plt.subplots(1)
    ax.imshow(image.astype('uint8'))

    # Create a Rectangle patch
    rect = patches.Rectangle((bbox[0], bbox[1]), bbox[2]-bbox[0], bbox[3]-bbox[1], linewidth=1, edgecolor='r', facecolor='none')
    
    # Add the patch to the Axes
    ax.add_patch(rect)
    plt.show()

# Test with one image
test_image = x_test[0]
predicted_bbox = model.predict(np.array([test_image]))[0]

plot_image_with_bbox(test_image, predicted_bbox)

# Save model

In [None]:
model.save('widget_localization_model.h5')