In [None]:
import os
import glob
import pandas as pd
from imgaug import augmenters as iaa
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import cupy as np
from scipy.special import softmax
import cv2
import matplotlib.patches as patches
from tqdm import tqdm
#from src.pyESNN.pyESNcupy import ESN
import imageio.v3 as iio
from io import BytesIO 
import matplotlib.patches as patches
from sklearn.preprocessing import MinMaxScaler
from skimage.transform import resize
from src.utils.path import *

In [None]:
# Video / frame configuration.
WIDTH = 100   # frame width after resizing, in pixels
HEIGHT = 100  # frame height after resizing, in pixels
DIM = 3       # colour channels kept per frame before grayscale conversion
X_DIM = 3

# Integer ratio between the 800x600 reference size and the resized frame.
# NOTE(review): presumably 800x600 is the source video resolution — confirm.
WIDTH_FACTOR = 800 // WIDTH
HEIGHT_FACTOR = 600 // HEIGHT

# Echo state network configuration.
# Each frame is converted to grayscale and flattened, so the network sees one
# input per pixel. Deriving this from the frame size keeps W_in compatible with
# the feature vectors (the previous hard-coded 1000 did not match 100 * 100).
N_INPUTS = WIDTH * HEIGHT
N_OUTPUTS = 2  # Number of output dimensions (x and y coordinates)
N_RESERVOIR = 1000  # Number of reservoir neurons
SPECTRAL_RADIUS = 0.8  # Target spectral radius of the reservoir matrix

# The settings below belong to the commented-out pyESN-style constructor; the
# ESN class defined in this notebook does not take them.
INPUT_SCALING = 0.7
NOISE = 0.1
SPARSITY = 0.2
FEEDBACK_SCALING = 0.7
TEACHER_SCALING = 0.9
TEACHER_FORCING = True

In [4]:
import os
import random
from PIL import Image, ImageDraw

def generate_image_with_shape_and_bbox(shape_type, save_dir, bbox_file, index):
    """Draw one random shape on a white 100x100 image and log its center.

    The shape is outlined in black inside the square spanning
    ``(x, y)`` to ``(x + size, y + size)``. The image is saved as
    ``<shape_type>_<index>.png`` in ``save_dir`` and a line
    ``filename,center_x,center_y`` is appended to ``bbox_file``.

    Args:
        shape_type: One of ``'circle'``, ``'triangle'`` or ``'rectangle'``.
        save_dir: Existing directory the PNG is written to.
        bbox_file: CSV file the center coordinates are appended to.
        index: Number used in the output file name.

    Raises:
        ValueError: If ``shape_type`` is not a supported shape; previously a
            blank image was saved and labelled as if it contained a shape.
    """
    if shape_type not in ('circle', 'triangle', 'rectangle'):
        raise ValueError(f'Unsupported shape type: {shape_type!r}')

    width, height = 100, 100

    img = Image.new('RGB', (width, height), 'white')
    draw = ImageDraw.Draw(img)
    x = random.randint(15, width - 45)
    y = random.randint(15, height - 45)
    size = random.randint(20, 40)

    # Center of the square that bounds the shape.
    center_x = x + size // 2
    center_y = y + size // 2

    if shape_type == 'circle':
        draw.ellipse([(x, y), (x + size, y + size)], outline='black', width=2)
    elif shape_type == 'triangle':
        # Base on the bottom edge of the square, apex on the top edge. The old
        # apex at ``y - size`` put the triangle above the labelled square (and
        # possibly above the canvas), so the recorded center was wrong.
        draw.polygon(
            [(x, y + size), (x + size, y + size), (x + size // 2, y)],
            outline='black',
            width=2,
        )
    else:  # 'rectangle'
        draw.rectangle([(x, y), (x + size, y + size)], outline='black', width=2)

    file_name = f'{shape_type}_{index}.png'
    img.save(os.path.join(save_dir, file_name))

    with open(bbox_file, 'a') as f:
        f.write(f'{file_name},{center_x},{center_y}\n')

def main():
    """Generate the synthetic shape dataset and its center-coordinate CSV.

    Creates the image and bounding-box directories, (re)writes the CSV header
    and then produces ten images per shape, each in its own sub-directory.
    """
    num_images_per_shape = 10
    main_dir = 'assets/new_images_shape_bbox/images'
    bbox_dir = 'assets/new_images_shape_bbox/bounding_boxes'

    for directory in (main_dir, bbox_dir):
        os.makedirs(directory, exist_ok=True)

    # Opening with 'w' truncates any previous run's annotations.
    bbox_file = os.path.join(bbox_dir, 'bounding_boxes.csv')
    with open(bbox_file, 'w') as f:
        f.write('filename,x,y\n')

    for shape in ('circle', 'triangle', 'rectangle'):
        shape_dir = os.path.join(main_dir, shape)
        os.makedirs(shape_dir, exist_ok=True)

        for index in range(num_images_per_shape):
            generate_image_with_shape_and_bbox(shape, shape_dir, bbox_file, index)


if __name__ == "__main__":
    main()


In [5]:
class ESN:
    """Minimal leaky-integrator echo state network with a linear readout.

    The reservoir state is updated as
    ``x <- (1 - alpha) * x + alpha * tanh(W_in @ [1, u] + W_res @ x)``
    and the readout is fitted with a pseudo-inverse least-squares solve.

    Args:
        input_size: Number of features per input sample.
        reservoir_size: Number of reservoir units.
        output_size: Number of readout targets.
        spectral_radius: Largest absolute eigenvalue ``W_res`` is rescaled to.
        alpha: Leak rate of the state update.
    """

    def __init__(self, input_size, reservoir_size, output_size, spectral_radius=0.9, alpha=0.99):
        self.input_size = input_size
        self.reservoir_size = reservoir_size
        self.output_size = output_size
        self.spectral_radius = spectral_radius
        self.alpha = alpha

        # Uniform weights in [-0.5, 0.5). Only W_in carries a bias column.
        self.W_in = np.random.rand(reservoir_size, input_size + 1) - 0.5
        self.W_res = np.random.rand(reservoir_size, reservoir_size) - 0.5
        # Shaped (reservoir, output) to match what train() stores and what
        # predict() multiplies with, so predict() also works before training.
        self.W_out = np.random.rand(reservoir_size, output_size) - 0.5

        # NOTE(review): `np` is CuPy in this notebook — confirm the installed
        # CuPy version provides linalg.eigvals.
        self.W_res *= spectral_radius / np.max(np.abs(np.linalg.eigvals(self.W_res)))

    def _collect_states(self, inputs):
        """Run the reservoir over ``inputs`` from a zero state; return all states."""
        inputs = np.concatenate((np.ones((len(inputs), 1)), inputs), axis=1)  # prepend bias input
        states = np.zeros((len(inputs), self.reservoir_size))
        x = np.zeros(self.reservoir_size)

        for t in range(len(inputs)):
            pre_activation = np.dot(self.W_in, inputs[t]) + np.dot(self.W_res, x)
            x = (1 - self.alpha) * x + self.alpha * np.tanh(pre_activation)
            states[t] = x

        return states

    def train(self, X_train, y_train, transient=100):
        """Fit the readout weights on a sequence.

        Args:
            X_train: Array of shape (n_samples, input_size).
            y_train: Array of shape (n_samples, output_size).
            transient: Number of initial time steps discarded as warm-up.

        Raises:
            ValueError: If no samples remain after discarding the transient.
        """
        if len(X_train) <= transient:
            raise ValueError(
                f'Need more than {transient} samples to train, got {len(X_train)}'
            )

        states = self._collect_states(X_train)
        # Every state from index `transient` onwards is used; the previous
        # `t > transient` test left the first post-transient row as zeros.
        self.W_out = np.dot(np.linalg.pinv(states[transient:]), y_train[transient:])

    def fit(self, X_train, y_train, transient=100):
        """Alias of :meth:`train` for callers that use the ``fit`` name."""
        return self.train(X_train, y_train, transient=transient)

    def predict(self, X_test):
        """Return readout outputs of shape (n_samples, output_size) for ``X_test``.

        The reservoir always starts from a zero state.
        """
        return np.dot(self._collect_states(X_test), self.W_out)

    def identity(self, x):
        """Return ``softmax(x)``.

        Despite its name this is not an identity mapping; the behaviour is
        kept unchanged. NOTE(review): ``softmax`` comes from SciPy — confirm
        it accepts the array type passed here.
        """
        return softmax(x)

In [6]:
def draw_bounding_boxes_from_array(video_path, bounding_boxes, output_video_path):
    """Copy a video, drawing one fixed-size green box per frame.

    Args:
        video_path: Path of the input video.
        bounding_boxes: Sequence of ``(x, y)`` pairs, one per frame, used as
            the top-left corner of a 30x30 box. Frames beyond the end of the
            sequence are copied without a box.
        output_video_path: Path of the video to write. ``.mp4`` outputs are
            encoded with ``mp4v``; any other extension keeps ``XVID``.

    Raises:
        IOError: If the input video cannot be opened.
    """
    box_width, box_height = 30, 30  # Assumed width and height

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {video_path}")

    out = None
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # Match the codec to the container: XVID targets AVI files.
        extension = os.path.splitext(output_video_path)[1].lower()
        codec = 'mp4v' if extension == '.mp4' else 'XVID'
        fourcc = cv2.VideoWriter_fourcc(*codec)
        out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count < len(bounding_boxes):
                x, y = bounding_boxes[frame_count]
                x, y = int(x), int(y)
                cv2.rectangle(frame, (x, y), (x + box_width, y + box_height), (0, 255, 0), 2)

            out.write(frame)
            frame_count += 1
    finally:
        # Release both handles even if reading or writing fails part-way.
        cap.release()
        if out is not None:
            out.release()

    print(f"Bounding boxes added and new video saved to: {output_video_path}")

def read_png_image(file_path, name, i):
    """Load one randomly augmented frame and its down-scaled target coordinates.

    Args:
        file_path: Path of a PNG whose base name has the form
            ``<prefix>_<number>.png``; ``number`` selects the CSV row.
        name: Unused; kept so existing callers keep working.
        i: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(image, x, y)``: the resized image divided by 255, and the
        row's coordinates floor-divided by ``WIDTH_FACTOR`` / ``HEIGHT_FACTOR``.
    """
    file_name = os.path.basename(file_path)
    image_array = random_augment(file_path)
    # skimage's resize expects output_shape as (rows, cols), i.e. (HEIGHT, WIDTH).
    image_array = resize(image_array, (HEIGHT, WIDTH), anti_aliasing=True)

    # Alpha-channel removal is currently disabled:
    #image_array = np.flipud(image_array[:,:,:3])

    number = int(file_name.split("_")[1].split(".")[0])

    # NOTE(review): main() in this notebook writes
    # 'assets/new_images_shape_bbox/bounding_boxes/bounding_boxes.csv' with
    # columns 'filename,x,y'; this reads a different path and column names —
    # confirm which annotation file is intended.
    df = pd.read_csv('assets/new_images_shape_bbox/bounding_boxes.csv')
    x = df['X-coordinate']
    y = df['Y-coordinate']

    return image_array/255.0, x[number]//WIDTH_FACTOR, y[number]//HEIGHT_FACTOR

def plot_image(img, bb, save_path):
    """Save ``img`` with a red 10x10 rectangle derived from ``bb`` to ``save_path``.

    The rectangle's corner is ``bb`` shifted by 30 and floor-divided by the
    width/height factors.
    """
    corner_x = (bb[0] - 30) // WIDTH_FACTOR
    corner_y = (bb[1] - 30) // HEIGHT_FACTOR

    fig, ax = plt.subplots()
    ax.imshow(img)
    ax.add_patch(
        patches.Rectangle(
            (corner_x, corner_y), 10, 10,
            linewidth=2, edgecolor='r', facecolor='none',
        )
    )

    fig.savefig(save_path)
    plt.close(fig)

def zoom(image):
    """Return ``image`` scaled by a random factor drawn from (1, 1.3)."""
    augmenter = iaa.Affine(scale=(1, 1.3))
    return augmenter.augment_image(image)

def pan(image):
    """Return ``image`` translated by up to 8% of its size along each axis."""
    shift_range = {"x": (-0.08, 0.08), "y": (-0.08, 0.08)}
    augmenter = iaa.Affine(translate_percent=shift_range)
    return augmenter.augment_image(image)

def img_random_brightness(image):
    """Return ``image`` with pixel values multiplied by a random factor in (0.2, 1.2)."""
    # The range leans towards darkening: in the author's experience models
    # react better to darker images.
    augmenter = iaa.Multiply((0.2, 1.2))
    return augmenter.augment_image(image)

def random_augment(image):
    """Load an image file and randomly apply pan, zoom and brightness changes.

    Each augmentation is applied independently with probability 0.5, in the
    order pan, zoom, brightness.

    Args:
        image: Path of the image file to read with ``mpimg.imread``.

    Returns:
        The loaded, possibly augmented, image array.
    """
    image = mpimg.imread(image)

    # Use the host-side RNG for the coin flips: `np` is CuPy in this notebook,
    # so np.random.rand() would launch GPU work and force a device-to-host
    # sync just to produce one scalar.
    for augment in (pan, zoom, img_random_brightness):
        if random.random() < 0.5:
            image = augment(image)

    return image

In [None]:
# Visual check of the zoom augmentation on one fixed sample frame.
#image = image_paths[random.randint(0, 35)]
image = 'assets/original_frames/moving_circle_0/frame_0020.png'
original_image = mpimg.imread(image)
zoomed_image = zoom(original_image)

# Side-by-side comparison: original on the left, augmented on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 10))
fig.tight_layout()
axs[0].imshow(original_image)
axs[0].set_title('Original Image')

axs[1].imshow(zoomed_image)
axs[1].set_title('Zoomed Image')

In [None]:
# Visual check of the pan (translation) augmentation on one fixed sample frame.
image = 'assets/original_frames/moving_circle_0/frame_0070.png'
original_image = mpimg.imread(image)
panned_image = pan(original_image)

# Side-by-side comparison: original on the left, augmented on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 10))
fig.tight_layout()

axs[0].imshow(original_image)
axs[0].set_title('Original Image')

axs[1].imshow(panned_image)
axs[1].set_title('Panned Image')

In [None]:
# Visual check of the brightness augmentation on one fixed sample frame.
image = 'assets/original_frames/moving_circle_0/frame_0090.png'
original_image = mpimg.imread(image)
brightness_altered_image = img_random_brightness(original_image)

# Side-by-side comparison: original on the left, augmented on the right.
fig, axs = plt.subplots(1, 2, figsize=(15, 10))
fig.tight_layout()

axs[0].imshow(original_image)
axs[0].set_title('Original Image')

axs[1].imshow(brightness_altered_image)
axs[1].set_title('brightness Altered Image')

In [None]:
# Collect every frame of every source video, in frame order, together with
# its target coordinates.
# Alternative multi-shape setup: shapes = {'circle':4, 'rect':3, 'star':4}
shapes = {'circle':15}  # shape name -> number of videos of that shape
X, y_bb = [], []


def _frame_number(path):
    """Return the integer after the last underscore of ``path``, before the first dot."""
    return int(path.split('_')[-1].split('.')[0])


for shape_name, n_videos in shapes.items():
    for video_idx in tqdm(range(n_videos), desc='Preprocessing images...'):
        frames_dir = os.path.join(os.getcwd(), f'assets/original_frames/moving_{shape_name}_{video_idx}')
        frame_paths = glob.glob(os.path.join(frames_dir, '*.png'))

        for frame_path in sorted(frame_paths, key=_frame_number):
            frame_pixels, target_x, target_y = read_png_image(frame_path, shape_name, video_idx)

            X.append(frame_pixels)
            y_bb.append([target_x, target_y])
print('Preprocessing done!\n')

In [None]:
# Stack the collected samples and flatten each frame to (pixels, channels).
y_bb = np.array(y_bb)
X = np.reshape(np.array(X), (len(X), HEIGHT*WIDTH, DIM))

# Weighted channel sum -> one grayscale value per pixel, then restore a
# trailing singleton channel axis.
luma_weights = np.array([0.2989, 0.5870, 0.1140])
X_gray = np.dot(X[..., :DIM], luma_weights)
X = X_gray[..., None]

# Min-max scale the targets; .get() copies them to host memory for scikit-learn.
scaler = MinMaxScaler()
scaled_bb = scaler.fit_transform(y_bb.get())

print(f'shape of X: {X.shape}, shape of y_bb:{y_bb.shape}')

In [None]:
# Chronological split: the first 98% of the samples train the model, the
# remainder is held out.
n_samples = len(X)
train_idx = int(n_samples * 0.98)

X_train = X[:train_idx]
X_test = X[train_idx:]

y_bb_train = scaled_bb[:train_idx]
y_bb_test = scaled_bb[train_idx:]

In [None]:
"""
Build the model
"""
'''
# Create the Echo State Network (ESN)
esn = ESN(n_inputs=N_INPUTS, 
          n_outputs=N_OUTPUTS, 
          n_reservoir=N_RESERVOIR,
          spectral_radius=SPECTRAL_RADIUS, 
          input_scaling=INPUT_SCALING,
          noise=NOISE,
          sparsity=SPARSITY,
          feedback_scaling=FEEDBACK_SCALING,
          teacher_scaling=TEACHER_SCALING,
          teacher_forcing=TEACHER_FORCING, 
          silent=False)
'''
# Instantiate the notebook's own ESN; alpha=1 makes each state update replace
# the previous state entirely (no leak term).
esn = ESN(
    input_size=N_INPUTS,
    reservoir_size=N_RESERVOIR,
    output_size=N_OUTPUTS,
    spectral_radius=SPECTRAL_RADIUS,
    alpha=1,
)

In [None]:
# train() is the fitting entry point of the ESN class defined in this notebook.
# Each sample is flattened to a single feature vector first.
esn.train(np.array(X_train).reshape(len(X_train), -1), np.array(y_bb_train))
print('Model fitted!\n')

## Inference

In [None]:
# Inference on the entire video
video_directory = os.path.join(ORIGINAL_VAL_VIDEOS_DIR, 'moving_circle_3.mp4')
output_video_path = os.path.join(PREDICTED_VIDEOS_DIR, 'moving_circle_val_0.mp4')

# Open video file
cap = cv2.VideoCapture(video_directory)

frames = []
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV decodes frames as BGR, while the training images were read
        # with matplotlib (RGB) and the grayscale weights below are in RGB
        # order. Convert so inference sees the same channel layout. This also
        # replaces the former in-memory PNG round trip, which returned the
        # frame unchanged.
        image_array = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # skimage's resize expects output_shape as (rows, cols).
        image_array = resize(image_array, (HEIGHT, WIDTH), anti_aliasing=True)

        # Same extra division by 255 as the training preprocessing, so the
        # features stay on the scale the model was fitted on.
        frames.append(image_array/255.0)
finally:
    # Release the capture even if decoding or resizing fails.
    cap.release()

if not frames:
    raise RuntimeError(f'No frames could be read from {video_directory}')

# Convert frames to an array and apply the training-time grayscale conversion
frames = np.array(frames)
frames = np.reshape(frames, (len(frames), HEIGHT*WIDTH, DIM))
X_gray = np.dot(frames[...,:DIM], np.array([0.2989, 0.5870, 0.1140]))
frames = np.expand_dims(X_gray, axis=-1)

# Predict bounding box coordinates for the entire video
predictions = esn.predict(frames.reshape(len(frames), -1))

# Copy predictions to host memory and undo the min-max scaling
predictions  = predictions.get()

bb = scaler.inverse_transform(predictions)
# Scale back to source-video pixels; abs() avoids negative coordinates
for row in bb:
    row[0] = int(abs(row[0]) * WIDTH_FACTOR)
    row[1] = int(abs(row[1]) * HEIGHT_FACTOR)
print(bb)

# Draw bounding boxes on the video
draw_bounding_boxes_from_array(video_directory, bb, output_video_path)