In [None]:
import cv2
import numpy as np
import torch
import datetime
from matplotlib import pyplot as plt
    
def write_video(filename, frames, width, height, fps, grayscale=False):
    """Encode a sequence of frames to an MP4V video file.

    Args:
        filename: Destination path handed to ``cv2.VideoWriter``.
        frames: Iterable/array of frames; values are rounded, clipped to
            [0, 255] and cast to ``uint8`` before being written.
        width, height: Frame size passed to the writer as ``(width, height)``.
        fps: Frame rate of the output video.
        grayscale: When true the writer is opened with ``isColor=0``.
    """
    fourcc = cv2.VideoWriter_fourcc(*"MP4V")
    if grayscale:
        writer = cv2.VideoWriter(filename, fourcc, fps, (width, height), 0)
    else:
        writer = cv2.VideoWriter(filename, fourcc, fps, (width, height))

    # Release the writer even if conversion or writing fails, so the output
    # file handle is never left dangling.
    try:
        for frame in np.clip(np.around(frames), 0, 255).astype(np.uint8):
            writer.write(frame)
    finally:
        writer.release()

def show_video(frames, imduration=int(1000/24.0)):
    """Display frames one by one in an OpenCV window named 'frame'.

    Each frame stays up for ``imduration`` milliseconds; pressing 'q' stops
    playback early. All OpenCV windows are closed afterwards.
    """
    quit_key = ord('q')
    for image in frames:
        cv2.imshow('frame', image)
        pressed = cv2.waitKey(imduration) & 0xFF
        if pressed == quit_key:
            break

    cv2.destroyAllWindows()

def reconstruction_error(frames1, frames2):
    """Root-mean-square error between two equally shaped frame arrays.

    Returns ``-1`` when the shapes differ. The difference is computed in
    float64: with ``uint8`` inputs (the usual dtype of video frames) a plain
    ``frames1 - frames2`` and the subsequent squaring wrap around modulo 256
    and silently produce a wrong error.
    """
    if frames1.shape != frames2.shape:
        return -1
    diff = np.asarray(frames1, dtype=np.float64) - np.asarray(frames2, dtype=np.float64)
    return np.sqrt(np.mean(diff**2))

def crit(output, gt):
    """Root-mean-square error between two tensors (scalar tensor result)."""
    residual = output - gt
    return residual.pow(2).mean().sqrt()

def normalize_frames(frames, **kwargs):
    """Centre and scale ``frames``.

    Keyword Args:
        mean: Value to subtract; defaults to the global mean of ``frames``.
        std: Value to divide by; defaults to the global standard deviation
            of the already centred frames.
    """
    if 'mean' in kwargs:
        center = kwargs['mean']
    else:
        center = np.mean(frames)
    centered = frames - center

    if 'std' in kwargs:
        spread = kwargs['std']
    else:
        spread = np.std(centered)

    return centered / spread

# Matplotlib format strings: 7 colours x 4 marker variants = 28 distinct styles.
styles = ['C'+str(c)+'-'+s for s in ['', '.', 'o', '^'] for c in [0, 1, 2, 3, 6, 8, 9] ]
def plot(x, ys, **kwargs):
    """Plot several curves that share the same x values on a new figure.

    Args:
        x: Common x coordinates.
        ys: Sequence of y series.

    Keyword Args:
        fontsize: Global matplotlib font size (default 12).
        figsize: Figure size (default ``(15, 10)``).
        xlabel, ylabel, title: Axis labels / title.
        yrange: ``(low, high)`` limits for the y axis.
        bound_to_plot: ``(epoch, max_error)``; only series whose maximum from
            index ``epoch`` onwards is below ``max_error`` are drawn.
        labels: One legend label per series. When given, every curve uses an
            entry of ``styles``; styles are reused cyclically when there are
            more curves than styles.
    """
    labels = kwargs.get('labels')

    plt.rcParams.update({'font.size': kwargs.get('fontsize', 12)})
    plt.figure(figsize=kwargs.get('figsize', (15, 10)))

    if 'xlabel' in kwargs:
        plt.xlabel(kwargs['xlabel'])
    if 'ylabel' in kwargs:
        plt.ylabel(kwargs['ylabel'])

    if 'yrange' in kwargs:
        low, high = kwargs['yrange']
        plt.ylim(low, high)

    if 'bound_to_plot' in kwargs:
        epoch, max_error = kwargs['bound_to_plot']
        keep = [max(y[epoch:]) < max_error for y in ys]
        # Drop the labels of discarded series too, otherwise the legend would
        # attach the wrong label to every curve after the first dropped one.
        if labels is not None:
            labels = [label for label, kept in zip(labels, keep) if kept]
        ys = [y for y, kept in zip(ys, keep) if kept]

    if len(ys) > len(styles):
        print('Duplicate styles')

    if labels is not None:
        for i, (y, label) in enumerate(zip(ys, labels)):
            # Wrap around instead of raising IndexError past the last style.
            plt.plot(x, y, styles[i % len(styles)], label=label)
        plt.legend()
    else:
        for y in ys:
            plt.plot(x, y)

    if 'title' in kwargs:
        plt.title(kwargs['title'])


def sec2string(sec):
    """Format a duration in seconds for display.

    Durations of at most 60 seconds are returned as a number rounded to two
    decimals. Longer durations are rounded to whole seconds and returned as
    an ``[H:]MM:SS``-style string without leading zero fields, e.g. ``395``
    gives ``"6:35"`` and ``600`` gives ``"10:00"``.
    """
    if sec <= 60:
        return round(sec, 2)
    secr = round(sec)

    # lstrip, not strip: str.strip takes a *set of characters* and also eats
    # trailing zeros/colons ("0:06:40" would become "6:4", "0:10:00" -> "1").
    return str(datetime.timedelta(seconds=secr)).lstrip("0:")
import numpy as np # for prod
import torch
from torch import nn

class PCAAutoEncoder(nn.Module):
    """Purely linear auto-encoder (one linear map down, one linear map up).

    Args:
        shape: Per-sample shape of the input (everything after the batch
            dimension); the output is reshaped back to it.
        ncomp: Size of the latent representation.
    """
    def __init__(self, shape, ncomp):
        super().__init__()
        # Plain int so nn.Linear stores a regular Python integer.
        infeatures = int(np.prod(shape))
        self.shape = shape
        self.to_lower_rep = nn.Linear(infeatures, ncomp)
        self.from_lower_rep = nn.Linear(ncomp, infeatures)

    def forward(self, x):
        """Encode and decode ``x`` of shape ``(batch, *shape)``."""
        # reshape (not view) so non-contiguous inputs, e.g. permuted colour
        # frames, are flattened instead of raising a RuntimeError.
        x = x.reshape(x.shape[0], -1)
        x = self.from_lower_rep(self.to_lower_rep(x))

        return x.view(x.shape[0], *self.shape)

class OneHAutoEncoder(nn.Module):
    """Fully connected auto-encoder with one hidden layer on each side.

    Args:
        shape: Per-sample shape of the input (everything after the batch
            dimension); the output is reshaped back to it.
        ncomp: Size of the latent representation.
        nl: Non-linearity class instantiated between the linear layers.
        hidden_dim: Width of the hidden layers (default 200, the value that
            was previously hard-coded).
    """
    def __init__(self, shape, ncomp, nl=nn.ReLU, hidden_dim=200):
        super().__init__()
        infeatures = int(np.prod(shape))
        self.shape = shape
        self.ncomp = ncomp
        self.hidden_dim = hidden_dim
        self.to_lower_rep = nn.Sequential(nn.Linear(infeatures, self.hidden_dim),
                                          nl(),
                                          nn.Linear(self.hidden_dim, ncomp))
        self.from_lower_rep = nn.Sequential(nn.Linear(ncomp, self.hidden_dim),
                                            nl(),
                                            nn.Linear(self.hidden_dim, infeatures))

    def forward(self, x):
        """Encode and decode ``x`` of shape ``(batch, *shape)``."""
        # reshape (not view) so non-contiguous inputs are accepted too.
        x = x.reshape(x.shape[0], -1)
        x = self.from_lower_rep(self.to_lower_rep(x))

        return x.view(x.shape[0], *self.shape)
    
class SpatialConvAE(nn.Module):
    """2-D convolutional auto-encoder with a linear bottleneck.

    The bottleneck is hard-wired to an 8x8 feature map, which is what the
    three encoder convolutions produce for 256x256 inputs
    (256 -> 47 -> 13 -> 8); other input sizes will not fit ``encoder_lin``.

    Args:
        inchannels: Number of input (and output) channels.
        ncomp: Size of the latent representation.
        nl: Non-linearity class instantiated after each convolution.
        chans: Three channel counts for the encoder convolutions (the decoder
            mirrors them).
    """
    def __init__(self, inchannels, ncomp, nl=nn.ReLU, chans=(128, 128, 64)):
        super().__init__()
        self.ncomp = ncomp
        # Copy: the previous mutable list default (and any caller-owned list)
        # was stored by reference and could be modified behind the model's back.
        self.chans = list(chans)
        c0, c1, c2 = self.chans[0], self.chans[1], self.chans[2]

        self.encoder_convs = nn.Sequential(nn.Conv2d(inchannels, c0, kernel_size=26, stride=5), nl(), # 47
                                           nn.Conv2d(c0, c1, kernel_size=11, stride=3), nl(), # 13
                                           nn.Conv2d(c1, c2, kernel_size=6), nl()) # 8

        self.encoder_lin = nn.Linear(c2*8*8, ncomp)
        self.decoder_lin = nn.Linear(ncomp, c2*8*8)

        self.decoder_convs = nn.Sequential(nn.ConvTranspose2d(c2, c1, kernel_size=6), nl(),
                                           nn.ConvTranspose2d(c1, c0, kernel_size=11, stride=3), nl(),
                                           nn.ConvTranspose2d(c0, inchannels, kernel_size=26, stride=5))


    def forward(self, x):
        """Reconstruct ``x`` through the convolutional/linear bottleneck."""
        x = self.encoder_convs(x)
        x = x.view(x.shape[0], -1)
        x = self.encoder_lin(x)
        x = self.decoder_lin(x)
        x = x.view(x.shape[0], self.chans[2], 8, 8)
        x = self.decoder_convs(x)

        return x
    
class TemporalConvAE(nn.Module):
    """3-D convolutional auto-encoder built from up to five mirrored layers.

    Args:
        inchannels: Channels of the input volume.
        nlayers: How many of the five predefined layers to use.
        layerchans: Channel count shared by every convolutional layer.
    """
    def __init__(self, inchannels, nlayers, layerchans):
        super().__init__()
        self.inchannels = inchannels
        self.layerchans = layerchans

        # Layers after the first downsample only spatially, never in depth.
        spatial_stride = (1, 2, 2)

        # (in_channels, out_channels, kernel_size, stride) per encoder layer.
        # Trailing notes: output shape for a (1, inchannels, 64, 256, 256) input.
        layer_specs = [
            (inchannels, layerchans, 8, 2),               # (1, c, 29, 125, 125)
            (layerchans, layerchans, 7, spatial_stride),  # (1, c, 23, 60, 60)
            (layerchans, layerchans, 8, spatial_stride),  # (1, c, 16, 27, 27)
            (layerchans, layerchans, 7, spatial_stride),  # (1, c, 10, 11, 11)
            (layerchans, layerchans, 5, spatial_stride),  # (1, c, 6, 4, 4)
        ][:nlayers]

        encoder_layers = []
        for cin, cout, ksize, step in layer_specs:
            encoder_layers += [nn.Conv3d(cin, cout, kernel_size=ksize, stride=step),
                               nn.ReLU()]
        self.encoder_convs = nn.Sequential(*encoder_layers)

        # The decoder mirrors the encoder: same kernels/strides, reversed
        # order, channels swapped. Every layer, including the last, is
        # followed by a ReLU.
        decoder_layers = []
        for cin, cout, ksize, step in reversed(layer_specs):
            decoder_layers += [nn.ConvTranspose3d(cout, cin, kernel_size=ksize, stride=step),
                               nn.ReLU()]
        self.decoder_convs = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Run ``x`` through the encoder and then the decoder."""
        return self.decoder_convs(self.encoder_convs(x))
        
import numpy as np
import torch
import cv2

class VideoLoader:
    """Iterable, batched frame reader for a video file (``cv2.VideoCapture``).

    Iterating over an instance yields batches of up to ``batch_size`` frames.
    With ``torch=True`` a batch is a float tensor shaped ``(N, 1, H, W)`` for
    grayscale or ``(N, C, H, W)`` for colour; otherwise it is a NumPy array
    shaped ``(N, H, W)`` for grayscale or ``(N, C, H, W)`` for colour.

    Args:
        filename: Path of the video file.
        duration: Seconds of video to use; rounded up to a whole number of
            batches and capped at the file's frame count.
        batch_size: Frames per batch.
        gray: Convert frames from BGR to grayscale.
        scale: Optional ``(width, height)`` every frame is resized to.
        skip_frame: Number of frame ids skipped after each frame that is read.
        randit: Shuffle the frame order (``np.random.shuffle``) on every pass.
        torch: Return torch tensors instead of NumPy arrays.
    """
    def __init__(self, filename, duration=np.inf, batch_size=64, gray=False, scale=None, skip_frame=0, randit=False, torch=True):
        self.filename = filename
        self.gray = gray
        self.batch_size = batch_size
        # The capture is only needed to probe metadata; release it right away.
        cap = cv2.VideoCapture(filename)
        try:
            self.total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            self.fps = round(cap.get(cv2.CAP_PROP_FPS))
            self.width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            self.height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        finally:
            cap.release()
        # Frame counts are integers; the float that np.ceil returns used to
        # leak into np.arange / frame-position arithmetic.
        self.duration_frames = int(min(self.total_frames, np.ceil(duration*self.fps/batch_size)*batch_size))
        # fps is 0 when the file could not be opened; avoid dividing by it.
        self.duration = self.duration_frames/self.fps if self.fps else 0.0
        if scale:
            self.scale = True
            self.original_width  = self.width
            self.original_height = self.height
            self.width, self.height = scale
        else:
            self.scale = False
        self.skip_frame = skip_frame
        self.randit = randit
        self.torch = torch
        self.__cap = None

    def reduce_latent(self, model, trans=True):
        """Run every batch through ``model`` and concatenate the reconstructions.

        With ``trans=True`` each batch goes through
        ``model.inverse_transform(*model.transform(batch))``; otherwise the
        model is called directly and the result detached. Frames are visited
        in order and without skipping; the loader's ``randit`` / ``skip_frame``
        settings are restored afterwards.
        """
        saved_state = (self.randit, self.skip_frame)
        self.randit = self.skip_frame = 0

        reconstructed_frames = []
        try:
            for frames in self:
                # WILL ALWAYS BE TRANSFORM -> INV_TRANSFORM
                if trans:
                    reconstructed = model.inverse_transform(*model.transform(frames))
                    if self.torch:
                        reconstructed = reconstructed.detach()
                else:
                    reconstructed = model(frames).detach()
                reconstructed_frames.append(reconstructed)
        finally:
            # Do not leave the loader silently un-shuffled for later training.
            self.randit, self.skip_frame = saved_state

        if self.torch:
            reconstructed_frames = torch.cat(reconstructed_frames, 0)
        else:
            reconstructed_frames = np.vstack(reconstructed_frames)
        return reconstructed_frames

    def __transformed_frames(self):
        """Yield the first ``duration_frames`` frames, transformed, in file order."""
        cap = cv2.VideoCapture(self.filename)
        try:
            produced = 0
            while produced < self.duration_frames and cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                yield self.frame_transform(frame)
                produced += 1
        finally:
            cap.release()

    def get_all_frames(self):
        """Return the first ``duration_frames`` frames as a single batch."""
        return self.__from_frame_list(list(self.__transformed_frames()))

    def get_random_frames(self, frames_ratio, seed=42):
        """Return a random subset of the first ``duration_frames`` frames.

        ``int(duration_frames * frames_ratio)`` distinct 0-based frame indices
        are drawn after seeding NumPy's global RNG with ``seed``; the selected
        frames are returned in file order.
        """
        nframes = int(self.duration_frames * frames_ratio)
        frames = []
        np.random.seed(seed)
        frame_ids = np.random.choice(np.arange(self.duration_frames),
                                     size=nframes,
                                     replace=False, )
        wanted = set(int(frame_id) for frame_id in frame_ids)

        cap = cv2.VideoCapture(self.filename)
        try:
            # Count frames ourselves: CAP_PROP_POS_FRAMES queried after read()
            # is the index of the *next* frame, which shifted every match by
            # one and made frame 0 unselectable.
            index = 0
            while index < self.duration_frames and cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                if index in wanted:
                    frames.append(self.frame_transform(frame))
                index += 1
        finally:
            cap.release()

        return self.__from_frame_list(frames)


    def frame_transform(self, frame):
        """Apply the configured resize and grayscale conversion to one frame."""
        if self.scale:
            frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
        if self.gray:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

        return frame

    def __from_frame_list(self, frames):
        """Stack a list of frames into one batch (see the class docstring for layouts)."""
        frames = np.array(frames)
        if self.torch:
            # One conversion of the stacked array instead of
            # torch.FloatTensor(list_of_arrays), which is very slow.
            frames = torch.from_numpy(frames).float()
            if self.gray:
                frames = frames.unsqueeze(1)
            else:
                frames = frames.permute((0, 3, 1, 2))
        else:
            if not self.gray:
                frames = np.transpose(frames, axes=(0,3, 1, 2))

        return frames

    def __iter__(self):
        # Drop a capture left open by an iteration that was abandoned early.
        if self.__cap is not None:
            self.__cap.release()
        self.__cap = cv2.VideoCapture(self.filename)
        self.__frame_count = 0
        # Frame ids are 1-based here; they are converted when seeking.
        self.__frame_order = np.arange(1, self.duration_frames+1)
        if self.randit:
            np.random.shuffle(self.__frame_order)
        self.__frame_order = iter(self.__frame_order)
        self.__stop = False
        return self

    def __next__(self):
        if self.__stop:
            raise StopIteration()

        frames = []
        while self.__cap.isOpened():
            try:
                next_frame = next(self.__frame_order)
                # 1-based id -> 0-based capture position.
                self.__cap.set(cv2.CAP_PROP_POS_FRAMES, float(next_frame - 1))
                for _ in range(self.skip_frame):
                    next(self.__frame_order)
            except StopIteration:
                self.__stop = True
                break
            ret, frame = self.__cap.read()

            if ret:
                frames.append(self.frame_transform(frame))
                self.__frame_count += 1
            else:
                self.__stop = True
                break

            if self.__frame_count % self.batch_size == 0:
                break

        if self.__frame_count*(self.skip_frame+1) >= self.duration_frames:
            self.__stop = True
        if not frames:
            self.__stop = True

        if self.__stop:
            # End of this pass: free the capture instead of leaking one per epoch.
            self.__cap.release()
        if not frames:
            # Never hand an empty batch to the caller.
            raise StopIteration()

        return self.__from_frame_list(frames)

    def write(self, filename):
        """Write the (resized / grayscale) first ``duration_frames`` frames to ``filename`` as MP4V."""
        fourcc = cv2.VideoWriter_fourcc(*"MP4V")
        if self.gray:
            writer = cv2.VideoWriter(filename, fourcc, self.fps, (self.width, self.height), 0)
        else:
            writer = cv2.VideoWriter(filename, fourcc, self.fps, (self.width, self.height))

        try:
            for frame in self.__transformed_frames():
                writer.write(frame)
        finally:
            writer.release()
        
import numpy as np
from sklearn.utils.extmath import randomized_svd

class custom_pca():
    """PCA on flattened frames using a randomized SVD.

    Frames are standardised with a single global mean and standard deviation
    before projection; ``inverse_transform`` undoes that scaling.
    """
    def __init__(self, ncomp=10):
        self.ncomp = ncomp

    def fit(self, frames):
        """Estimate the global mean/std and the first ``ncomp`` principal axes."""
        self.mean = np.mean(frames)
        self.std = np.std(frames)
        frames = (frames - self.mean) / self.std
        frames = frames.reshape(frames.shape[0], -1)
        # Left singular vectors of the (features x samples) matrix.
        self.pc, _, _ = randomized_svd(frames.T, self.ncomp)

    def transform(self, frames):
        """Project frames onto the principal axes.

        Returns:
            ``(reduced, shape)``: the ``(n_frames, ncomp)`` coefficients and
            the original per-frame shape, ready to be unpacked into
            ``inverse_transform``.
        """
        shape = frames.shape[1:]
        if len(shape) > 1:
            frames = frames.reshape(frames.shape[0], -1)
        frames = (frames - self.mean) / self.std
        frames_reduced = self.pc.T @ frames.T

        return frames_reduced.T, shape

    def inverse_transform(self, frames, shape=None, cast=True):
        """Reconstruct frames from their coefficients.

        Args:
            frames: ``(n_frames, ncomp)`` coefficients.
            shape: Optional per-frame shape to restore.
            cast: When true, round, clip to [0, 255] and cast to ``uint8``.
        """
        nframes = frames.shape[0]
        frames_reconstructed = (self.pc @ frames.T).T
        frames_reconstructed = (frames_reconstructed * self.std) + self.mean

        if cast:
            # Round before casting: astype(uint8) alone truncates toward zero
            # and biases every pixel downwards (write_video rounds as well).
            frames_reconstructed = np.clip(np.around(frames_reconstructed), 0, 255).astype(np.uint8)
        if shape:
            frames_reconstructed = frames_reconstructed.reshape(nframes, *shape)

        return frames_reconstructed

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import cv2
import torch
import torch.functional as F
import torch.nn as nn
from torch.optim import Adam
from time import time, sleep


seed = 42

### Temporal Convolutional AE

In [None]:
# Sweep over TemporalConvAE depth (1-5 layers) and width (4/8/12 channels).
# For every configuration: (1) pick a learning rate with a short tuning run,
# (2) retrain from the same seed for num_epoch epochs and record the losses.
# randit=True makes the loader shuffle the frame order on every pass.
video = VideoLoader('R25_gray_scaled.mp4', duration=10, gray=True, randit=True)
num_epoch, num_epoch_tune = 30, 10
# Number of batches per pass over the video; used to average the batch losses.
iteration_per_epoch = int(video.duration_frames/video.batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device', device)
if device.type == 'cuda':
  print(torch.cuda.get_device_name(0))

# all_losses maps 'nlayers,layerchans' -> (mean seconds per epoch, per-epoch losses).
all_losses = {}
# Candidate learning rates, keyed by number of layers.
lrs = {1: [5e-4, 0.001, 0.005],
       2: [5e-4, 0.001],
       3: [5e-4, 0.001],
       4: [5e-5, 1e-4, 5e-4],
       5: [1e-5, 5e-5, 1e-4, 5e-4]}
for nlayers in range(1, 6):
  for layerchans in [4, 8, 12]:
    total_time_start = time()
    print(f'Model with {nlayers} layer with {layerchans} channels:')

    # --- learning-rate selection ---
    losses_tune = []
    for lr in lrs[nlayers]:
      # Re-seed so every candidate starts from the same weights and sees the
      # same frame shuffles (the loader shuffles with np.random).
      torch.manual_seed(seed)
      np.random.seed(seed)
      model = TemporalConvAE(1, nlayers, layerchans).to(device)
      optimizer = Adam(model.parameters(), lr=lr)
      for epoch in range(num_epoch_tune):
        epoch_loss = 0
        for frames in video:
          # Treat the whole batch as one clip: batch_size becomes the depth axis.
          frames = frames.view(-1, 1, video.batch_size, video.height, video.width).to(device)
          reconstructed = model(frames)
          loss = crit(frames, reconstructed)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          epoch_loss += loss.item()
      # epoch_loss is reset every epoch, so this is the mean loss of the
      # last tuning epoch only.
      losses_tune.append(epoch_loss / iteration_per_epoch)
    lr = lrs[nlayers][np.argmin(losses_tune)]
    print('Chosen learning rate:', lr)
    # --- full training run with the selected learning rate, same seed ---
    torch.manual_seed(seed)
    np.random.seed(seed)
    model = TemporalConvAE(1, nlayers, layerchans).to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    t1 = time()
    losses = []
    for epoch in range(num_epoch):
      epoch_loss = 0
      for frames in video:
        frames = frames.view(-1, 1, video.batch_size, video.height, video.width).to(device)
        reconstructed = model(frames)
        loss = crit(frames, reconstructed)
                
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
      losses.append(epoch_loss / iteration_per_epoch)
      if (epoch+1) % 3 == 0:
        print(f'\t Error at epoch {epoch+1}:', losses[-1])
    all_losses[f'{nlayers},{layerchans}'] = ((time()-t1)/num_epoch, losses)
    # Includes the tuning runs as well as the final training run.
    print('Total time for model:', sec2string(time()-total_time_start))

print(all_losses)

Using device cuda
Tesla K80
Model with 1 layer with 4 channels:
Chosen learning rate: 0.001
	 Error at epoch 3: 57.36143493652344
	 Error at epoch 6: 49.19762191772461
	 Error at epoch 9: 46.23065719604492
	 Error at epoch 12: 45.10911560058594
	 Error at epoch 15: 44.23296890258789
	 Error at epoch 18: 43.47725601196289
	 Error at epoch 21: 42.667728424072266
	 Error at epoch 24: 42.34994049072266
	 Error at epoch 27: 42.08884201049805
	 Error at epoch 30: 41.75653839111328
Total time for model: 6:35
Model with 1 layer with 8 channels:
Chosen learning rate: 0.001
	 Error at epoch 3: 52.99357299804687
	 Error at epoch 6: 46.6766357421875
	 Error at epoch 9: 45.21807174682617
	 Error at epoch 12: 44.29331970214844
	 Error at epoch 15: 43.36727523803711
	 Error at epoch 18: 42.57135009765625
	 Error at epoch 21: 41.781185913085935
	 Error at epoch 24: 41.45039672851563
	 Error at epoch 27: 41.189398193359374
	 Error at epoch 30: 40.91673202514649
Total time for model: 6:36
Model with 1 l

In [None]:
# Same sweep as the cell above, for wider models (32/64 channels) and a
# different learning-rate grid. NOTE(review): this cell rebinds all_losses,
# so results of the previous sweep are lost unless they were saved first.
video = VideoLoader('R25_gray_scaled.mp4', duration=10, gray=True, randit=True)
num_epoch, num_epoch_tune = 30, 10
# Number of batches per pass over the video; used to average the batch losses.
iteration_per_epoch = int(video.duration_frames/video.batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device', device)
if device.type == 'cuda':
  print(torch.cuda.get_device_name(0))

# all_losses maps 'nlayers,layerchans' -> (mean seconds per epoch, per-epoch losses).
all_losses = {}
# Candidate learning rates, keyed by number of layers.
lrs = {1: [1e-4, 5e-4, 0.001],
       2: [1e-4, 5e-4, 0.001],
       3: [1e-4, 5e-4, 0.001],
       4: [1e-5, 5e-5, 1e-4, 5e-4],
       5: [1e-5, 5e-5, 1e-4, 5e-4]}
for nlayers in range(1, 6):
  for layerchans in [32, 64]:
    total_time_start = time()
    print(f'Model with {nlayers} layer with {layerchans} channels:')

    # --- learning-rate selection ---
    losses_tune = []
    for lr in lrs[nlayers]:
      # Re-seed so every candidate starts from the same weights and sees the
      # same frame shuffles (the loader shuffles with np.random).
      torch.manual_seed(seed)
      np.random.seed(seed)
      model = TemporalConvAE(1, nlayers, layerchans).to(device)
      optimizer = Adam(model.parameters(), lr=lr)
      for epoch in range(num_epoch_tune):
        epoch_loss = 0
        for frames in video:
          # Treat the whole batch as one clip: batch_size becomes the depth axis.
          frames = frames.view(-1, 1, video.batch_size, video.height, video.width).to(device)
          reconstructed = model(frames)
          loss = crit(frames, reconstructed)
          
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
          epoch_loss += loss.item()
      # epoch_loss is reset every epoch, so this is the mean loss of the
      # last tuning epoch only.
      losses_tune.append(epoch_loss / iteration_per_epoch)
    lr = lrs[nlayers][np.argmin(losses_tune)]
    print('Chosen learning rate:', lr)
    # --- full training run with the selected learning rate, same seed ---
    torch.manual_seed(seed)
    np.random.seed(seed)
    model = TemporalConvAE(1, nlayers, layerchans).to(device)
    optimizer = Adam(model.parameters(), lr=lr)

    t1 = time()
    losses = []
    for epoch in range(num_epoch):
      epoch_loss = 0
      for frames in video:
        frames = frames.view(-1, 1, video.batch_size, video.height, video.width).to(device)
        reconstructed = model(frames)
        loss = crit(frames, reconstructed)
                
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
      losses.append(epoch_loss / iteration_per_epoch)
      if (epoch+1) % 3 == 0:
        print(f'\t Error at epoch {epoch+1}:', losses[-1])
    all_losses[f'{nlayers},{layerchans}'] = ((time()-t1)/num_epoch, losses)
    # Includes the tuning runs as well as the final training run.
    print('Total time for model:', sec2string(time()-total_time_start))

print(all_losses)

Using device cuda
Tesla K80
Model with 1 layer with 32 channels:
Chosen learning rate: 0.001
	 Error at epoch 3: 51.05170669555664
	 Error at epoch 6: 46.67074813842773
	 Error at epoch 9: 44.68516845703125
	 Error at epoch 12: 43.73015213012695
	 Error at epoch 15: 42.71629638671875
	 Error at epoch 18: 42.12033309936523
	 Error at epoch 21: 41.48017959594726
	 Error at epoch 24: 41.237037658691406
	 Error at epoch 27: 41.053828430175784
	 Error at epoch 30: 40.80555114746094
Total time for model: 6:43
Model with 1 layer with 64 channels:
Chosen learning rate: 0.0005
	 Error at epoch 3: 52.02037124633789
	 Error at epoch 6: 46.736235809326175
	 Error at epoch 9: 44.19419174194336
	 Error at epoch 12: 42.356974029541014
	 Error at epoch 15: 41.28425521850586
	 Error at epoch 18: 40.733621978759764
	 Error at epoch 21: 40.161346435546875
	 Error at epoch 24: 39.93864364624024
	 Error at epoch 27: 39.90627822875977
	 Error at epoch 30: 39.67598648071289
Total time for model: 6:55
Model w

In [22]:
# Serialise the sweep results to a JSON string so they can be copied out of
# the notebook; the (seconds-per-epoch, losses) tuples become JSON arrays.
import json 
all_losses_json = json.dumps(all_losses)
print(all_losses_json)

{"1,32": [6.71182476679484, [92.63471984863281, 58.182090759277344, 51.05170669555664, 49.26017761230469, 47.57956771850586, 46.67074813842773, 45.80816802978516, 45.33303298950195, 44.68516845703125, 44.414163970947264, 44.296163940429686, 43.73015213012695, 43.24689559936523, 42.91751480102539, 42.71629638671875, 42.5161849975586, 42.321920013427736, 42.12033309936523, 41.90849838256836, 41.80671081542969, 41.48017959594726, 41.455073547363284, 41.45417785644531, 41.237037658691406, 41.100476837158205, 41.06671981811523, 41.053828430175784, 40.90081100463867, 40.82514266967773, 40.80555114746094]], "1,64": [6.918313543001811, [88.46867523193359, 58.880174255371095, 52.02037124633789, 49.59053726196289, 47.85999221801758, 46.736235809326175, 45.758802795410155, 45.037732696533205, 44.19419174194336, 43.53841934204102, 42.87966613769531, 42.356974029541014, 41.88265380859375, 41.52345657348633, 41.28425521850586, 41.10011672973633, 41.025953674316405, 40.733621978759764, 40.51638183593