In [11]:
import os

from pathlib import Path

from src.utils.io_utils import ROOT_PATH

# Resolve the mouth-crop folder against the project root rather than a
# machine-specific absolute path, so the cell works on any checkout.
folder_path = ROOT_PATH / Path("data/dla_dataset/mouths")

# Names of the regular files directly inside the folder (non-recursive).
# os.scandir exposes the entry type with the listing, which usually avoids a
# separate stat call per file.
with os.scandir(folder_path) as entries:
    mouths = [entry.name for entry in entries if entry.is_file()]
print(f"Number of mouths: {len(mouths)}")

Number of mouths: 19434


In [12]:
from pathlib import Path
from src.utils.io_utils import ROOT_PATH


# Dataset root, resolved against the project root.
# NOTE(review): `dir` shadows the builtin of the same name; the name is kept
# because other cells may still reference it.
dir = ROOT_PATH / Path("data/dla_dataset")
print(dir)

# Mixture folders of every split.
folders_to_search = [
    dir / "audio/train/mix",
    dir / "audio/test/mix",
    dir / "audio/val/mix"
]

# Every regular file found (recursively) below the mixture folders.
mix_paths = set()
for folder in folders_to_search:
    mix_paths.update(file for file in folder.glob('**/*') if file.is_file())

print(f"Total number of files found: {len(mix_paths)}")

# Keep only the file names. Path.name handles the platform's separator, unlike
# splitting on a hard-coded backslash, which only works on Windows.
all_files = [mix_path.name for mix_path in mix_paths]

# A mixture file name is read as "<id>_<id>.<ext>": both ids are collected.
# Path.stem drops the extension whatever its length.
all_ids = set()
for file in all_files:
    id_parts = Path(file).stem.split('_')
    all_ids.add(id_parts[0])
    all_ids.add(id_parts[1])
all_ids = list(all_ids)

# Every id must have a matching "<id>.npz" entry in `mouths` (built by the
# cell that lists the mouths folder). A set makes each lookup O(1).
mouth_names = set(mouths)
missing_ids = sorted(i for i in all_ids if i + '.npz' not in mouth_names)
assert not missing_ids, (
    f"{len(missing_ids)} ids have no mouth file, e.g. {missing_ids[:5]}"
)

print("Total number of ids:", len(all_ids))

D:\MyDesktop\Audio\DLA-AVSS\data\dla_dataset
Total number of files found: 28000
Total number of ids: 19434


python train.py -cn=conv-tasnet-baseline trainer.override=False trainer.device_ids=[0] trainer.resume_from="model_best.pth" trainer.n_epochs=40

In [2]:
import numpy as np
import cv2
import os


# Folder holding the mouth archives and the index of the sample to preview.
path = 'data/dla_dataset/mouths/'
sample_index = 10000

# os.listdir returns entries in arbitrary order; sorting makes the same index
# select the same file on every run.
files = sorted(os.listdir(path))

# np.load on an .npz keeps a file handle open; the context manager releases it
# even if reading the array fails.
with np.load(os.path.join(path, files[sample_index])) as npz_file:
    frames = npz_file['data']

print("Shape of frames array:", frames.shape)

# Show the frames one after another, waiting 100 ms per frame; pressing 'q'
# stops early. The window is destroyed even if displaying a frame raises.
try:
    for frame in frames:
        frame = frame.astype(np.uint8)
        cv2.imshow('Video', frame)
        if cv2.waitKey(100) & 0xFF == ord('q'):
            break
finally:
    cv2.destroyAllWindows()

Shape of frames array: (50, 96, 96)


In [4]:
# Folder holding the visual-embedding archives.
path = 'data/dla_dataset/visual_embeddings/'

# os.listdir returns entries in arbitrary order; sorting makes index 0 select
# the same file on every run.
files = sorted(os.listdir(path))

# np.load on an .npz keeps a file handle open; the context manager closes it
# once the array has been read.
with np.load(os.path.join(path, files[0])) as npz_file:
    frames = npz_file['embeddings']

print("Shape of frames array:", frames.shape)

print(frames)

Shape of frames array: (50, 512)
[[-0.11330941 -0.11828102 -0.10756298 ...  0.14136718 -0.09221777
   0.00228598]
 [-0.08189496 -0.05640006 -0.09972193 ... -0.09749741  0.03205474
   0.08040293]
 [-0.10114203 -0.02449729 -0.09068808 ... -0.05603265 -0.05672693
   0.09957554]
 ...
 [-0.10003147 -0.06300171 -0.01544812 ... -0.01060615 -0.06721249
   0.00345985]
 [-0.12066182 -0.08771992 -0.04595585 ... -0.06440631 -0.10304234
   0.0956558 ]
 [-0.10181612 -0.00678791 -0.02959658 ...  0.03661477 -0.0785524
   0.0563571 ]]


In [13]:
# Conv-TasNet hyperparameters. The single-letter symbols come from the
# original annotations (presumably the Conv-TasNet paper's notation — confirm).

# N
num_filters = 512

# L, measured in time steps: 32 / 16000 = 2 ms
filter_length = 32

# B
bottleneck_channels = 128

# H
conv_num_channels = 512

# P
conv_kernel_size = 3

# X
num_conv_blocks = 8

# R
num_tcn_blocks = 3

In [194]:
from src.model.conv_tasnet import ConvTasNetModel

def calc_params(model):
    """Print the total element count over all parameters of ``model``.

    Args:
        model: Object whose ``parameters()`` yields items exposing ``numel()``.

    Returns:
        None. The count is only printed.
    """
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()

    print(f"Total number of parameters: {param_count}")


# Reference configuration, kept for comparison only. It is deliberately not
# instantiated: the model built from it used to be overwritten right away, so
# constructing it was wasted work. The "up"/"down" markers are the original
# tuning notes.
reference_config = dict(
    num_filters=512,  # up
    filter_length=16,  # down
    bottleneck_channels=128,  # up
    conv_num_channels=512,
    conv_kernel_size=3,
    num_conv_blocks=8,  # up
    num_tcn_blocks=3,  # up
)

# Configuration whose parameter count is reported below.
model_config = dict(
    num_filters=512,
    filter_length=16,
    bottleneck_channels=64,
    conv_num_channels=128,
    conv_kernel_size=3,
    num_conv_blocks=8,
    num_tcn_blocks=40,
)

model = ConvTasNetModel(**model_config)

calc_params(model)

Total number of parameters: 8391041


In [None]:
# Configuration of the audio-visual model. Each `_target_` names the class to
# build and its sibling keys mirror that class's keyword arguments
# (presumably consumed by Hydra-style instantiation — confirm against train.py).
# NOTE(review): this cell contains YAML, not Python, so it cannot be run in a
# Python kernel as-is.
_target_: src.model.AudioVisualModel
# Audio model settings.
audio_model:
  _target_: src.model.ConvTasNetModel
  num_filters: 512           # N
  filter_length: 16          # L (in time steps; 16/8000 = 2ms)
  bottleneck_channels: 128   # B
  conv_num_channels: 512     # H
  conv_kernel_size: 3        # P
  num_conv_blocks: 8         # X
  num_tcn_blocks: 3          # R
# Visual model settings.
visual_model:
  _target_: src.model.visual_model.ConvTasNetVisualModel
  embedding_size: 512
  num_layers: 5
# Temporal convolutional network passed as `pre_audio_encoder`.
pre_audio_encoder:
  _target_: src.model.utils.TemporalConvNet
  in_channels: 512
  out_channels: 512
  bottleneck_channels: 128
  hidden_channels: 512
  num_conv_blocks: 8
  num_tcn_blocks: 3
  kernel_size: 3


In [None]:
from src.model import AudioVisualModel, ConvTasNetModel
from src.model.visual_model import ConvTasNetVisualModel
from src.model.utils import TemporalConvNet

# Keyword arguments for the ConvTasNetModel used as the audio model.
audio_kwargs = dict(
    num_filters=512,
    filter_length=16,
    bottleneck_channels=256,
    conv_num_channels=1024,
    conv_kernel_size=3,
    num_conv_blocks=10,
    num_tcn_blocks=2,
)
audio_model = ConvTasNetModel(**audio_kwargs)

# Visual model, built with its two settings inline.
visual_model = ConvTasNetVisualModel(embedding_size=512, num_layers=5)

# Keyword arguments for the TemporalConvNet passed as the pre-audio encoder.
encoder_kwargs = dict(
    in_channels=512,
    out_channels=512,
    bottleneck_channels=256,
    hidden_channels=1024,
    num_conv_blocks=10,
    num_tcn_blocks=1,
    kernel_size=3,
)
pre_audio_encoder = TemporalConvNet(**encoder_kwargs)

# Combine the three parts (built in the same order as before) and report the
# combined parameter count.
model = AudioVisualModel(audio_model, visual_model, pre_audio_encoder)

calc_params(model)

Total number of parameters: 25362976
