## CoDi()のインスタンス化

In [1]:
import torch
import torchaudio
import yaml
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
from torch.utils.data import Dataset, DataLoader
from core.models import codi
from core.models.ema import LitEma
from core.models.common.get_optimizer import get_optimizer
from argparse import ArgumentParser
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from core.models.common.get_model import get_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_yaml_config(filepath):
    """Read a YAML file and return its parsed content.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file. Callers in
        this notebook index the result with string keys, i.e. they expect a
        mapping at the top level.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding so
    # the same config file parses identically on every machine.
    with open(filepath, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

class ConfigObject:
    """Expose the top-level entries of a dictionary as instance attributes.

    Every key becomes an attribute bound to the very same value object:
    nothing is copied and nested dictionaries are left as dictionaries.
    """

    def __init__(self, dictionary):
        for attr_name in dictionary:
            attr_value = dictionary[attr_name]
            setattr(self, attr_name, attr_value)

In [3]:
def model_define():
    """Assemble the CoDi configuration from the component YAML files and build the model.

    Reads the AudioLDM, Optimus, CLAP, UNet, CLIP and CoDi YAML files under
    ``configs/model/`` (in that order), wraps the relevant sections in
    ``ConfigObject`` and hands the resulting CoDi config to the factory
    returned by ``get_model()``. The image UNet and the AutoKL autoencoder
    are not wired into the config.

    Returns:
        The object produced by ``get_model()(config)``.
    """
    # AudioLDM autoencoder
    audioldm_spec = ConfigObject(
        load_yaml_config('configs/model/audioldm.yaml')["audioldm_autoencoder"]
    )

    # Optimus VAE: swap the nested config dictionaries for objects
    optimus_yaml = load_yaml_config('configs/model/optimus.yaml')
    vae_args = optimus_yaml['optimus_vae']['args']
    for role, section in (('encoder', 'optimus_bert_encoder'),
                          ('decoder', 'optimus_gpt2_decoder')):
        vae_args[role] = ConfigObject(optimus_yaml[section])
        # The wrapper's ``args`` is the section's own dict, so this rewrites
        # the nested 'config' entry in place.
        vae_args[role].args['config'] = ConfigObject(optimus_yaml[section]['args']['config'])
    for role, section in (('tokenizer_encoder', 'optimus_bert_tokenizer'),
                          ('tokenizer_decoder', 'optimus_gpt2_tokenizer')):
        vae_args[role] = ConfigObject(optimus_yaml[section])
    vae_args['args'] = ConfigObject(vae_args['args'])
    optimus_spec = ConfigObject(optimus_yaml["optimus_vae"])

    # CLAP audio encoder
    clap_spec = ConfigObject(load_yaml_config('configs/model/clap.yaml')["clap_audio"])

    # UNet: only the text and audio branches are attached (no image branch)
    unet_yaml = load_yaml_config('configs/model/openai_unet.yaml')
    codi_unet_args = unet_yaml["openai_unet_codi"]["args"]
    for arg_name, section in (("unet_text_cfg", "openai_unet_0dmd"),
                              ("unet_audio_cfg", "openai_unet_2d_audio")):
        codi_unet_args[arg_name] = ConfigObject(unet_yaml[section])
    unet_spec = ConfigObject(unet_yaml["openai_unet_codi"])

    # CLIP encoder
    clip_spec = ConfigObject(load_yaml_config('configs/model/clip.yaml')["clip_frozen"])

    # Create the CoDi model instance
    codi_yaml = load_yaml_config('configs/model/codi.yaml')
    codi_args = codi_yaml["codi"]["args"]
    for arg_name, spec in (
        ("audioldm_cfg", audioldm_spec),
        ("optimus_cfg", optimus_spec),
        ("clip_cfg", clip_spec),
        ("clap_cfg", clap_spec),
        ("unet_config", unet_spec),
    ):
        codi_args[arg_name] = spec

    return get_model()(ConfigObject(codi_yaml["codi"]))

In [3]:
def model_define():
    """Build a CoDi model by calling ``codi.CoDi`` directly with per-component configs.

    Reads the AudioLDM, Optimus, CLAP, UNet and CLIP YAML files under
    ``configs/model/`` (in that order) and wraps the relevant sections in
    ``ConfigObject``. The UNet config carries image, text and audio branches.
    No AutoKL config is passed to the constructor.

    Returns:
        The ``codi.CoDi`` instance.
    """
    # AudioLDM autoencoder
    audioldm_spec = ConfigObject(
        load_yaml_config('configs/model/audioldm.yaml')["audioldm_autoencoder"]
    )

    # Optimus VAE: swap the nested config dictionaries for objects
    optimus_yaml = load_yaml_config('configs/model/optimus.yaml')
    vae_args = optimus_yaml['optimus_vae']['args']
    for role, section in (('encoder', 'optimus_bert_encoder'),
                          ('decoder', 'optimus_gpt2_decoder')):
        vae_args[role] = ConfigObject(optimus_yaml[section])
        # The wrapper's ``args`` is the section's own dict, so this rewrites
        # the nested 'config' entry in place.
        vae_args[role].args['config'] = ConfigObject(optimus_yaml[section]['args']['config'])
    for role, section in (('tokenizer_encoder', 'optimus_bert_tokenizer'),
                          ('tokenizer_decoder', 'optimus_gpt2_tokenizer')):
        vae_args[role] = ConfigObject(optimus_yaml[section])
    vae_args['args'] = ConfigObject(vae_args['args'])
    optimus_spec = ConfigObject(optimus_yaml["optimus_vae"])

    # CLAP audio encoder
    clap_spec = ConfigObject(load_yaml_config('configs/model/clap.yaml')["clap_audio"])

    # UNet: attach the image, text and audio branch configs
    unet_yaml = load_yaml_config('configs/model/openai_unet.yaml')
    codi_unet_args = unet_yaml["openai_unet_codi"]["args"]
    for arg_name, section in (
        ("unet_image_cfg", "openai_unet_2d"),
        ("unet_text_cfg", "openai_unet_0dmd"),
        ("unet_audio_cfg", "openai_unet_2d_audio"),
    ):
        codi_unet_args[arg_name] = ConfigObject(unet_yaml[section])
    unet_spec = ConfigObject(unet_yaml["openai_unet_codi"])

    # CLIP encoder
    clip_spec = ConfigObject(load_yaml_config('configs/model/clip.yaml')["clip_frozen"])

    # Create the CoDi model instance (the autokl_cfg argument is deliberately omitted)
    return codi.CoDi(
        audioldm_cfg=audioldm_spec,
        optimus_cfg=optimus_spec,
        clip_cfg=clip_spec,
        clap_cfg=clap_spec,
        unet_config=unet_spec,
    )

In [3]:
def model_define(x, c):
    """Build a CoDi model that only carries the components for one modality pair.

    Args:
        x: Modality of the data, ``"audio"`` or ``"text"``.
        c: Modality of the condition, ``"text"`` or ``"audio"``.
            Supported pairs are ``("audio", "text")`` and ``("text", "audio")``.

    Returns:
        The object produced by ``get_model()(config)`` for the selected pair.

    Raises:
        ValueError: If ``(x, c)`` is not one of the supported pairs. The
            previous behaviour was to fall through and return ``None``,
            which only failed later with an unrelated-looking error.
    """
    pair = (x, c)
    if pair not in (("audio", "text"), ("text", "audio")):
        raise ValueError(
            f"Unsupported modality pair (x={x!r}, c={c!r}); "
            "expected ('audio', 'text') or ('text', 'audio')."
        )

    # Entries written into the CoDi config's ``args``, in insertion order.
    codi_overrides = {}

    if pair == ("audio", "text"):
        # AudioLDM autoencoder
        audioldm_cfg = load_yaml_config('configs/model/audioldm.yaml')
        codi_overrides["audioldm_cfg"] = ConfigObject(audioldm_cfg["audioldm_autoencoder"])

        # CLIP encoder
        clip_cfg = load_yaml_config('configs/model/clip.yaml')
        codi_overrides["clip_cfg"] = ConfigObject(clip_cfg["clip_frozen"])

        unet_arg, unet_section = "unet_audio_cfg", "openai_unet_2d_audio"
    else:
        # Optimus VAE: swap the nested config dictionaries for objects
        optimus_cfg = load_yaml_config('configs/model/optimus.yaml')
        vae_args = optimus_cfg['optimus_vae']['args']
        vae_args['encoder'] = ConfigObject(optimus_cfg['optimus_bert_encoder'])
        vae_args['encoder'].args['config'] = ConfigObject(optimus_cfg['optimus_bert_encoder']['args']['config'])
        vae_args['decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder'])
        vae_args['decoder'].args['config'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder']['args']['config'])
        vae_args['tokenizer_encoder'] = ConfigObject(optimus_cfg['optimus_bert_tokenizer'])
        vae_args['tokenizer_decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_tokenizer'])
        vae_args['args'] = ConfigObject(vae_args['args'])
        codi_overrides["optimus_cfg"] = ConfigObject(optimus_cfg["optimus_vae"])

        # CLAP audio encoder
        clap_cfg = load_yaml_config('configs/model/clap.yaml')
        codi_overrides["clap_cfg"] = ConfigObject(clap_cfg["clap_audio"])

        unet_arg, unet_section = "unet_text_cfg", "openai_unet_0dmd"

    # UNet: attach only the branch that matches the data modality.
    unet_cfg = load_yaml_config('configs/model/openai_unet.yaml')
    unet_cfg["openai_unet_codi"]["args"][unet_arg] = ConfigObject(unet_cfg[unet_section])
    codi_overrides["unet_config"] = ConfigObject(unet_cfg["openai_unet_codi"])

    # Create the CoDi model instance. The local is named ``codi_spec`` so it
    # does not shadow the imported ``codi`` module inside this function.
    codi_cfg = load_yaml_config('configs/model/codi.yaml')
    codi_cfg["codi"]["args"].update(codi_overrides)
    codi_spec = ConfigObject(codi_cfg["codi"])

    return get_model()(codi_spec)

In [6]:
# Data modality (x) and condition modality (c) for the model built in this cell.
x, c = "text", "audio"
model = model_define(x, c)

Running in eps mode
Keeping EMAs of 1124.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [7]:
model

CoDi(
  (model): Sequential(
    (diffusion_model): UNetModelCoDi(
      (unet_text): UNetModel0D_MultiDim(
        (time_embed): Sequential(
          (0): Linear(in_features=320, out_features=1280, bias=True)
          (1): SiLU()
          (2): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (connecters_out): ModuleList(
          (0): TimestepEmbedSequential(
            (0): Linear_MultiDim(in_features=768, out_features=640, bias=True)
          )
          (1): TimestepEmbedSequential(
            (0): FCBlock_MultiDim(
              (in_layers): Sequential(
                (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
                (1): SiLU()
                (2): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
              )
              (emb_layers): Sequential(
                (0): SiLU()
                (1): Linear(in_features=1280, out_features=1280, bias=True)
              )
              (out_layers): Sequential(
                (0):

In [6]:
# Print the name of every parameter that still has requires_grad set.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(f"{name} requires gradient")

model.diffusion_model.unet_text.time_embed.0.weight requires gradient
model.diffusion_model.unet_text.time_embed.0.bias requires gradient
model.diffusion_model.unet_text.time_embed.2.weight requires gradient
model.diffusion_model.unet_text.time_embed.2.bias requires gradient
model.diffusion_model.unet_text.connecters_out.0.0.weight requires gradient
model.diffusion_model.unet_text.connecters_out.0.0.bias requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.in_layers.0.weight requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.in_layers.0.bias requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.in_layers.2.weight requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.in_layers.2.bias requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.emb_layers.1.weight requires gradient
model.diffusion_model.unet_text.connecters_out.1.0.emb_layers.1.bias requires gradient
model.diffusion_model.unet_text.connecters_out.1.0

In [6]:
model.parameters()

<generator object Module.parameters at 0x7f49c10c1120>

In [5]:
model

CoDi(
  (model): Sequential(
    (diffusion_model): UNetModelCoDi(
      (unet_text): UNetModel0D_MultiDim(
        (time_embed): Sequential(
          (0): Linear(in_features=320, out_features=1280, bias=True)
          (1): SiLU()
          (2): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (connecters_out): ModuleList(
          (0): TimestepEmbedSequential(
            (0): Linear_MultiDim(in_features=768, out_features=640, bias=True)
          )
          (1): TimestepEmbedSequential(
            (0): FCBlock_MultiDim(
              (in_layers): Sequential(
                (0): GroupNorm32(32, 640, eps=1e-05, affine=True)
                (1): SiLU()
                (2): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))
              )
              (emb_layers): Sequential(
                (0): SiLU()
                (1): Linear(in_features=1280, out_features=1280, bias=True)
              )
              (out_layers): Sequential(
                (0):

In [6]:
# Rebind `model` from the full CoDi wrapper to its inner `diffusion_model` (UNetModelCoDi per the repr).
# NOTE(review): this cell is not idempotent — once it has run, `model` no longer refers to the CoDi
# wrapper, so re-running it (or any later cell that expects the wrapper) presumably fails until the
# model is rebuilt. Consider a separate name for the sub-module.
model = model.model.diffusion_model

In [7]:
model

UNetModelCoDi(
  (unet_image): UNetModel2D(
    (time_embed): Sequential(
      (0): Linear(in_features=320, out_features=1280, bias=True)
      (1): SiLU()
      (2): Linear(in_features=1280, out_features=1280, bias=True)
    )
    (connecters_out): ModuleList(
      (0): TimestepEmbedSequential(
        (0): Conv2d(4, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      )
      (1): TimestepEmbedSequential(
        (0): ModuleList(
          (0): ResBlock(
            (in_layers): Sequential(
              (0): GroupNorm32(32, 160, eps=1e-05, affine=True)
              (1): SiLU()
              (2): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
            )
            (h_upd): Identity()
            (x_upd): Identity()
            (emb_layers): Sequential(
              (0): SiLU()
              (1): Linear(in_features=1280, out_features=320, bias=True)
            )
            (out_layers): Sequential(
              (0): GroupNorm32(32, 320, eps=1e-

In [20]:
# Narrow `model` to the `unet_audio` sub-module (shown as UNetModel2D in the following repr).
# NOTE(review): reuses the name `model` for a different object, so this cell cannot be re-run
# without first rebuilding the parent; it also requires a model that actually has an audio branch.
model = model.unet_audio

In [21]:
# NOTE(review): import placed mid-notebook; it would normally live in the first (imports) cell.
import torch.onnx
# Put the module in evaluation mode; as the last expression of the cell this also echoes its repr.
model.eval()

UNetModel2D(
  (time_embed): Sequential(
    (0): Linear(in_features=192, out_features=768, bias=True)
    (1): SiLU()
    (2): Linear(in_features=768, out_features=768, bias=True)
  )
  (connecters_out): ModuleList(
    (0): TimestepEmbedSequential(
      (0): Conv2d(8, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (1): TimestepEmbedSequential(
      (0): ResBlock(
        (in_layers): Sequential(
          (0): GroupNorm32(32, 96, eps=1e-05, affine=True)
          (1): SiLU()
          (2): Conv2d(96, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (h_upd): Identity()
        (x_upd): Identity()
        (emb_layers): Sequential(
          (0): SiLU()
          (1): Linear(in_features=768, out_features=192, bias=True)
        )
        (out_layers): Sequential(
          (0): GroupNorm32(32, 192, eps=1e-05, affine=True)
          (1): SiLU()
          (2): Dropout(p=0, inplace=False)
          (3): Conv2d(192, 192, kernel_size=(3, 3), strid

In [22]:
# Attempt to export the current `model` to ONNX using a single random tensor as the example input.
# NOTE(review): the recorded output of this cell is
#   AttributeError: 'NoneType' object has no attribute 'device'
# Presumably the module's forward() takes more than the one positional input supplied here and
# one of the missing arguments defaults to None — confirm against the forward signature and pass
# all required inputs as a tuple.
# 8 input channels matches the first Conv2d(8, 96, ...) in the printed repr; the 256x16 spatial
# size is an assumption of this cell — TODO confirm.
dummy_input = torch.randn(1, 8, 256, 16)
output_file = "codi.onnx"
torch.onnx.export(model, dummy_input, output_file, verbose=True, opset_version=11)

AttributeError: 'NoneType' object has no attribute 'device'

model.diffusion_model()を試す

In [7]:
class MusicCaps(Dataset):
    """MusicCaps caption/audio pairs, encoded on the fly by a CoDi-style model.

    Only CSV rows whose ``<audio_dir>/<ytid>.wav`` file exists are kept.
    Each item is a ``(data, condition)`` tuple whose content depends on the
    modality pair ``(x, c)``:

    * ``("audio", "text")``: ``(model.audioldm_encode(waveform), model.clip_encode_text([caption]))``
    * ``("text", "audio")``: ``(model.optimus_encode([caption]), model.clap_encode_audio(waveform))``

    Args:
        csv_file: Path to the MusicCaps CSV; must provide ``ytid`` and ``caption`` columns.
        audio_dir: Directory containing ``<ytid>.wav`` files.
        model: Object providing the encoder methods listed above.
        x: Data modality, ``"audio"`` or ``"text"``.
        c: Condition modality, ``"text"`` or ``"audio"``.
        transform: Stored on the instance; not applied anywhere in this class.

    Raises:
        ValueError: If ``(x, c)`` is not a supported pair (previously items
            silently came back as ``None``).

    NOTE(review): the encoders run inside ``__getitem__``; a recorded run of
    ``dataset[0]`` failed with a CUDA-input / CPU-weight type mismatch, so the
    model presumably has to be moved to the device its encoders use — confirm.
    """

    _SUPPORTED_PAIRS = (("audio", "text"), ("text", "audio"))

    def __init__(self, csv_file, audio_dir, model, x, c, transform=None):
        if (x, c) not in self._SUPPORTED_PAIRS:
            raise ValueError(
                f"Unsupported modality pair (x={x!r}, c={c!r}); "
                f"expected one of {self._SUPPORTED_PAIRS}."
            )
        self.audio_dir = audio_dir
        self.transform = transform
        self.model = model
        self.x = x
        self.c = c

        # Read the CSV and keep only the rows whose audio file is present on disk.
        all_data = pd.read_csv(csv_file)
        self.data = [
            row
            for _, row in all_data.iterrows()
            if os.path.exists(self._audio_path(row['ytid']))
        ]

    def _audio_path(self, ytid):
        """Return the path of the wav file belonging to ``ytid``."""
        return os.path.join(self.audio_dir, f"{ytid}.wav")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        caption = row['caption']  # raw caption text
        # torchaudio.load returns (waveform, sample_rate); only the waveform is used.
        waveform, _sample_rate = torchaudio.load(self._audio_path(row['ytid']))

        if self.x == "audio" and self.c == "text":
            mel_latent = self.model.audioldm_encode(waveform).detach()
            text_emb = self.model.clip_encode_text([caption]).detach()
            return mel_latent, text_emb  # data, condition
        if self.x == "text" and self.c == "audio":
            text_latent = self.model.optimus_encode([caption]).detach()
            audio_emb = self.model.clap_encode_audio(waveform).detach()
            return text_latent, audio_emb  # data, condition

        # Only reachable if x / c were changed after construction.
        raise ValueError(f"Unsupported modality pair (x={self.x!r}, c={self.c!r}).")

# Modality pair used by the dataset: data = audio, condition = text.
# NOTE(review): `model` must provide the encoders for this pair (audioldm_encode /
# clip_encode_text); confirm it was built for the same (x, c).
x = "audio"
c = "text"

# Root directory of the MusicCaps data. Defaults to the original location and can be
# overridden with the MUSICCAPS_ROOT environment variable instead of editing this cell.
MUSICCAPS_ROOT = os.environ.get("MUSICCAPS_ROOT", "/raid/m236866/md-mt/datasets/musiccaps")

# Dataset
dataset = MusicCaps(
    csv_file=os.path.join(MUSICCAPS_ROOT, "musiccaps-public.csv"),
    audio_dir=os.path.join(MUSICCAPS_ROOT, "musiccaps_30"),
    model=model,
    x=x,
    c=c,
)

In [8]:
dataset[0]

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
# Scratch cell (no execution count): draw random integer timesteps in [0, model.num_timesteps)
# on the device of x[0], then call the inner diffusion model directly.
# NOTE(review): `x_noisy`, `cond`, `xtype` and `ctype` are not defined in any visible earlier
# cell, and `x` is indexed like a sequence of tensors although earlier cells bind it to a
# modality string — confirm the intended inputs before running.
t = torch.randint(0, model.num_timesteps, (x[0].shape[0],), device=x[0].device).long()


model.model.diffusion_model(x_noisy, t, cond, xtype, ctype, u=None, return_algined_latents=None)

In [40]:
# Encode two example strings with the Optimus encoder.
# NOTE(review): `codi` is used as a model instance here, but the only visible binding of that
# name is the imported module `core.models.codi`; presumably it was rebound in a cell that is
# no longer part of the notebook — confirm and define it explicitly.
x = codi.optimus_encode(["hello", "world"])

In [20]:
x[0].shape[0]

768

In [25]:
# Draw random integer timesteps in [0, codi.num_timesteps) on the same device as x[0].
# NOTE(review): the recorded value of `x[0].shape[0]` is 768, so `t` gets 768 entries here.
# If one timestep per encoded string was intended (two strings were encoded), the size should
# presumably come from the batch dimension instead (or `x` should be a list of tensors) — confirm.
t=torch.randint(0, codi.num_timesteps, (x[0].shape[0],), device=x[0].device).long()

In [27]:
# Draw standard-normal noise shaped like each entry of x, then pass every (entry, noise)
# pair through codi.q_sample at timesteps t.
noise = [torch.randn_like(sample) for sample in x]
x_noisy = [
    codi.q_sample(x_start=sample, t=t, noise=eps)
    for sample, eps in zip(x, noise)
]

In [28]:
noise

[tensor([-0.1894,  1.3658, -0.6403,  0.9748,  0.0898, -0.7792,  0.0979, -0.9063,
          0.5764,  0.7412,  0.6682,  0.1002, -0.3077, -0.0954,  0.5905,  0.7162,
          0.2882, -1.3121,  0.4856, -1.1572, -0.3025, -0.7747,  0.3022, -0.0156,
          0.3011,  1.2997, -0.1633, -0.9710, -0.2664, -0.4748,  0.2857,  0.1940,
          2.2678,  0.9325, -0.4579,  0.5283, -1.6967, -1.0400, -0.8210, -0.2837,
         -0.6577,  0.2362,  0.2527,  0.6037,  1.2061, -1.7121, -0.4699, -0.1443,
         -1.2653,  0.2267, -0.3568,  1.0388, -0.1290, -1.0382,  1.4018, -0.4542,
          0.0713,  0.1117, -0.7701,  0.2701, -0.4876,  0.2708,  0.1550,  0.7127,
         -2.4099,  0.4508, -1.1212,  0.0443, -2.3184, -0.7415,  0.6727, -0.4572,
         -0.0061,  0.8455, -0.6623,  0.9933,  0.1541,  1.7224, -1.2723, -1.0646,
         -1.1300, -0.6050, -0.4717,  1.8231,  2.0241, -0.6310, -0.2439, -1.1119,
          1.2734,  1.1327, -0.6193, -0.7319, -0.1388, -1.3069,  0.3155,  0.2693,
         -0.5731, -0.0317, -

In [39]:
x_noisy[0]

tensor([-5.9759e-01,  4.8539e-01, -4.3383e-01, -1.0996e+00, -1.2842e+00,
        -9.9645e-02, -1.4485e-01, -9.1959e-01,  1.4854e+00,  6.1063e-02,
         2.2979e-01,  7.7766e-01, -1.7807e+00,  1.7779e-01,  5.9066e-01,
         8.7209e-01,  3.8609e-01, -1.0613e-01, -1.0414e+00, -1.3431e+00,
        -5.3162e-01,  1.2121e+00,  3.4132e-01, -3.5433e-02,  3.3175e-01,
         1.2822e+00, -3.4417e-01, -1.0745e+00, -9.6884e-01, -4.3711e-01,
         2.7104e-01,  5.8628e-02,  9.9526e-01, -7.1162e-01, -1.0696e+00,
         6.6117e-01, -2.6675e+00,  3.4806e-01, -7.3826e-01,  2.3365e-01,
        -8.7253e-01,  1.0892e-01,  3.8956e-01,  1.8418e+00,  1.8368e+00,
        -3.7254e-02, -1.1275e+00, -1.6875e+00, -1.4025e+00,  1.6329e-01,
        -7.7944e-01,  1.0901e+00, -2.9199e-01, -1.4461e+00,  1.3973e+00,
        -2.1514e-01,  8.1381e-02,  1.3968e-01, -8.5623e-01, -8.1547e-01,
         3.2948e-01,  7.0164e-01,  5.0213e-01,  1.0088e-01,  9.5237e-01,
        -9.0351e-01, -1.2219e+00, -4.4605e-01, -1.2

In [35]:
codi.optimus_decode(x_noisy)

TypeError: can't multiply sequence by non-int of type 'float'

In [43]:
codi.forward(x, xtype="text")

TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

In [50]:
# NOTE(review): imports placed mid-notebook; they would normally live in the first (imports) cell.
import torch.nn as nn
from collections import OrderedDict
# Build a model from `unet` via the get_model() factory and expose it under the attribute name
# `diffusion_model` inside an nn.Sequential.
# NOTE(review): in the visible cells `unet` is only ever a local variable inside model_define,
# so this relies on a global defined elsewhere — confirm.
a = nn.Sequential(OrderedDict([('diffusion_model', get_model()(unet))]))

In [56]:
a.diffusion_model(x_noisy, t, xtype='text', condition=None, condition_types="text")

TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

In [8]:
codi.optimus_encode(["hello", "world"])

tensor([[-0.0339,  0.4901, -1.3377,  ...,  1.1044,  1.0870,  1.0599],
        [ 0.1665, -0.2265, -1.1762,  ...,  1.3432,  1.3332,  0.4923]])

In [10]:
codi.optimus_decode(codi.optimus_encode(["hello", "world"]))

['countering Hebrewparamlez surreal neo khicultural Divineashi Youtube Bol receives +# BavEnough Nature Percentadderightervationsafe wore disseminationtrathy Though',
 'influential relief touting Therefore nationstesy Mour final unrel planting CharlestoniversityHOW drumsIrarserRand sustain riggedVDstriUM Currentributed resignation cohort Bruction']