In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import yaml
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from core.models import codi
from core.models.common.get_model import get_model
from core.models.ema import LitEma
from core.models.common.get_optimizer import get_optimizer
from IPython.display import Audio
from torchaudio.transforms import MelSpectrogram
#import librosa

def load_yaml_config(filepath):
    """Read the YAML file at ``filepath`` and return the parsed content.

    Args:
        filepath: Path of the YAML file to open in text mode.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(filepath, 'r') as stream:
        parsed = yaml.safe_load(stream)
    return parsed

class ConfigObject(object):
    """Attribute-style view of a dictionary.

    Every top-level key of ``dictionary`` becomes an attribute holding the
    corresponding value. Values are stored as-is (no copy, no recursion), so
    nested dictionaries stay plain dictionaries and remain shared with the
    source dictionary.
    """

    def __init__(self, dictionary):
        for name, value in dictionary.items():
            setattr(self, name, value)

def collate_fn(batch):
    """Collate ``(text, audio)`` pairs into two stacked tensors.

    Each audio tensor is zero-padded at the end of its last dimension so that
    its size along dimension 1 matches the longest audio in the batch.
    (Assumes audio tensors are 2-D, e.g. (channels, frames) - TODO confirm.)

    Args:
        batch: Iterable of ``(text_tensor, audio_tensor)`` pairs.

    Returns:
        Tuple ``(texts, audios_padded)`` of stacked tensors.
    """
    texts, audios = zip(*batch)

    # Find the longest audio in the batch.
    longest = max(clip.shape[1] for clip in audios)

    # Right-pad every clip up to that length.
    padded_clips = []
    for clip in audios:
        missing = longest - clip.shape[1]
        padded_clips.append(torch.nn.functional.pad(clip, (0, missing)))

    audio_batch = torch.stack(padded_clips)
    text_batch = torch.stack(texts)
    return text_batch, audio_batch

# Audio visualization ======================================
def plot_waveform(waveform, sample_rate, title="Waveform", xlim=None, ylim=None):
    """Plot every channel of a waveform against time, one subplot per channel.

    Args:
        waveform: 2-D tensor unpacked as (num_channels, num_frames); it is
            converted with ``.numpy()``.
        sample_rate: Divisor turning frame indices into the x-axis values.
        title: Figure-level title.
        xlim: Optional x-axis limits applied to every subplot when truthy.
        ylim: Optional y-axis limits applied to every subplot when truthy.
    """
    samples = waveform.numpy()
    num_channels, num_frames = samples.shape
    time_axis = torch.arange(0, num_frames) / sample_rate

    figure, axes = plt.subplots(num_channels, 1)
    # With a single row the code receives one Axes object, so wrap it in a list.
    axes_list = [axes] if num_channels == 1 else axes

    for channel in range(num_channels):
        ax = axes_list[channel]
        ax.plot(time_axis, samples[channel], linewidth=1)
        ax.grid(True)
        if num_channels > 1:
            ax.set_ylabel(f'Channel {channel+1}')
        if xlim:
            ax.set_xlim(xlim)
        if ylim:
            ax.set_ylim(ylim)

    figure.suptitle(title)
    plt.show(block=False)

def plot_spectrogram(spec, title=None, ylabel='freq_bin', aspect='auto', xmax=None):
    """Display a spectrogram as an image on a decibel scale.

    Args:
        spec: 2-D array of spectrogram values; converted with
            ``10 * log10(spec + eps)`` before plotting.
        title: Axes title; falls back to ``'Spectrogram (db)'`` when falsy.
        ylabel: Label of the y axis.
        aspect: Aspect setting forwarded to ``imshow``.
        xmax: Optional upper x-axis limit (lower limit is 0) when truthy.
    """
    # Convert the power spectrum to decibels; eps keeps log10 away from zero.
    tiny = np.finfo(float).eps
    spec_db = 10 * np.log10(spec + tiny)

    fig, ax = plt.subplots(1, 1)
    ax.set_title(title if title else 'Spectrogram (db)')
    ax.set_ylabel(ylabel)
    ax.set_xlabel('frame')
    image = ax.imshow(spec_db, origin='lower', aspect=aspect, cmap='viridis')

    if xmax:
        ax.set_xlim((0, xmax))

    fig.colorbar(image, ax=ax)
    plt.show(block=False)


# Sample rate used by the plotting/playback cells and by the mel transform below.
sample_rate = 48000

# Mel spectrogram transform ======================================
n_fft = 1024
win_length = None  # torchaudio falls back to n_fft when win_length is None
hop_length = 512
n_mels = 128

mel_spectrogram = MelSpectrogram(
    # Reuse the notebook-level constant instead of repeating the literal, so the
    # transform cannot silently drift from the rate used for plotting/playback.
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    power=2.0,
    norm='slaney',
    onesided=True,
    n_mels=n_mels,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Model definition =============================================
# Each sub-model config is read from YAML and wrapped in ConfigObject so it can
# be accessed by attribute. ConfigObject stores the very same dict objects it is
# given, so the in-place assignments below also mutate the loaded YAML dicts;
# the statement order therefore matters.

# AudioLDM
audioldm_cfg = load_yaml_config('configs/model/audioldm.yaml')
audioldm = ConfigObject(audioldm_cfg["audioldm_autoencoder"])

# Optimus
optimus_cfg = load_yaml_config('configs/model/optimus.yaml')

# Replace the dict entries under optimus_vae's args with ConfigObject instances
optimus_cfg['optimus_vae']['args']['encoder'] = ConfigObject(optimus_cfg['optimus_bert_encoder'])
optimus_cfg['optimus_vae']['args']['encoder'].args['config'] = ConfigObject(optimus_cfg['optimus_bert_encoder']['args']['config'])
optimus_cfg['optimus_vae']['args']['decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder'])
optimus_cfg['optimus_vae']['args']['decoder'].args['config'] = ConfigObject(optimus_cfg['optimus_gpt2_decoder']['args']['config'])
optimus_cfg['optimus_vae']['args']['tokenizer_encoder'] = ConfigObject(optimus_cfg['optimus_bert_tokenizer'])
optimus_cfg['optimus_vae']['args']['tokenizer_decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_tokenizer'])
optimus_cfg['optimus_vae']['args']['args'] = ConfigObject(optimus_cfg['optimus_vae']['args']['args'])
optimus = ConfigObject(optimus_cfg["optimus_vae"])

# CLAP
clap_cfg = load_yaml_config('configs/model/clap.yaml')
clap = ConfigObject(clap_cfg["clap_audio"])

# CoDi
# The image/text/audio UNet configs are injected into the CoDi UNet's args
# before the combined config itself is wrapped.
unet_cfg = load_yaml_config('configs/model/openai_unet.yaml')
unet_cfg["openai_unet_codi"]["args"]["unet_image_cfg"] = ConfigObject(unet_cfg["openai_unet_2d"])
unet_cfg["openai_unet_codi"]["args"]["unet_text_cfg"] = ConfigObject(unet_cfg["openai_unet_0dmd"])
unet_cfg["openai_unet_codi"]["args"]["unet_audio_cfg"] = ConfigObject(unet_cfg["openai_unet_2d_audio"])
unet = ConfigObject(unet_cfg["openai_unet_codi"])

# CLIP
clip_cfg = load_yaml_config('configs/model/clip.yaml')
clip = ConfigObject(clip_cfg["clip_frozen"])

# Create the CoDi model instance
model = codi.CoDi(audioldm_cfg=audioldm, optimus_cfg=optimus, clip_cfg=clip, clap_cfg=clap, unet_config=unet)
# After this wrap the CoDi instance itself is reached through `model.module`.
model = torch.nn.DataParallel(model)
torch.backends.cudnn.benchmark = True


#######################
# Running in eps mode #
#######################



Keeping EMAs of 3368.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [3]:
class MusicCapsTTM(Dataset):
    """Dataset pairing caption embeddings with waveforms loaded from .wav files.

    Rows come from ``csv_file``; a row is kept only if ``<audio_dir>/<ytid>.wav``
    exists. ``__getitem__`` depends on a module-level ``model`` that exposes
    ``model.module.clip_encode_text``.

    Args:
        csv_file: CSV with at least ``ytid`` and ``caption`` columns.
        audio_dir: Directory containing ``<ytid>.wav`` files.
        transform: Stored on the instance; currently not applied to anything.
    """

    def __init__(self, csv_file, audio_dir, transform=None):
        self.audio_dir = audio_dir
        self.transform = transform

        # Read the CSV, then keep only the rows whose audio file exists on disk.
        table = pd.read_csv(csv_file)
        self.data = [
            record
            for _, record in table.iterrows()
            if os.path.exists(self._wav_path(record))
        ]

    def _wav_path(self, record):
        """Return the expected .wav path for a CSV row."""
        return os.path.join(self.audio_dir, f"{record['ytid']}.wav")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]

        # Embed the raw caption text through the global model.
        text_emb = model.module.clip_encode_text([record['caption']])

        # torchaudio.load returns a tuple; its first element (the waveform
        # tensor) is what gets returned below.
        loaded = torchaudio.load(self._wav_path(record))

        if self.transform:
            pass  # placeholder: the transform is accepted but never applied

        return text_emb, loaded[0]

In [4]:
# データセット
# Dataset and loader; batches are assembled by collate_fn in CSV order (no shuffling).
dataset = MusicCapsTTM(
    csv_file='/raid/m236866/md-mt/datasets/musiccaps/musiccaps-public.csv',
    audio_dir='/raid/m236866/md-mt/datasets/musiccaps/musiccaps_30',
)
dataloader = DataLoader(dataset, batch_size=5, shuffle=False, collate_fn=collate_fn)

# EMA helper and optimizer, both built from the (DataParallel-wrapped) model.
ema = LitEma(model)
optimizer_config = ConfigObject({
    'type': 'adam',
    'args': {'weight_decay': 1e-4},  # Weight decay
})
optimizer = get_optimizer()(model, optimizer_config)

In [None]:
# dataset[i] is (text_emb, waveform): the waveform is element 1, not element 0.
plot_waveform(dataset[0][1], sample_rate)

In [16]:
# dataset[i] is (text_emb, waveform): play the first channel of the waveform (element 1).
Audio(dataset[21][1].numpy()[0], rate=sample_rate)

In [None]:
# dataset[i] is (text_emb, waveform): the mel transform needs the waveform (element 1).
waveform = dataset[0][1]
melspec = mel_spectrogram(waveform)
# plot_spectrogram's second positional parameter is `title`, so the sample rate
# must not be passed there; an explicit title is given instead.
plot_spectrogram(
    melspec[0], title="Mel spectrogram (db)")
# NOTE(review): `transforms` is bound to the torchvision `transforms` module by
# the import cell, and a module is not callable, so this call raises TypeError.
# The intended audio transform is not visible in this notebook - replace
# `transforms` with it before running this cell.
plot_spectrogram(
  transforms(waveform, "audio")[0], title="Transformed audio (db)")

In [34]:
# Load the MusicCaps CSV into a DataFrame for manual inspection.
data = pd.read_csv("/raid/m236866/md-mt/datasets/musiccaps/musiccaps-public.csv")

In [39]:
# Display the caption of the first CSV row.
data.iloc[0]["caption"]

'The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft female vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services.'

In [6]:
# Encode one sample caption with the model's CLIP text encoder; `model` is
# DataParallel-wrapped here, hence the `.module` access.
text_emb = model.module.clip_encode_text(["The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft female vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services."])

In [43]:
# Number of batches vs. number of samples.
len(dataloader), len(dataset)

(6, 30)

In [50]:
# Size of the innermost axis of the text embedding.
len(text_emb[0][0])

768

In [7]:
# Print the first vector of the embedding for a visual check.
print(text_emb[0][0])

tensor([ 2.8665e-02, -1.1188e-02,  5.9989e-02,  2.9611e-02, -2.6977e-02,
         4.7998e-02, -5.4190e-02,  5.0832e-02,  1.0838e-02, -4.1030e-04,
         1.9846e-02, -3.3377e-02, -3.6811e-02, -2.8646e-02,  2.9786e-02,
        -1.5822e-02, -1.4201e-02, -8.7201e-03,  5.0961e-02, -7.1554e-02,
         1.8545e-02, -1.5637e-02, -3.7203e-02,  3.7752e-02,  6.2007e-03,
        -1.7274e-02,  1.7727e-02, -2.6722e-02, -4.0949e-03, -2.3247e-02,
         7.8327e-02,  1.6869e-02, -3.7089e-02,  1.3368e-02,  2.3273e-03,
        -2.6737e-03, -2.5423e-02, -2.9711e-02, -3.6138e-03,  3.1929e-02,
        -3.8417e-02, -9.3629e-03,  2.4093e-02, -4.0722e-02, -4.1540e-02,
         8.9454e-03,  9.7585e-04,  3.6860e-02, -5.3453e-02,  3.9414e-02,
         7.1847e-03,  7.3055e-02,  4.9355e-02, -7.5213e-03, -4.2560e-02,
         1.1809e-02, -1.7083e-02, -2.7688e-02, -4.1433e-02,  1.8738e-02,
        -4.6937e-02,  4.6957e-02,  4.3433e-02, -2.7643e-02,  4.3645e-02,
        -5.3426e-02, -4.4518e-02, -1.7977e-02,  1.8

In [5]:
# Iterate over the whole loader and print the first text embedding of each batch
# (sanity check of the dataset/collate pipeline; audios are not inspected here).
for batch_idx, (texts, audios) in enumerate(dataloader):
  print(texts[0])

tensor([[[ 2.8665e-02, -1.1188e-02,  5.9989e-02,  2.9611e-02, -2.6977e-02,
           4.7998e-02, -5.4190e-02,  5.0832e-02,  1.0838e-02, -4.1030e-04,
           1.9846e-02, -3.3377e-02, -3.6811e-02, -2.8646e-02,  2.9786e-02,
          -1.5822e-02, -1.4201e-02, -8.7201e-03,  5.0961e-02, -7.1554e-02,
           1.8545e-02, -1.5637e-02, -3.7203e-02,  3.7752e-02,  6.2007e-03,
          -1.7274e-02,  1.7727e-02, -2.6722e-02, -4.0949e-03, -2.3247e-02,
           7.8327e-02,  1.6869e-02, -3.7089e-02,  1.3368e-02,  2.3273e-03,
          -2.6737e-03, -2.5423e-02, -2.9711e-02, -3.6138e-03,  3.1929e-02,
          -3.8417e-02, -9.3629e-03,  2.4093e-02, -4.0722e-02, -4.1540e-02,
           8.9454e-03,  9.7585e-04,  3.6860e-02, -5.3453e-02,  3.9414e-02,
           7.1847e-03,  7.3055e-02,  4.9355e-02, -7.5213e-03, -4.2560e-02,
           1.1809e-02, -1.7083e-02, -2.7688e-02, -4.1433e-02,  1.8738e-02,
          -4.6937e-02,  4.6957e-02,  4.3433e-02, -2.7643e-02,  4.3645e-02,
          -5.3426e-02, -4

In [None]:
### Training loop =============================================
num_epochs = 2
for epoch in range(num_epochs):
    for batch_idx, (texts, audios) in enumerate(dataloader):
        # Feed the batch to the model, compute the loss, and let the optimizer
        # update the model weights.
        optimizer.zero_grad()
        # Call the module instance rather than `.forward` directly so that
        # registered hooks run as part of the forward pass.
        loss = model(x=audios, c=texts)  # loss computation
        loss.backward()
        optimizer.step()

        # Update the EMA.
        # NOTE(review): LitEma's API is not visible in this notebook - confirm
        # that it exposes `update(parameters)`.
        ema.update(model.parameters())

        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')

## 分散学習

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributed as dist
import torchaudio
import yaml
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from core.models import codi
from core.models.common.get_model import get_model
from core.models.ema import LitEma
from core.models.common.get_optimizer import get_optimizer
from torchaudio.transforms import MelSpectrogram
from torch.multiprocessing import spawn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
### Model definition ===============================================================

def model_define():
    """Build a CoDi model from the YAML files under ``configs/model/``.

    Each sub-model section is wrapped in ``ConfigObject``. Nested sections that
    the Optimus VAE and the CoDi UNet refer to are wrapped and written back into
    the loaded dictionaries before the parent section itself is wrapped.

    Returns:
        The ``codi.CoDi`` instance, not wrapped in any parallel container.
    """
    # AudioLDM
    audioldm = ConfigObject(
        load_yaml_config('configs/model/audioldm.yaml')["audioldm_autoencoder"])

    # Optimus: swap the dict entries of optimus_vae's args for ConfigObjects.
    optimus_cfg = load_yaml_config('configs/model/optimus.yaml')
    vae_args = optimus_cfg['optimus_vae']['args']
    for role, section in (('encoder', 'optimus_bert_encoder'),
                          ('decoder', 'optimus_gpt2_decoder')):
        vae_args[role] = ConfigObject(optimus_cfg[section])
        # `.args` is the same dict as optimus_cfg[section]['args'], so this
        # replaces the nested 'config' dict in place.
        vae_args[role].args['config'] = ConfigObject(optimus_cfg[section]['args']['config'])
    vae_args['tokenizer_encoder'] = ConfigObject(optimus_cfg['optimus_bert_tokenizer'])
    vae_args['tokenizer_decoder'] = ConfigObject(optimus_cfg['optimus_gpt2_tokenizer'])
    vae_args['args'] = ConfigObject(vae_args['args'])
    optimus = ConfigObject(optimus_cfg["optimus_vae"])

    # CLAP
    clap = ConfigObject(load_yaml_config('configs/model/clap.yaml')["clap_audio"])

    # CoDi: inject the per-modality UNet configs into the CoDi UNet's args.
    unet_cfg = load_yaml_config('configs/model/openai_unet.yaml')
    codi_unet_args = unet_cfg["openai_unet_codi"]["args"]
    for arg_name, section in (("unet_image_cfg", "openai_unet_2d"),
                              ("unet_text_cfg", "openai_unet_0dmd"),
                              ("unet_audio_cfg", "openai_unet_2d_audio")):
        codi_unet_args[arg_name] = ConfigObject(unet_cfg[section])
    unet = ConfigObject(unet_cfg["openai_unet_codi"])

    # CLIP
    clip = ConfigObject(load_yaml_config('configs/model/clip.yaml')["clip_frozen"])

    # Create the CoDi model instance.
    return codi.CoDi(audioldm_cfg=audioldm, optimus_cfg=optimus, clip_cfg=clip,
                     clap_cfg=clap, unet_config=unet)

# Dataset definition =============================================================
class MusicCapsTTM(Dataset):
    """Dataset pairing caption embeddings with encoded audio latents.

    Rows come from ``csv_file``; a row is kept only if ``<audio_dir>/<ytid>.wav``
    exists. Both the caption and the audio are encoded with ``model`` inside
    ``__getitem__``.

    Args:
        csv_file: CSV with at least ``ytid`` and ``caption`` columns.
        audio_dir: Directory containing ``<ytid>.wav`` files.
        model: Object exposing ``clip_encode_text`` and ``audioldm_encode``,
            either directly or through a ``.module`` attribute (as with a
            DataParallel / DistributedDataParallel wrapper).
        transform: Stored on the instance; currently not applied to anything.
    """

    def __init__(self, csv_file, audio_dir, model, transform=None):
        self.audio_dir = audio_dir
        self.transform = transform
        self.data = []
        self.model = model

        # Read the CSV file.
        all_data = pd.read_csv(csv_file)

        # Keep only the rows whose audio file actually exists on disk.
        for idx, row in all_data.iterrows():
            audio_path = os.path.join(self.audio_dir, f"{row['ytid']}.wav")
            if os.path.exists(audio_path):
                self.data.append(row)

    def _encoder(self):
        """Return the object that carries the encode methods.

        A wrapped model is unwrapped through ``.module``; an unwrapped model is
        returned unchanged, so the dataset works with either.
        """
        return getattr(self.model, "module", self.model)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data[idx]
        encoder = self._encoder()

        caption = row['caption']  # raw text
        text_emb = encoder.clip_encode_text([caption])

        audio_path = os.path.join(self.audio_dir, f"{row['ytid']}.wav")
        # torchaudio.load returns (waveform, sample_rate); only the waveform is used.
        waveform, _sample_rate = torchaudio.load(audio_path)
        # Encode the waveform into the audio latent with the model's audio encoder.
        mel_latent = encoder.audioldm_encode(waveform)

        if self.transform:
            pass  # placeholder: the transform is accepted but never applied

        return text_emb, mel_latent

In [15]:
### 学習ループ=============================================


def setup(rank, world_size):
    """Initialise the default process group for this process with the NCCL backend.

    Args:
        rank: Rank of the calling process.
        world_size: Total number of participating processes.
    """
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

def cleanup():
    """Destroy the process group created by ``setup``."""
    dist.destroy_process_group()

def train(rank, world_size):
    """Per-process training entry point (launched by ``spawn`` from ``main``).

    Builds the CoDi model on device ``rank``, wraps it in DDP, and runs the
    training loop over this rank's shard of the MusicCaps dataset.

    Args:
        rank: Index of this process; also used as its CUDA device index.
        world_size: Total number of training processes.
    """
    setup(rank, world_size)
    try:
        # Define the model first: the EMA helper and the optimizer below are
        # built from it, so it has to exist before they are created.
        model = model_define()
        model = model.to(rank)
        ddp_model = DDP(model, device_ids=[rank])

        # EMA and optimizer definition
        ema = LitEma(model)
        optimizer_config = {
            'type': 'adam',
            'args': {
                'weight_decay': 1e-4  # Weight decay
            }
        }
        optimizer_config = ConfigObject(optimizer_config)
        optimizer = get_optimizer()(model, optimizer_config)

        # Dataset. The DDP wrapper is handed over because MusicCapsTTM reaches
        # the encoders through `model.module`.
        dataset = MusicCapsTTM(csv_file='/raid/m236866/md-mt/datasets/musiccaps/musiccaps-public.csv',
                               audio_dir='/raid/m236866/md-mt/datasets/musiccaps/musiccaps_30', model=ddp_model)
        sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
        dataloader = DataLoader(dataset, batch_size=5, sampler=sampler, collate_fn=collate_fn)

        # Training loop
        num_epochs = 2
        for epoch in range(num_epochs):
            # Without set_epoch the sampler reuses the same shuffling order in
            # every epoch.
            sampler.set_epoch(epoch)
            for batch_idx, (texts, audios) in enumerate(dataloader):
                # Feed the batch to the model, compute the loss, and let the
                # optimizer update the model weights.
                optimizer.zero_grad()
                # Run the forward pass through the DDP wrapper so gradients are
                # synchronised across ranks during backward.
                loss = ddp_model(x=audios, c=texts)  # loss computation
                loss.backward()
                optimizer.step()
                # Update the EMA.
                # NOTE(review): LitEma's API is not visible in this notebook -
                # confirm that it exposes `update(parameters)`.
                ema.update(model.parameters())

                if batch_idx % 100 == 0:
                    print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item()}')
    finally:
        # Always tear the process group down, even when training fails.
        cleanup()

def main():
    """Launch one ``train`` process per visible CUDA device and wait for all of them."""
    # Number of GPUs to use
    gpu_count = torch.cuda.device_count()
    # Start the training processes; each receives its rank as first argument.
    spawn(train, args=(gpu_count,), nprocs=gpu_count, join=True)

In [1]:
import os
from train import main

# Rendezvous address/port for the process group and the GPUs made visible to it.
os.environ.update({
    'MASTER_ADDR': '192.168.1.10',  # or the IP address of the master node
    'MASTER_PORT': '10001',
    'CUDA_VISIBLE_DEVICES': '0,1,2',
})

main()

  from .autonotebook import tqdm as notebook_tqdm



#######################
# Running in eps mode #
#######################


#######################
# Running in eps mode #
#######################


#######################
# Running in eps mode #
#######################

Keeping EMAs of 3368.
Keeping EMAs of 3368.
Keeping EMAs of 3368.


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/raid/m236866/md-mt/train.py", line 191, in train
    model = model.to(rank)
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 927, in to
    return self._apply(convert)
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 579, in _apply
    module._apply(fn)
  [Previous line repeated 5 more times]
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 602, in _apply
    param_applied = fn(param)
  File "/home/m236866/.conda/envs/codi/lib/python3.8/site-packages/torch/nn/modules/module.py", line 925, in convert
    return t.to(device, dtype if t.is_floating_point() or t.is_complex() else None, non_blocking)
RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 31.75 GiB total capacity; 30.57 GiB already allocated; 2.44 MiB free; 30.98 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
