In [1]:
import torch

In [2]:
from audiocraft.models import CompressionModel, HFEncodecCompressionModel

  from .autonotebook import tqdm as notebook_tqdm
    PyTorch 2.1.0+cu121 with CUDA 1201 (you have 2.1.0+cu118)
    Python  3.8.18 (you have 3.8.10)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


In [3]:
model = CompressionModel.get_pretrained('facebook/encodec_32khz')



## HF to NonHF Converter

In [4]:
state = model.model.state_dict()
new_state = {}
for k, v in state.items():
    if k.startswith('decoder.layers') and '.conv.' in k and '.block.' not in k:
        # We need to determine if this a convtr or a regular conv.
        layer = int(k.split('.')[2])
        if isinstance(model.model.decoder.layers[layer].conv, torch.nn.ConvTranspose1d):

            k = k.replace('.conv.', '.convtr.')
    k = k.replace('encoder.layers.', 'encoder.model.')
    k = k.replace('decoder.layers.', 'decoder.model.')
    k = k.replace('conv.', 'conv.conv.')
    k = k.replace('convtr.', 'convtr.convtr.')
    k = k.replace('quantizer.layers.', 'quantizer.vq.layers.')
    k = k.replace('.codebook.', '._codebook.')
    new_state[k] = v
state = new_state

In [5]:
torch.save(state, "encodec_32khz_whole_nonHF.pt")

In [8]:
quantizer_state = {}
for key in state.keys():
    if 'quantizer' in key:
        quantizer_state[key] = state[key]
quantizer_state.keys()


dict_keys(['quantizer.vq.layers.0._codebook.inited', 'quantizer.vq.layers.0._codebook.cluster_size', 'quantizer.vq.layers.0._codebook.embed', 'quantizer.vq.layers.0._codebook.embed_avg', 'quantizer.vq.layers.1._codebook.inited', 'quantizer.vq.layers.1._codebook.cluster_size', 'quantizer.vq.layers.1._codebook.embed', 'quantizer.vq.layers.1._codebook.embed_avg', 'quantizer.vq.layers.2._codebook.inited', 'quantizer.vq.layers.2._codebook.cluster_size', 'quantizer.vq.layers.2._codebook.embed', 'quantizer.vq.layers.2._codebook.embed_avg', 'quantizer.vq.layers.3._codebook.inited', 'quantizer.vq.layers.3._codebook.cluster_size', 'quantizer.vq.layers.3._codebook.embed', 'quantizer.vq.layers.3._codebook.embed_avg'])

In [None]:
torch.save(quantizer_state, "encodec_32khz_quantizer_nonHF.pt")

In [20]:
model.model

EncodecModel(
  (encoder): EncodecEncoder(
    (layers): ModuleList(
      (0): EncodecConv1d(
        (conv): Conv1d(1, 64, kernel_size=(7,), stride=(1,))
      )
      (1): EncodecResnetBlock(
        (block): ModuleList(
          (0): ELU(alpha=1.0)
          (1): EncodecConv1d(
            (conv): Conv1d(64, 32, kernel_size=(3,), stride=(1,))
          )
          (2): ELU(alpha=1.0)
          (3): EncodecConv1d(
            (conv): Conv1d(32, 64, kernel_size=(1,), stride=(1,))
          )
        )
        (shortcut): Identity()
      )
      (2): ELU(alpha=1.0)
      (3): EncodecConv1d(
        (conv): Conv1d(64, 128, kernel_size=(8,), stride=(4,))
      )
      (4): EncodecResnetBlock(
        (block): ModuleList(
          (0): ELU(alpha=1.0)
          (1): EncodecConv1d(
            (conv): Conv1d(128, 64, kernel_size=(3,), stride=(1,))
          )
          (2): ELU(alpha=1.0)
          (3): EncodecConv1d(
            (conv): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
  

In [23]:
import json

In [18]:
with open('encodec_32khz_config.json') as json_file:
    data = json.load(json_file)
print(data)

{'_name_or_path': 'ArthurZ/encodec_48khz', 'architectures': ['EncodecModel'], 'audio_channels': 2, 'chunk_length_s': 1.0, 'codebook_dim': 128, 'codebook_size': 1024, 'compress': 2, 'dilation_growth_rate': 2, 'hidden_size': 128, 'kernel_size': 7, 'last_kernel_size': 7, 'model_type': 'encodec', 'norm_type': 'time_group_norm', 'normalize': True, 'num_filters': 32, 'num_lstm_layers': 2, 'num_residual_layers': 1, 'overlap': 0.01, 'pad_mode': 'reflect', 'residual_kernel_size': 3, 'sampling_rate': 48000, 'target_bandwidths': [3.0, 6.0, 12.0, 24.0], 'torch_dtype': 'float32', 'transformers_version': '4.31.0.dev0', 'trim_right_ratio': 1.0, 'upsampling_ratios': [8, 5, 4, 2], 'use_causal_conv': False}


In [19]:
from transformers.models.encodec.configuration_encodec import EncodecConfig

In [6]:
model.model.quantizer.state_dict()

OrderedDict([('layers.0.codebook.inited', tensor([1.])),
             ('layers.0.codebook.cluster_size',
              tensor([0.3551, 0.1121, 0.0704,  ..., 0.4311, 0.0968, 0.0752])),
             ('layers.0.codebook.embed',
              tensor([[ 2.3788,  3.0058, -2.3097,  ...,  0.1857,  3.3994,  2.0836],
                      [ 2.2991, -4.7996,  4.0720,  ...,  4.2219, -1.9379,  4.2589],
                      [-2.7116, -3.0016,  4.5625,  ...,  0.1807,  0.8780, -0.0471],
                      ...,
                      [-1.7840, -1.6820,  3.6292,  ...,  0.7658, -0.3749, -0.0599],
                      [ 1.3861, -3.3283,  0.4663,  ...,  0.4332,  1.1878, -0.8231],
                      [ 1.1625, -0.7506,  4.0101,  ..., -3.3142, -0.0616, -2.2769]])),
             ('layers.0.codebook.embed_avg',
              tensor([[ 0.8448,  1.0666, -0.8199,  ...,  0.0662,  1.2071,  0.7399],
                      [ 0.2579, -0.5378,  0.4559,  ...,  0.4734, -0.2173,  0.4778],
                      [-0.19

In [12]:
from transformers.models.encodec.modeling_encodec import EncodecResidualVectorQuantizer

In [25]:
quantizer.load_state_dict(model.model.quantizer.state_dict())

<All keys matched successfully>

In [27]:
model.frame_rate

50.0

In [26]:
torch.save(model.model.quantizer.state_dict(), 'encodec_32khz_quantizer.pt')

In [1]:
config = EncodecConfig.from_pretrained('facebook/encodec_32khz')

NameError: name 'EncodecConfig' is not defined

In [24]:
quantizer = EncodecResidualVectorQuantizer(config = EncodecConfig.from_pretrained('facebook/encodec_32khz'))

loading configuration file config.json from cache at /home/jongmin/.cache/huggingface/hub/models--facebook--encodec_32khz/snapshots/d0c45384f6c44db055f78200cfdcb9c1c8706727/config.json
Model config EncodecConfig {
  "architectures": [
    "EncodecModel"
  ],
  "audio_channels": 1,
  "chunk_length_s": null,
  "codebook_dim": 128,
  "codebook_size": 2048,
  "compress": 2,
  "dilation_growth_rate": 2,
  "hidden_size": 128,
  "kernel_size": 7,
  "last_kernel_size": 7,
  "model_type": "encodec",
  "norm_type": "weight_norm",
  "normalize": false,
  "num_filters": 64,
  "num_lstm_layers": 2,
  "num_residual_layers": 1,
  "overlap": null,
  "pad_mode": "reflect",
  "residual_kernel_size": 3,
  "sampling_rate": 32000,
  "target_bandwidths": [
    2.2
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.36.2",
  "trim_right_ratio": 1.0,
  "upsampling_ratios": [
    8,
    5,
    4,
    4
  ],
  "use_causal_conv": false,
  "use_conv_shortcut": false
}



In [1]:
import torch.nn as nn
import torch

In [12]:
class SemCodecMidiDecoder(nn.Module):
  def __init__(self, in_channels=128, hidden_size=256, out_channels=88, kernel_size=3, stride=1, padding=1):
      super().__init__()
      self.out_channels = out_channels
      # self.emb = SummationEmbedder(vocab_size=[2048, 2048, 2048, 2048], input_keys = 4, dim=in_channels)
      self.layers = nn.Sequential(
            nn.Conv1d(in_channels=in_channels, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=1),
            nn.BatchNorm1d(hidden_size),
            nn.ELU(),
            nn.Dropout(0.5),
            nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=1),
            nn.BatchNorm1d(hidden_size),
            nn.ELU(),
            nn.Dropout(0.5),
            nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=kernel_size, stride=1, padding=1),
            nn.BatchNorm1d(hidden_size),
            nn.ELU(),
            nn.Dropout(0.5),
        )
      self.rnn = nn.GRU(input_size=hidden_size, hidden_size=hidden_size//2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.2)
      self.proj = nn.Linear(hidden_size, out_channels*2)
      self.act = nn.Sigmoid()
  
  def forward(self, x):
      # x = self.emb(x.permute(0,2,1))
      x = self.layers(x)
      x = self.rnn(x.permute(0,2,1))[0]
      x = self.proj(x)
      x = self.act(x.permute(0,2,1))
      x = torch.stack([x[:,:88,:],x[:,88:,:]], dim=1)
      return x

In [13]:
class SemCodecOnlyMidi(nn.Module):
  def __init__(self, in_channels=128, hidden_size=256, out_channels=88, kernel_size=3, stride=1, padding=1):
      super().__init__()
      self.frame_rate = 50
      self.out_channels = out_channels
      self.quantizer = EncodecResidualVectorQuantizer(config = EncodecConfig.from_pretrained('facebook/encodec_32khz'))
      self.quantizer.load_state_dict(torch.load('/home/jongmin/userdata/audiocraft/encodec_32khz_quantizer.pt'))
      self.decoder = SemCodecMidiDecoder(in_channels=in_channels, hidden_size=hidden_size, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding)

  def forward(self, x):
      x = self.quantizer.decode(x)
      x = self.decoder(x)
      return x

In [7]:
from transformers.models.encodec.modeling_encodec import EncodecResidualVectorQuantizer
from transformers.models.encodec.configuration_encodec import EncodecConfig

In [14]:
semcodecmidi = SemCodecOnlyMidi()
semcodecmidi

SemCodecOnlyMidi(
  (quantizer): EncodecResidualVectorQuantizer(
    (layers): ModuleList(
      (0-3): 4 x EncodecVectorQuantization(
        (codebook): EncodecEuclideanCodebook()
      )
    )
  )
  (decoder): SemCodecMidiDecoder(
    (layers): Sequential(
      (0): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ELU(alpha=1.0)
      (3): Dropout(p=0.5, inplace=False)
      (4): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (5): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ELU(alpha=1.0)
      (7): Dropout(p=0.5, inplace=False)
      (8): Conv1d(256, 256, kernel_size=(3,), stride=(1,), padding=(1,))
      (9): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (10): ELU(alpha=1.0)
      (11): Dropout(p=0.5, inplace=False)
    )
    (rnn): GRU(256, 128, num_layers=

In [9]:
tokens = torch.load("/home/jongmin/userdata/MAESTRO/maestro-v3.0.0/2004/MIDI-Unprocessed_SMF_02_R1_2004_01-05_ORIG_MID--AUDIO_02_R1_2004_05_Track05_wav_encodec.pt")

In [10]:
token = torch.stack([tokens[0].squeeze(0),tokens[30].squeeze(0)])
token.shape

torch.Size([2, 4, 1500])

In [15]:
out = semcodecmidi(token.transpose(0,1))

In [16]:
out.shape

torch.Size([2, 2, 88, 1500])

In [19]:
import pandas as pd    
jsonObj = pd.read_json(path_or_buf='/home/jongmin/userdata/SemCodec/egs/midiaudio_test/data.jsonl', lines=True)

In [18]:
for i in jsonObj["duration"]:
    if i < 30:
        print(i)