In [1]:
import librosa
import soundfile as sf
import os

In [3]:
"""
def preprocess_audio(input_dir, output_dir, target_sr=16000):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for file_name in os.listdir(input_dir):
        if file_name.endswith(".wav"):
            file_path = os.path.join(input_dir, file_name)
            audio, sr = librosa.load(file_path, sr=None)
            
            # 将音频采样率转换为目标采样率
            if sr != target_sr:
                audio = librosa.resample(y=audio, orig_sr=sr, target_sr=target_sr)
            
            # 保存预处理后的音频
            output_path = os.path.join(output_dir, file_name)
            sf.write(output_path, audio, target_sr)

# 调用函数进行预处理
preprocess_audio("/Users/user/Downloads/audio_test/input", "/Users/user/Downloads/audio_test/output")
"""

In [8]:
import pandas as pd
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from speechbrain.pretrained import Tacotron2, HIFIGAN
import speechbrain.utils as su
import shutil

class CustomAudioDataset(Dataset):
    def __init__(self, csv_file, audio_dir, transform=None):
        """
        Args:
            csv_file (str): Path to the CSV file with audio file paths and transcripts.
            audio_dir (str): Directory with all the audio files.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.transform = transform

    def __len__(self):
        # Returns the total number of samples
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the audio file path and transcript from the CSV
        audio_path = self.data.iloc[idx, 1]
        print("audio_path", audio_path)
        transcript = self.data.iloc[idx, 2]

        # Load the audio file
        waveform, sample_rate = torchaudio.load(audio_path)
        print("original waveform:",waveform)

        # Apply any transformations (if specified)
        if self.transform:
            waveform = self.transform(waveform)

        # You can also preprocess the transcript here if necessary

        # Return a dictionary containing the waveform, transcript, and sample rate
        sample = {
            'waveform': waveform,
            'transcript': transcript,
            'sample_rate': sample_rate
        }

        return sample

# Paths to the CSV file and audio directory
csv_file = "/Users/user/myenv/output_transcripts.csv"
audio_dir = "/Users/user/Downloads/audio_test/input"

# 定义一个简单的音频预处理变换
# transform = T.Resample(orig_freq=48000, new_freq=16000)

# Create an instance of the custom dataset
dataset = CustomAudioDataset(csv_file=csv_file, audio_dir=audio_dir)
print("dataset:", dataset)
# Create a DataLoader to iterate through the dataset
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

for i, batch in enumerate(data_loader):
    waveform = batch['waveform']
    transcript = batch['transcript']
    sample_rate = batch['sample_rate']

    print(f"Waveform: {waveform.shape}, Transcript: {transcript}, Sample Rate: {sample_rate}")

# 删除保存的模型目录
shutil.rmtree('tmpdir_tts', ignore_errors=True)

# 加载预训练模型
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")

# 定义训练参数
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tacotron2.to(device)
hifi_gan.to(device)

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(tacotron2.parameters(), lr=1e-4)


# 训练循环
def train_model(data_loader, model, optimizer, criterion, device):
    model.train()
    for batch in data_loader:
        print("------batch:", batch)
        print(f"Number of items in the batch: {len(batch)}")
        waveform = batch["waveform"]
        transcript = batch["transcript"]
        sample_rate = batch["sample_rate"]
        print("---*******--- waveform: ", waveform)
        print("---*******--- transcript: ", transcript)
        print("---*******--- sample_rate: ", sample_rate)
        #waveforms = waveforms.to(device)
        mel_outputs, mel_lengths, alignment = tacotron2.encode_batch(transcript)
        waveforms_hat = hifi_gan.decode_batch(mel_outputs)
        print("---*******--- waveforms_hat: ", waveforms_hat)

        # 计算损失
        loss = criterion(waveforms_hat, waveform)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Loss: {loss.item()}")

# 运行训练
train_model(data_loader, tacotron2, optimizer, criterion, device)
"""

num_epochs = 10  # Number of epochs

for epoch in range(num_epochs):
    tacotron2.train()  # Set the model to training mode

    running_loss = 0.0

    for i, batch in enumerate(data_loader):
        # Extract the data from the batch
        waveforms = batch['waveform'].to(device)
        transcripts = batch['transcript']

        # Tacotron2 expects text input, so you'd encode transcripts to text sequences
        encoded_transcripts = [su.text_to_sequence.text_to_sequence(transcript, "english_cleaners") for transcript in transcripts]

        # Forward pass through Tacotron2 to get mel-spectrograms
        print("----Check it out---")
        #print(encoded_transcripts[0])
        string_encoded_transcripts = [[str(item) for item in sublist] for sublist in encoded_transcripts]
        #print(string_encoded_transcripts[0])
        mel_outputs, mel_lengths, alignments = tacotron2(string_encoded_transcripts[i])

        #print(waveforms)
        print("waveform[i]", waveforms[i])
        mel_outputs, _, _, alignments = tacotron2(transcripts[i], waveforms[i])

        # Forward pass through HiFi-GAN to get the generated waveforms
        waveforms_hat = hifi_gan(mel_outputs)

        # Compute the loss between generated waveforms and the original waveforms
        loss = criterion(waveforms_hat, waveforms)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 10 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(data_loader)}], Loss: {loss.item():.4f}')

    print(f'End of Epoch {epoch+1}, Average Loss: {running_loss / len(data_loader):.4f}')

    # Save the model checkpoints
    torch.save(tacotron2.state_dict(), f'tacotron2_epoch_{epoch+1}.pth')
    torch.save(hifi_gan.state_dict(), f'hifigan_epoch_{epoch+1}.pth')
"""

dataset: <__main__.CustomAudioDataset object at 0x12ccdc490>
audio_path /Users/user/Downloads/audio_test/input/Mountain-1-wav.wav
original waveform: tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  9.1553e-05,
         -1.2207e-04, -2.4414e-04],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.0518e-04,
          9.1553e-05, -2.1362e-04]])
Waveform: torch.Size([1, 2, 6647008]), Transcript: ["Mountain meditation this is an Equanimity practice it doesn't depend on the cultivation of concentration or open monitoring and can be used at the beginning of treatment or whenever balance or perspective is needed through Equanimity we develop the ability to stay open to suffering to meet life as it is and to be with whatever arises holding it with patience and understanding start by sitting comfortably taking a moment to ground and Center yourself"], Sample Rate: tensor([44100])
audio_path /Users/user/Downloads/audio_test/input/Mountain-2-wav.wav
original waveform: tensor([[1.4954e-



audio_path /Users/user/Downloads/audio_test/input/Mountain-2-wav.wav
original waveform: tensor([[1.4954e-03, 1.7395e-03, 1.7700e-03,  ..., 0.0000e+00, 1.2207e-04,
         1.8311e-04],
        [1.0681e-03, 1.2817e-03, 1.3733e-03,  ..., 6.1035e-05, 1.5259e-04,
         1.5259e-04]])
------batch: {'waveform': tensor([[[1.4954e-03, 1.7395e-03, 1.7700e-03,  ..., 0.0000e+00,
          1.2207e-04, 1.8311e-04],
         [1.0681e-03, 1.2817e-03, 1.3733e-03,  ..., 6.1035e-05,
          1.5259e-04, 1.5259e-04]]]), 'transcript': ['for one that you create with your imagination it can be alone or part of a mountain range this mountain time the arms and shoulders the slopes the spine allow yourself to become centered grounded and present'], 'sample_rate': tensor([44100])}
Number of items in the batch: 3
---*******--- waveform:  tensor([[[1.4954e-03, 1.7395e-03, 1.7700e-03,  ..., 0.0000e+00,
          1.2207e-04, 1.8311e-04],
         [1.0681e-03, 1.2817e-03, 1.3733e-03,  ..., 6.1035e-05,
          1

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (258560) must match the size of tensor b (5314006) at non-singleton dimension 2