In [None]:
%%capture
!pip install transformers
!pip install librosa
!pip install srt
!pip install pydub

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from pydub import AudioSegment
from srt import parse, Subtitle
from tqdm import tqdm
import soundfile as sf

In [None]:
# Paths
# Input and output locations used by the rest of the notebook. The
# "/content/drive/MyDrive" prefix presumably refers to a Google Drive mount in
# Colab — TODO confirm the drive is mounted before this cell runs.
audio_path = "/content/drive/MyDrive/audio"  # directory scanned for .wav recordings
caption_path = "/content/drive/MyDrive/whisper_captions"  # directory with the matching .srt files
processed_data_path = "/content/drive/MyDrive/processed_data"  # receives segments, CSVs and embeddings

# Create the output directory up front; exist_ok=True keeps the cell re-runnable.
os.makedirs(processed_data_path, exist_ok=True)

In [None]:
def preprocess_audio(audio_file):
    """Load an audio file with librosa, resampled to 16 kHz.

    Args:
        audio_file: Path to the audio file to load.

    Returns:
        The ``(samples, sample_rate)`` pair produced by ``librosa.load``.
    """
    target_rate = 16000  # Hz
    waveform, rate = librosa.load(audio_file, sr=target_rate)
    return waveform, rate

In [None]:
def parse_srt(srt_file):
    """Read an SRT caption file into ``(start, end, text)`` tuples.

    Args:
        srt_file: Path to the ``.srt`` file.

    Returns:
        A list of ``(start_seconds, end_seconds, text)`` tuples in the order
        produced by ``srt.parse``. ``text`` is lower-cased, stripped, and has
        embedded newlines replaced by single spaces.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark,
    # so the parser never sees a stray U+FEFF in front of the first index.
    with open(srt_file, "r", encoding="utf-8-sig") as file:
        subtitles = list(parse(file.read()))
    return [
        (
            sub.start.total_seconds(),
            sub.end.total_seconds(),
            sub.content.lower().strip().replace("\n", " "),
        )
        for sub in subtitles
    ]

In [None]:
def segment_audio(audio_file, srt_file, output_path):
    """Cut an audio file into one WAV clip per subtitle entry.

    Args:
        audio_file: Path to the source audio file.
        srt_file: Path to the SRT file whose timestamps define the clips.
        output_path: Existing directory that receives the clip files.

    Returns:
        A list of ``(segment_file, text)`` tuples, one per clip written.
        Subtitles whose time range contains no samples (zero or negative
        duration, or starting beyond the end of the audio) are skipped, so the
        list can be shorter than the subtitle list.
    """
    y, sr = preprocess_audio(audio_file)
    subtitles = parse_srt(srt_file)
    total_samples = len(y)
    base_name = os.path.basename(audio_file)  # loop-invariant
    segments = []
    for i, (start, end, text) in enumerate(subtitles):
        # Clamp both ends to the valid sample range so out-of-range timestamps
        # cannot produce surprising slices.
        start_frame = min(max(int(start * sr), 0), total_samples)
        end_frame = min(max(int(end * sr), 0), total_samples)
        if end_frame <= start_frame:
            # Nothing to cut: writing it would create a zero-length WAV and a
            # dataset row with no audio behind it.
            continue
        segment = y[start_frame:end_frame]
        # The file name keeps the subtitle index, so skipped entries leave gaps.
        segment_file = os.path.join(output_path, f"{base_name}_seg_{i}.wav")
        sf.write(segment_file, segment, sr)  # Save the audio segment
        segments.append((segment_file, text))
    return segments

In [None]:
# Process all files
dataset = []
# sorted() makes the processing order (and therefore the dataset row order)
# reproducible; os.listdir returns entries in arbitrary order.
for file in tqdm(sorted(os.listdir(audio_path))):
    if not file.endswith(".wav"):
        continue
    audio_file = os.path.join(audio_path, file)
    srt_file = os.path.join(caption_path, os.path.splitext(file)[0] + ".srt")
    if not os.path.isfile(srt_file):
        # A recording without captions would otherwise abort the whole run
        # with FileNotFoundError; report it and carry on with the rest.
        tqdm.write(f"Skipping {file}: no caption file at {srt_file}")
        continue
    segments = segment_audio(audio_file, srt_file, processed_data_path)
    dataset.extend(segments)

100%|██████████| 60/60 [08:39<00:00,  8.66s/it]


In [None]:
# Save dataset
# One row per audio segment: the path of the clip and its transcript.
df = pd.DataFrame(data=dataset, columns=["audio_path", "text"])
dataset_csv_path = os.path.join(processed_data_path, "dataset.csv")
df.to_csv(dataset_csv_path, index=False)

In [None]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch

In [None]:
# Load the pretrained wav2vec 2.0 processor and model from the same checkpoint.
# Both are kept as module-level globals and read by extract_embeddings.
_WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(_WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(_WAV2VEC2_CHECKPOINT)

def extract_embeddings(audio_file):
    """Compute one fixed-size wav2vec 2.0 embedding for an audio file.

    The audio is loaded at 16 kHz, passed through the module-level
    ``processor`` and ``model``, and the final hidden states are averaged over
    the time axis.

    Note: ``model`` is looked up at call time. A later cell rebinds that name
    to a Tacotron2 instance, so this function has to be called before that
    cell runs (or after re-running the wav2vec 2.0 loading cell).

    Args:
        audio_file: Path to the audio clip.

    Returns:
        A 1-D NumPy array with one value per hidden feature.
    """
    y, sr = librosa.load(audio_file, sr=16000)
    inputs = processor(y, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        # Per the transformers docs: (batch, frames, hidden); batch is 1 here.
        hidden_states = model(**inputs).last_hidden_state
    # Drop only the batch axis. A bare squeeze() would also drop the time axis
    # for clips short enough to yield a single frame, and the mean below would
    # then average over the feature axis and return a scalar.
    return hidden_states.squeeze(0).mean(dim=0).numpy()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.84k [00:00<?, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

In [None]:
# Extract and save embeddings
dataset = pd.read_csv(os.path.join(processed_data_path, "dataset.csv"))
# The iteration variable is deliberately not called `audio_path`: as a
# notebook-level loop variable it would overwrite the input-directory setting
# of the same name and break a later re-run of the segmentation cell.
embeddings = [
    extract_embeddings(segment_path) for segment_path in tqdm(dataset["audio_path"])
]

embeddings_file = os.path.join(processed_data_path, "audio_embeddings.npy")
# One array for all segments; row i belongs to row i of the CSV.
np.save(embeddings_file, np.asarray(embeddings))
# Every row points at the same .npy file; the row position is the lookup key.
dataset["embedding_path"] = embeddings_file
dataset.to_csv(os.path.join(processed_data_path, "dataset_with_embeddings.csv"), index=False)

  0%|          | 75/16797 [01:20<3:56:12,  1.18it/s]

In [None]:
from tacotron2 import Tacotron2, Tacotron2Loss
import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
class TTSDataset(Dataset):
    """Pairs each transcript with its precomputed audio embedding.

    The CSV must contain a ``text`` column and an ``embedding_path`` column.
    The ``.npy`` file named in the first row's ``embedding_path`` is loaded
    once and indexed by row position.
    """

    def __init__(self, csv_path):
        """Load the CSV and the embedding array it points to.

        Args:
            csv_path: Path to the CSV file.

        Raises:
            ValueError: If the CSV has no rows, or the embedding file holds
                fewer rows than the CSV.
        """
        # Read every column as text and disable NA detection. With pandas'
        # defaults an empty transcript (or one reading "null", "nan", "n/a",
        # ...) comes back as float NaN and a purely numeric transcript as a
        # number, so batches would mix strings with non-strings.
        self.data = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        if self.data.empty:
            raise ValueError(f"No rows found in {csv_path}")
        # Positional access: does not depend on how the index is labelled.
        self.embeddings = np.load(self.data["embedding_path"].iloc[0])
        if len(self.embeddings) < len(self.data):
            raise ValueError(
                f"{csv_path} has {len(self.data)} rows but the embedding file "
                f"only holds {len(self.embeddings)}"
            )

    def __len__(self):
        """Return the number of CSV rows."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text, embedding)`` for row ``idx``.

        ``text`` is a string and ``embedding`` a float32 tensor.
        """
        text = self.data.iloc[idx]["text"]
        embedding = self.embeddings[idx]
        return text, torch.tensor(embedding, dtype=torch.float32)

In [None]:
# Load dataset
# Build the dataset from the CSV written by the embedding step and wrap it in
# a shuffling loader that yields batches of up to 8 (text, embedding) pairs.
tts_csv_path = os.path.join(processed_data_path, "dataset_with_embeddings.csv")
dataset = TTSDataset(tts_csv_path)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
# Model initialization
# NOTE(review): this rebinds the module-level name `model`, which
# extract_embeddings reads at call time for the wav2vec 2.0 model. Once this
# cell has run, calling extract_embeddings again would use the Tacotron2
# instance instead; re-run the wav2vec 2.0 loading cell first if that is needed.
model = Tacotron2()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)  # Adam over all model parameters, learning rate 1e-3
criterion = Tacotron2Loss()  # loss object from the `tacotron2` module; its call signature is not visible here

In [None]:
# Training loop
# NOTE(review): the call signatures of `model` and `criterion` come from the
# `tacotron2` module, which is not visible here — confirm that the loss really
# takes only the model outputs.
model.train()
for epoch in range(10):  # Adjust epochs as needed
    epoch_loss = 0.0
    num_batches = 0
    # Distinct loop names so the module-level `embeddings` list built by the
    # embedding-extraction cell is not overwritten by a batch tensor.
    for batch_texts, batch_embeddings in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_texts, batch_embeddings)
        loss = criterion(outputs)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        num_batches += 1
    if num_batches:
        # Report the mean over the epoch rather than whichever batch came last.
        print(f"Epoch {epoch + 1}, Loss: {epoch_loss / num_batches}")
    else:
        # Without this guard an empty dataloader would hit an undefined `loss`.
        print(f"Epoch {epoch + 1}: no batches to train on")

In [None]:
# Save model
# Only the parameters (state dict) are stored, not the model object itself.
# NOTE(review): ".h5" conventionally denotes HDF5, but torch.save writes
# PyTorch's own serialization format; the path is also relative, so the file
# lands in the current working directory — confirm both are intended.
tacotron2_state = model.state_dict()
torch.save(tacotron2_state, "tacotron2_weights.h5")