In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Model 1: uses the facebook/mms-tts-swh pretrained tokenizer and a Bidirectional RNN (BiRNN) with an attention mechanism

In [34]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import librosa
import soundfile as sf

# Load the tokenizer published with the pretrained checkpoint.
# Only the tokenizer is loaded here; this block does not load any model weights.
model_name = "facebook/mms-tts-swh"  # Swahili TTS model from Facebook
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function
def preprocess_text(text):
    """Strip sentence markers from a transcript and extract its tweet tag.

    Args:
        text: Raw transcript string. It may contain ``<s>`` / ``</s>`` markers
            (optionally with inner whitespace) and a ``(tweet_<digits>)`` tag.

    Returns:
        A ``(clean_text, tweet_id)`` tuple. ``clean_text`` is ``text`` with the
        markers and every tweet tag removed and outer whitespace stripped.
        ``tweet_id`` is the first tag found, parentheses included (for example
        ``"(tweet_12)"``), or ``None`` when the text has no tag.
    """
    # Imported locally because this cell's import block does not import `re`;
    # without it the function raises NameError on a fresh kernel.
    import re

    text = re.sub(r"<\s*s>|<\s*/\s*s>", "", text).strip()
    match = re.search(r"\(tweet_\d+\)", text)
    tweet_id = match.group(0) if match else None
    text = re.sub(r"\(tweet_\d+\)", "", text).strip()
    return text, tweet_id

# Load the file-mapping spreadsheet. TTS_Dataset reads its "Text File Path" and
# "Mel Spectrogram File Path" columns to locate each transcript / spectrogram pair.
df = pd.read_excel("/kaggle/input/file-mapping/file_mapping.xlsx")

class TTS_Dataset(Dataset):
    """Dataset pairing each transcript file with its mel-spectrogram image.

    Every row of ``df`` must provide a "Text File Path" and a
    "Mel Spectrogram File Path" column; both files are read lazily per item.
    """

    def __init__(self, df, tokenizer):
        """Store the file-mapping table and the tokenizer applied to transcripts."""
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (samples) in the mapping table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index into ``df`` (``iloc`` semantics).

        Returns:
            ``(token_ids, mel_spectrogram, tweet_id)`` where ``token_ids`` is the
            tokenizer's ``input_ids`` with the batch axis squeezed away,
            ``mel_spectrogram`` is a float32 tensor with a leading singleton
            axis, and ``tweet_id`` is whatever ``preprocess_text`` extracted
            (possibly ``None``).
        """
        row = self.df.iloc[idx]
        text_path = row["Text File Path"]
        mel_path = row["Mel Spectrogram File Path"]

        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
        text, tweet_id = preprocess_text(text)
        # Transcripts are padded / truncated to exactly 256 tokens.
        tokenized_text = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)

        # Open the image in a context manager so the file handle is released as
        # soon as the pixels are copied into the array (PIL loads lazily and
        # otherwise keeps the file open).
        with Image.open(mel_path) as mel_image:
            # Scales by 255 -- assumes an 8-bit image; TODO confirm.
            mel_spectrogram = np.array(mel_image) / 255.0
        # Prepend a singleton axis: a 2-D image becomes 3-D, an image with a
        # channel axis becomes 4-D.
        mel_spectrogram = torch.tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)

        return tokenized_text.input_ids.squeeze(0), mel_spectrogram, tweet_id

# Custom collate function
def collate_fn(batch):
    """Collate ``(token_ids, mel, tweet_id)`` samples into batched tensors.

    Token-id vectors are right-padded with zeros to the longest vector in the
    batch; every mel tensor is right-padded with zeros along its last axis to
    the largest last-axis size in the batch. Both are then stacked.

    Args:
        batch: Non-empty sequence of ``(token_ids, mel, tweet_id)`` tuples,
            where ``token_ids`` is a 1-D tensor and ``mel`` has rank >= 1.

    Returns:
        ``(texts, mels, tweet_ids)``: the stacked token ids (same dtype as the
        inputs), the stacked mels, and a tuple of the tweet ids in batch order.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    tokenized_texts, mel_specs, tweet_ids = zip(*batch)
    max_text_len = max(t.shape[0] for t in tokenized_texts)
    max_mel_len = max(m.shape[-1] for m in mel_specs)

    # new_zeros keeps the dtype/device of the ids, so integer token ids are not
    # promoted to float by the concatenation.
    # NOTE(review): 0 is used as the padding id -- confirm it matches the
    # tokenizer's pad token.
    padded_texts = [
        torch.cat([t, t.new_zeros(max_text_len - t.shape[0])])
        for t in tokenized_texts
    ]

    # Pad the last axis for any rank, so no sample is silently dropped and
    # texts and mels always stay aligned.
    padded_mels = []
    for m in mel_specs:
        pad_shape = list(m.shape)
        pad_shape[-1] = max_mel_len - m.shape[-1]
        padded_mels.append(torch.cat([m, m.new_zeros(pad_shape)], dim=-1))

    return torch.stack(padded_texts), torch.stack(padded_mels), tweet_ids

# Train-Test Split: hold out 20% of the mapping rows; random_state fixes the split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Dataloaders: both use the custom collate_fn; only the training loader shuffles.
train_dataset = TTS_Dataset(train_df, tokenizer)
test_dataset = TTS_Dataset(test_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# Advanced Attention Layer (Multi-Head Attention)
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention over bidirectional RNN features.

    Wraps ``nn.MultiheadAttention`` with an embedding size of
    ``2 * hidden_dim`` and batch-first inputs.
    """

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        feature_dim = 2 * hidden_dim
        self.attn = nn.MultiheadAttention(
            embed_dim=feature_dim,
            num_heads=num_heads,
            batch_first=True,
        )

    def forward(self, rnn_output):
        """Attend ``rnn_output`` to itself and return only the attended features."""
        # Query, key and value are the same tensor; the weights are discarded.
        return self.attn(rnn_output, rnn_output, rnn_output)[0]

# BiRNN with Multi-Head Attention
class BiRNNWithAttention(nn.Module):
    """Embedding -> 2-layer bidirectional LSTM -> self-attention -> linear head.

    The linear head emits ``mel_shape[1] * mel_shape[2] * mel_shape[3]`` values
    per sample, which ``forward`` reshapes to ``(-1, *mel_shape[1:4])``.
    ``mel_shape[0]`` is not used.
    """

    def __init__(self, input_dim, hidden_dim, mel_shape, vocab_size, num_heads=4):
        super().__init__()
        rnn_features = hidden_dim * 2  # forward + backward directions
        mel_values = mel_shape[1] * mel_shape[2] * mel_shape[3]

        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.rnn = nn.LSTM(
            input_dim,
            hidden_dim,
            bidirectional=True,
            batch_first=True,
            num_layers=2,
        )
        self.attn = MultiHeadAttention(hidden_dim, num_heads)
        self.fc = nn.Linear(rnn_features, mel_values)
        self.mel_shape = mel_shape

    def forward(self, x):
        """Map a batch of token ids to a batch of spectrogram-shaped tensors."""
        token_ids = x.long()  # embedding lookup needs integer indices
        embedded = self.embedding(token_ids)
        rnn_out, _state = self.rnn(embedded)  # (batch_size, seq_len, hidden_dim * 2)

        attended = self.attn(rnn_out)

        # Only the final sequence position feeds the output head.
        last_step = attended[:, -1, :]
        flat_mel = self.fc(last_step)

        return flat_mel.view(-1, *self.mel_shape[1:4])

# Model initialization
# BiRNNWithAttention only reads mel_shape[1], mel_shape[2] and mel_shape[3] to size
# its output; the leading entry is not used by the model.
mel_shape = (16, 400, 1000, 4)  # This should match your mel_spectrogram shape
vocab_size = tokenizer.vocab_size  # Retrieve the vocab_size from your tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiRNNWithAttention(input_dim=256, hidden_dim=128, mel_shape=mel_shape, vocab_size=vocab_size, num_heads=4).to(device)

# Loss and optimizer
criterion = nn.MSELoss()  # Assuming you're using MSE Loss for TTS
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training Loop
NUM_EPOCHS = 5  # single source of truth for the loop bound and the progress message
for epoch in range(NUM_EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        tokenized_text, mel_spectrogram, _ = batch
        tokenized_text, mel_spectrogram = tokenized_text.to(device), mel_spectrogram.to(device)

        optimizer.zero_grad()
        outputs = model(tokenized_text)

        # squeeze(1) drops the singleton axis added per sample by the dataset so the
        # target lines up with the model output -- presumably (batch, H, W, C); TODO confirm.
        loss = criterion(outputs, mel_spectrogram.squeeze(1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {total_train_loss/len(train_loader):.4f}")

# Testing Loop
model.eval()
total_test_loss = 0
with torch.no_grad():
    for batch in test_loader:
        tokenized_text, mel_spectrogram, _ = batch
        tokenized_text, mel_spectrogram = tokenized_text.to(device), mel_spectrogram.to(device)

        outputs = model(tokenized_text)
        loss = criterion(outputs, mel_spectrogram.squeeze(1))
        total_test_loss += loss.item()

print(f"Final Test Loss: {total_test_loss/len(test_loader):.4f}")

# Save a test sample to listen to the generated audio
sample_text, mel_spectrogram, _ = test_dataset[0]  # Pick a test sample
# sample_text is already a tensor; clone().detach() copies it without the UserWarning
# that torch.tensor(<tensor>) emits. unsqueeze(0) adds the batch axis.
sample_text = sample_text.clone().detach().unsqueeze(0).to(device)
# Ground-truth spectrogram of the sample; not used further in this block.
mel_spectrogram = mel_spectrogram.squeeze(1).to(device)

# Generate mel spectrogram using the trained model
model.eval()
with torch.no_grad():
    output_mel = model(sample_text)

# Convert Mel spectrogram back to audio (using Griffin-Lim algorithm)
#output_mel = output_mel.squeeze().cpu().numpy()  # Convert to CPU and NumPy for Griffin-Lim
#output_audio = librosa.feature.inverse.mel_to_audio(output_mel)

# Rescale audio to the range [-1, 1] (if necessary)
#output_audio = np.clip(output_audio, -1.0, 1.0)

# Save the audio to a file
#sf.write('generated_audio.wav', output_audio, 22050)  # 22050 Hz sample rate

# Play audio (if in Jupyter environment)
#import IPython.display as ipd
#ipd.Audio('generated_audio.wav')


Epoch [1/5], Train Loss: 0.0368
Epoch [2/5], Train Loss: 0.0120
Epoch [3/5], Train Loss: 0.0120
Epoch [4/5], Train Loss: 0.0118
Epoch [5/5], Train Loss: 0.0117
Final Test Loss: 0.0115


  sample_text = torch.tensor(sample_text).unsqueeze(0).to(device)
  mel_basis = filters.mel(


LibsndfileError: Error opening 'generated_audio.wav': Format not recognised.

## Model 1: uses the facebook/mms-tts-swh pretrained tokenizer and a Bidirectional RNN (BiRNN) with an attention mechanism
## Mel spectrogram to audio: demonstrated here on a single test sample

In [43]:
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd

def save_and_play_audio(output_mel, sr=22050, out_path='generated_audio.wav'):
    """Invert a mel spectrogram to audio, save it as a WAV file and return a player.

    Args:
        output_mel: Mel spectrogram as a NumPy array or a torch tensor (on any
            device). After singleton axes are squeezed it must be 2-D, or 3-D
            with a trailing axis that is averaged away.
        sr: Sample rate used for the inversion and for the written file.
        out_path: Destination of the WAV file.

    Returns:
        An ``IPython.display.Audio`` object for ``out_path``, or ``None`` when
        saving or creating the player failed (the error is printed).

    Raises:
        ValueError: If the spectrogram is not 2-D after squeezing / averaging.
    """
    # librosa works on NumPy arrays: move torch tensors off the graph and the GPU first.
    if hasattr(output_mel, "detach"):
        output_mel = output_mel.detach().cpu().numpy()
    output_mel = np.squeeze(np.asarray(output_mel))

    # Handle different channel configurations
    if output_mel.ndim == 3:
        output_mel = output_mel.mean(axis=-1)  # Average across the trailing axis

    # Fail early with a clear message instead of an IndexError on shape[1].
    if output_mel.ndim != 2:
        raise ValueError(
            f"expected a 2-D mel spectrogram after squeezing, got shape {output_mel.shape}"
        )

    # Heuristic orientation fix: treat the shorter axis as n_mels -> [n_mels, time].
    if output_mel.shape[0] > output_mel.shape[1]:
        output_mel = output_mel.T

    # Convert Mel to audio
    output_audio = librosa.feature.inverse.mel_to_audio(
        output_mel,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_iter=100
    )

    # Normalize to a 0.95 peak, clip to [-1, 1] and store as float32.
    output_audio = librosa.util.normalize(output_audio) * 0.95
    output_audio = np.clip(output_audio, -1.0, 1.0)
    output_audio = output_audio.squeeze().astype(np.float32)

    # Save with explicit format specifications
    try:
        sf.write(
            out_path,
            output_audio,
            sr,
            subtype='PCM_24'  # Explicitly specify the format subtype
        )
    except Exception as e:
        # Best-effort: report diagnostics instead of aborting the notebook run.
        print(f"Error saving audio: {str(e)}")
        print("Audio shape:", output_audio.shape)
        print("Audio dtype:", output_audio.dtype)
        print("Audio range:", output_audio.min(), output_audio.max())
        return None

    # Verify and play
    try:
        print("Successfully saved audio file")
        return ipd.Audio(out_path)
    except Exception as e:
        print(f"Error playing audio: {str(e)}")
        return None

# Usage example: convert output_mel (the spectrogram predicted for the test sample
# by the model cell above) to audio. As the last expression of the cell, the
# returned value is what the notebook displays.
save_and_play_audio(output_mel)

Successfully saved audio file


## Uses the facebook/mms-tts-swh pretrained tokenizer with a Convolutional Neural Network (CNN) + GRU model

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from PIL import Image
import librosa
import soundfile as sf
import re
import os

# Load the tokenizer published with the pretrained checkpoint.
# Only the tokenizer is loaded here; this block does not load any model weights.
model_name = "facebook/mms-tts-swh"  # Swahili TTS model from Facebook
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing function
def preprocess_text(text):
    """Remove sentence markers and the tweet tag from a transcript.

    Returns:
        ``(clean_text, tweet_id)``; ``tweet_id`` is the first
        ``(tweet_<digits>)`` tag, parentheses included, or ``None`` if absent.
    """
    without_markers = re.sub(r"<\s*s>|<\s*/\s*s>", "", text).strip()

    found = re.search(r"\(tweet_\d+\)", without_markers)
    tweet_id = None if not found else found.group(0)

    cleaned = re.sub(r"\(tweet_\d+\)", "", without_markers).strip()
    return cleaned, tweet_id

# Load the file-mapping spreadsheet. TTS_Dataset reads its "Text File Path" and
# "Mel Spectrogram File Path" columns to locate each transcript / spectrogram pair.
df = pd.read_excel("/kaggle/input/file-mapping/file_mapping.xlsx")

class TTS_Dataset(Dataset):
    """Dataset pairing each transcript file with its mel-spectrogram image.

    Every row of ``df`` must provide a "Text File Path" and a
    "Mel Spectrogram File Path" column; both files are read lazily per item.
    """

    def __init__(self, df, tokenizer):
        """Store the file-mapping table and the tokenizer applied to transcripts."""
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of rows (samples) in the mapping table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample.

        Args:
            idx: Positional row index into ``df`` (``iloc`` semantics).

        Returns:
            ``(token_ids, mel_spectrogram, tweet_id)`` where ``token_ids`` is the
            tokenizer's ``input_ids`` with the batch axis squeezed away,
            ``mel_spectrogram`` is a float32 tensor with a leading singleton
            axis, and ``tweet_id`` is whatever ``preprocess_text`` extracted
            (possibly ``None``).
        """
        row = self.df.iloc[idx]
        text_path = row["Text File Path"]
        mel_path = row["Mel Spectrogram File Path"]

        with open(text_path, "r", encoding="utf-8") as f:
            text = f.read().strip()
        text, tweet_id = preprocess_text(text)
        # Transcripts are padded / truncated to exactly 256 tokens.
        tokenized_text = self.tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)

        # Open the image in a context manager so the file handle is released as
        # soon as the pixels are copied into the array (PIL loads lazily and
        # otherwise keeps the file open).
        with Image.open(mel_path) as mel_image:
            # Scales by 255 -- assumes an 8-bit image; TODO confirm.
            mel_spectrogram = np.array(mel_image) / 255.0
        # Prepend a singleton axis: a 2-D image becomes 3-D, an image with a
        # channel axis becomes 4-D.
        mel_spectrogram = torch.tensor(mel_spectrogram, dtype=torch.float32).unsqueeze(0)

        return tokenized_text.input_ids.squeeze(0), mel_spectrogram, tweet_id

# Custom collate function
def collate_fn(batch):
    """Collate ``(token_ids, mel, tweet_id)`` samples into batched tensors.

    Token-id vectors are right-padded with zeros to the longest vector in the
    batch; every mel tensor is right-padded with zeros along its last axis to
    the largest last-axis size in the batch. Both are then stacked.

    Args:
        batch: Non-empty sequence of ``(token_ids, mel, tweet_id)`` tuples,
            where ``token_ids`` is a 1-D tensor and ``mel`` has rank >= 1.

    Returns:
        ``(texts, mels, tweet_ids)``: the stacked token ids (same dtype as the
        inputs), the stacked mels, and a tuple of the tweet ids in batch order.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    tokenized_texts, mel_specs, tweet_ids = zip(*batch)
    max_text_len = max(t.shape[0] for t in tokenized_texts)
    max_mel_len = max(m.shape[-1] for m in mel_specs)

    # new_zeros keeps the dtype/device of the ids, so integer token ids are not
    # promoted to float by the concatenation.
    # NOTE(review): 0 is used as the padding id -- confirm it matches the
    # tokenizer's pad token.
    padded_texts = [
        torch.cat([t, t.new_zeros(max_text_len - t.shape[0])])
        for t in tokenized_texts
    ]

    # Pad the last axis for any rank, so no sample is silently dropped and
    # texts and mels always stay aligned.
    padded_mels = []
    for m in mel_specs:
        pad_shape = list(m.shape)
        pad_shape[-1] = max_mel_len - m.shape[-1]
        padded_mels.append(torch.cat([m, m.new_zeros(pad_shape)], dim=-1))

    return torch.stack(padded_texts), torch.stack(padded_mels), tweet_ids

# Train-Test Split: hold out 20% of the mapping rows; random_state fixes the split.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Dataloaders: both use the custom collate_fn; only the training loader shuffles.
train_dataset = TTS_Dataset(train_df, tokenizer)
test_dataset = TTS_Dataset(test_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

# CNN + GRU Model for Text-to-Speech
class CNN_GRU_TTS(nn.Module):
    """Embedding -> three Conv1d/ReLU/MaxPool stages -> GRU -> linear head.

    The head emits ``mel_shape[1] * mel_shape[2] * mel_shape[3]`` values per
    sample, which ``forward`` reshapes to ``(-1, *mel_shape[1:])``.
    """

    def __init__(self, vocab_size, hidden_dim, mel_shape):
        super().__init__()

        # Token embedding
        self.embedding = nn.Embedding(vocab_size, hidden_dim)

        # Convolution stack: hidden_dim -> 256 -> 128 -> 64 channels
        self.conv1 = nn.Conv1d(in_channels=hidden_dim, out_channels=256, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=1)
        # One pooling module shared by all three stages.
        self.pool = nn.MaxPool1d(kernel_size=2)

        # Recurrent layer over the pooled feature sequence
        self.gru = nn.GRU(64, hidden_dim, batch_first=True)

        # Output head sized to one flattened spectrogram
        flat_mel_size = mel_shape[1] * mel_shape[2] * mel_shape[3]
        self.fc = nn.Linear(hidden_dim, flat_mel_size)

        self.mel_shape = mel_shape

    def forward(self, x):
        """Map a batch of token ids to a batch of spectrogram-shaped tensors."""
        features = self.embedding(x.long())  # integer indices for the lookup
        features = features.transpose(1, 2)  # (batch, channels, seq_len) for Conv1d

        # Each stage: convolution, ReLU, then max-pooling with kernel size 2.
        for conv in (self.conv1, self.conv2, self.conv3):
            features = self.pool(torch.relu(conv(features)))

        features = features.transpose(1, 2)  # back to (batch, seq_len, channels)

        gru_out, _hidden = self.gru(features)

        # Only the last GRU step feeds the output head.
        flat_mel = self.fc(gru_out[:, -1, :])

        target_dims = self.mel_shape[1:]
        return flat_mel.view(-1, *target_dims)

# Model initialization
# CNN_GRU_TTS only reads mel_shape[1:] to size and reshape its output; the
# leading entry is not used by the model.
mel_shape = (16, 400, 1000, 4)  # Example mel spectrogram shape
vocab_size = tokenizer.vocab_size  # Get vocab size from tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNN_GRU_TTS(vocab_size=vocab_size, hidden_dim=256, mel_shape=mel_shape).to(device)

# Loss and optimizer
criterion = nn.MSELoss()  # Assuming MSE Loss for TTS task
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training Loop
NUM_EPOCHS = 5  # single source of truth for the loop bound and the progress message
for epoch in range(NUM_EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        tokenized_text, mel_spectrogram, _ = batch
        tokenized_text, mel_spectrogram = tokenized_text.to(device), mel_spectrogram.to(device)

        optimizer.zero_grad()
        outputs = model(tokenized_text)

        # squeeze(1) drops the singleton axis added per sample by the dataset so the
        # target lines up with the model output -- presumably (batch, H, W, C); TODO confirm.
        loss = criterion(outputs, mel_spectrogram.squeeze(1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{NUM_EPOCHS}], Train Loss: {total_train_loss/len(train_loader):.4f}")

# Testing Loop
model.eval()
total_test_loss = 0
with torch.no_grad():
    for batch in test_loader:
        tokenized_text, mel_spectrogram, _ = batch
        tokenized_text, mel_spectrogram = tokenized_text.to(device), mel_spectrogram.to(device)

        outputs = model(tokenized_text)
        loss = criterion(outputs, mel_spectrogram.squeeze(1))
        total_test_loss += loss.item()

print(f"Final Test Loss: {total_test_loss/len(test_loader):.4f}")

# Save a test sample to listen to the generated audio
sample_text, mel_spectrogram, _ = test_dataset[0]  # Pick a test sample
# sample_text is already a tensor; clone().detach() copies it without the UserWarning
# that torch.tensor(<tensor>) emits. unsqueeze(0) adds the batch axis.
sample_text = sample_text.clone().detach().unsqueeze(0).to(device)
# Ground-truth spectrogram of the sample; not used further in this block.
mel_spectrogram = mel_spectrogram.squeeze(1).to(device)

# Generate mel spectrogram using the trained model
model.eval()
with torch.no_grad():
    output_mel = model(sample_text)

# Convert Mel spectrogram back to audio (using Griffin-Lim algorithm)
#output_mel = output_mel.squeeze().cpu().numpy()  # Convert to CPU and NumPy for Griffin-Lim
#output_audio = librosa.feature.inverse.mel_to_audio(output_mel)

# Rescale audio to the range [-1, 1] (if necessary)
#output_audio = np.clip(output_audio, -1.0, 1.0)

# Save the audio to a file
#sf.write('generated_audio.wav', output_audio, 22050)  # 22050 Hz sample rate

# Play audio (if in Jupyter environment)
#import IPython.display as ipd
#ipd.Audio('generated_audio.wav')


tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/423 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

Epoch [1/5], Train Loss: 0.0344
Epoch [2/5], Train Loss: 0.0120
Epoch [3/5], Train Loss: 0.0119
Epoch [4/5], Train Loss: 0.0118
Epoch [5/5], Train Loss: 0.0117
Final Test Loss: 0.0115


  sample_text = torch.tensor(sample_text).unsqueeze(0).to(device)
  mel_basis = filters.mel(


LibsndfileError: Error opening 'generated_audio.wav': Format not recognised.

In [None]:
import numpy as np
import librosa
import soundfile as sf
import IPython.display as ipd

def save_and_play_audio(output_mel, sr=22050, out_path='generated_audio.wav'):
    """Invert a mel spectrogram to audio, save it as a WAV file and return a player.

    Args:
        output_mel: Mel spectrogram as a NumPy array or a torch tensor (on any
            device). After singleton axes are squeezed it must be 2-D, or 3-D
            with a trailing axis that is averaged away.
        sr: Sample rate used for the inversion and for the written file.
        out_path: Destination of the WAV file.

    Returns:
        An ``IPython.display.Audio`` object for ``out_path``, or ``None`` when
        saving or creating the player failed (the error is printed).

    Raises:
        ValueError: If the spectrogram is not 2-D after squeezing / averaging.
    """
    # librosa works on NumPy arrays: move torch tensors off the graph and the GPU first.
    if hasattr(output_mel, "detach"):
        output_mel = output_mel.detach().cpu().numpy()
    output_mel = np.squeeze(np.asarray(output_mel))

    # Handle different channel configurations
    if output_mel.ndim == 3:
        output_mel = output_mel.mean(axis=-1)  # Average across the trailing axis

    # Fail early with a clear message instead of an IndexError on shape[1].
    if output_mel.ndim != 2:
        raise ValueError(
            f"expected a 2-D mel spectrogram after squeezing, got shape {output_mel.shape}"
        )

    # Heuristic orientation fix: treat the shorter axis as n_mels -> [n_mels, time].
    if output_mel.shape[0] > output_mel.shape[1]:
        output_mel = output_mel.T

    # Convert Mel to audio
    output_audio = librosa.feature.inverse.mel_to_audio(
        output_mel,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_iter=100
    )

    # Normalize to a 0.95 peak, clip to [-1, 1] and store as float32.
    output_audio = librosa.util.normalize(output_audio) * 0.95
    output_audio = np.clip(output_audio, -1.0, 1.0)
    output_audio = output_audio.squeeze().astype(np.float32)

    # Save with explicit format specifications
    try:
        sf.write(
            out_path,
            output_audio,
            sr,
            subtype='PCM_24'  # Explicitly specify the format subtype
        )
    except Exception as e:
        # Best-effort: report diagnostics instead of aborting the notebook run.
        print(f"Error saving audio: {str(e)}")
        print("Audio shape:", output_audio.shape)
        print("Audio dtype:", output_audio.dtype)
        print("Audio range:", output_audio.min(), output_audio.max())
        return None

    # Verify and play
    try:
        print("Successfully saved audio file")
        return ipd.Audio(out_path)
    except Exception as e:
        print(f"Error playing audio: {str(e)}")
        return None

# Usage example: convert output_mel (the spectrogram predicted for the test sample
# by the CNN + GRU cell above) to audio. As the last expression of the cell, the
# returned value is what the notebook displays.
save_and_play_audio(output_mel)