In [1]:
import torch

# Download pre-trained Tacotron 2 weights
tacotron2_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
tacotron2_model.eval()

# Save the pre-trained weights
torch.save(tacotron2_model.state_dict(), 'pretrained_tacotron2.pth')

Using cache found in C:\Users\vpved/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub


In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import librosa
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Your provided directory structure and hyperparameters
BASE_DIR = "./"
DATA_DIR = os.path.join(BASE_DIR, 'emov-DB')
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, 'processed_data')
MODEL_DIR = os.path.join(BASE_DIR, 'saved_models')

EMOTIONS = ['Neutral', 'Sleepy', 'Angry', 'Disgusted', 'Amused']
EMOTION_FILE_COUNTS = {
    'Neutral': 373,
    'Sleepy': 520,
    'Angry': 317,
    'Disgusted': 347,
    'Amused': 309
}

HPARAMS = {
    'training_files': os.path.join(BASE_DIR, 'metadata.csv'),
    'val_files': os.path.join(BASE_DIR, 'metadata.csv'),  # Use a separate validation set in practice
    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 1e-4,
    'sampling_rate': 22050,
    'filter_length': 1024,
    'hop_length': 256,
    'win_length': 1024,
    'n_mel_channels': 80,
    'mel_fmin': 0.0,
    'mel_fmax': 8000.0,
    'EMOTIONS': EMOTIONS
}

logger.info("Hyperparameters and directories set up")

2024-06-26 11:19:53,380 - INFO - Hyperparameters and directories set up


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import librosa
import logging

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

logger.info("Script started")

#  directory structure and hyperparameters
BASE_DIR = "./"
DATA_DIR = os.path.join(BASE_DIR, 'emov-DB')
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, 'processed_data')
MODEL_DIR = os.path.join(BASE_DIR, 'saved_models')

EMOTIONS = ['Neutral', 'Sleepy', 'Angry', 'Disgusted', 'Amused']
EMOTION_FILE_COUNTS = {
    'Neutral': 373,
    'Sleepy': 520,
    'Angry': 317,
    'Disgusted': 347,
    'Amused': 309
}

HPARAMS = {
    'training_files': os.path.join(BASE_DIR, 'metadata.csv'),
    'val_files': os.path.join(BASE_DIR, 'metadata.csv'),  # Use a separate validation set in practice
    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 1e-4,
    'sampling_rate': 22050,
    'filter_length': 1024,
    'hop_length': 256,
    'win_length': 1024,
    'n_mel_channels': 80,
    'mel_fmin': 0.0,
    'mel_fmax': 8000.0,
    'EMOTIONS': EMOTIONS
}

logger.info(f"Hyperparameters: {HPARAMS}")
logger.info(f"Directories: BASE_DIR={BASE_DIR}, DATA_DIR={DATA_DIR}, PROCESSED_DATA_DIR={PROCESSED_DATA_DIR}, MODEL_DIR={MODEL_DIR}")

# Custom dataset class
class EmotionalSpeechDataset(Dataset):
    def __init__(self, csv_file, root_dir):
        logger.info(f"Initializing EmotionalSpeechDataset with csv_file={csv_file}, root_dir={root_dir}")
        self.metadata = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.emotion_to_id = {emotion: i for i, emotion in enumerate(EMOTIONS)}
        logger.info(f"Dataset initialized with {len(self.metadata)} samples")
        logger.debug(f"Emotion to ID mapping: {self.emotion_to_id}")

    def __len__(self):
        return len(self.metadata)

    def __getitem__(self, idx):
        logger.debug(f"Getting item {idx}")
        audio_path = os.path.join(self.root_dir, self.metadata.iloc[idx, 0])
        text = self.metadata.iloc[idx, 1]
        emotion = self.metadata.iloc[idx, 2]

        logger.debug(f"Loading sample {idx}: audio_path={audio_path}, text={text}, emotion={emotion}")

        try:
            # Load and preprocess audio
            audio, _ = librosa.load(audio_path, sr=HPARAMS['sampling_rate'])
            logger.debug(f"Audio loaded: shape={audio.shape}, min={audio.min()}, max={audio.max()}")
            
            mel = librosa.feature.melspectrogram(
                y=audio,
                sr=HPARAMS['sampling_rate'],
                n_mels=HPARAMS['n_mel_channels'],
                fmin=HPARAMS['mel_fmin'],
                fmax=HPARAMS['mel_fmax']
            )
            mel = librosa.power_to_db(mel, ref=np.max)
            logger.debug(f"Mel spectrogram computed: shape={mel.shape}, min={mel.min()}, max={mel.max()}")

            return {
                'text': torch.LongTensor(self.text_to_sequence(text)),
                'mel': torch.FloatTensor(mel),
                'emotion': torch.LongTensor([self.emotion_to_id[emotion]])
            }
        except Exception as e:
            logger.error(f"Error processing item {idx}: {str(e)}")
            raise

    def text_to_sequence(self, text):
        # Implement text to sequence conversion here
        # This is a placeholder implementation
        sequence = [ord(c) for c in text.lower()]
        logger.debug(f"Text to sequence: '{text}' -> {sequence}")
        return sequence

logger.info("EmotionalSpeechDataset class defined")

# Modified Tacotron2 model with emotion embedding
class EmotionalTacotron2(nn.Module):
    def __init__(self, tacotron2_model, num_emotions, emotion_embed_dim):
        super(EmotionalTacotron2, self).__init__()
        logger.info(f"Initializing EmotionalTacotron2 with num_emotions={num_emotions}, emotion_embed_dim={emotion_embed_dim}")
        self.tacotron2 = tacotron2_model
        self.emotion_embedding = nn.Embedding(num_emotions, emotion_embed_dim)
        
        # Modify the encoder to accept emotion embeddings
        old_embed_dim = self.tacotron2.embedding.embedding_dim
        self.new_embedding = nn.Embedding(
            self.tacotron2.embedding.num_embeddings,
            old_embed_dim + emotion_embed_dim
        )
        logger.info(f"New embedding layer created with input dim={self.tacotron2.embedding.num_embeddings} and output dim={old_embed_dim + emotion_embed_dim}")

    def forward(self, text, emotion):
        logger.debug(f"EmotionalTacotron2 forward pass: text shape={text.shape}, emotion shape={emotion.shape}")
        emotion_embed = self.emotion_embedding(emotion)
        text_embed = self.new_embedding(text)
        
        # Combine text and emotion embeddings
        combined_embed = torch.cat((text_embed, emotion_embed.unsqueeze(1).expand(-1, text_embed.size(1), -1)), dim=-1)
        logger.debug(f"Combined embedding shape: {combined_embed.shape}")
        
        # Pass through the encoder
        encoder_outputs = self.tacotron2.encoder(combined_embed)
        logger.debug(f"Encoder output shape: {encoder_outputs.shape}")
        
        # Pass through the decoder
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = self.tacotron2.decoder(
            encoder_outputs,
            memory=None,
            processed_memory=None,
            attention_weights=None
        )
        logger.debug(f"Decoder output shapes: mel={mel_outputs.shape}, mel_postnet={mel_outputs_postnet.shape}, gate={gate_outputs.shape}, alignments={alignments.shape}")
        
        return mel_outputs, mel_outputs_postnet, gate_outputs, alignments

logger.info("EmotionalTacotron2 class defined")

# Load the pre-trained Tacotron2 model
logger.info("Loading pre-trained Tacotron2 model")
tacotron2_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
tacotron2_model.eval()
logger.info("Pre-trained Tacotron2 model loaded")

# Create the emotional Tacotron2 model
emotional_tacotron2 = EmotionalTacotron2(tacotron2_model, num_emotions=len(EMOTIONS), emotion_embed_dim=64)
logger.info("EmotionalTacotron2 model created")

# Prepare dataset and dataloader
logger.info("Preparing dataset and dataloader")
dataset = EmotionalSpeechDataset(HPARAMS['training_files'], DATA_DIR)
dataloader = DataLoader(dataset, batch_size=HPARAMS['batch_size'], shuffle=True, num_workers=4)
logger.info(f"Dataset and dataloader prepared with {len(dataset)} samples")

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(emotional_tacotron2.parameters(), lr=HPARAMS['learning_rate'])
logger.info("Loss function and optimizer set up")

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")
emotional_tacotron2.to(device)

try:
    for epoch in range(HPARAMS['epochs']):
        logger.info(f"Starting epoch {epoch+1}/{HPARAMS['epochs']}")
        emotional_tacotron2.train()
        total_loss = 0

        for batch_idx, batch in enumerate(dataloader):
            logger.debug(f"Processing batch {batch_idx+1}/{len(dataloader)}")
            
            try:
                optimizer.zero_grad()

                logger.debug("Moving batch to device")
                text = batch['text'].to(device)
                mel_target = batch['mel'].to(device)
                emotion = batch['emotion'].to(device)
                logger.debug(f"Batch shapes: text={text.shape}, mel_target={mel_target.shape}, emotion={emotion.shape}")

                logger.debug("Running forward pass")
                mel_output, mel_output_postnet, _, _ = emotional_tacotron2(text, emotion)

                logger.debug("Calculating loss")
                loss = criterion(mel_output, mel_target) + criterion(mel_output_postnet, mel_target)
                logger.debug(f"Loss: {loss.item()}")
                
                logger.debug("Running backward pass")
                loss.backward()
                
                logger.debug("Optimizer step")
                optimizer.step()

                total_loss += loss.item()

                if batch_idx % 10 == 0:
                    logger.info(f"Epoch {epoch+1}, Batch {batch_idx+1}/{len(dataloader)}, Loss: {loss.item():.4f}")

            except Exception as e:
                logger.error(f"Error processing batch {batch_idx+1}: {str(e)}")
                raise

        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch+1}/{HPARAMS['epochs']}, Average Loss: {avg_loss:.4f}")

        # Save checkpoint
        if (epoch + 1) % 5 == 0:
            checkpoint_path = os.path.join(MODEL_DIR, f'emotional_tacotron2_checkpoint_epoch_{epoch+1}.pth')
            torch.save({
                'epoch': epoch,
                'model_state_dict': emotional_tacotron2.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': avg_loss,
            }, checkpoint_path)
            logger.info(f"Checkpoint saved: {checkpoint_path}")

except Exception as e:
    logger.error(f"An error occurred during training: {str(e)}")
    raise

# Save the final model
final_model_path = os.path.join(MODEL_DIR, 'emotional_tacotron2_final.pth')
torch.save(emotional_tacotron2.state_dict(), final_model_path)
logger.info(f"Final model saved: {final_model_path}")

logger.info("Training completed")

2024-06-26 11:36:59,068 - INFO - Script started
2024-06-26 11:36:59,075 - INFO - Hyperparameters: {'training_files': './metadata.csv', 'val_files': './metadata.csv', 'epochs': 10, 'batch_size': 32, 'learning_rate': 0.0001, 'sampling_rate': 22050, 'filter_length': 1024, 'hop_length': 256, 'win_length': 1024, 'n_mel_channels': 80, 'mel_fmin': 0.0, 'mel_fmax': 8000.0, 'EMOTIONS': ['Neutral', 'Sleepy', 'Angry', 'Disgusted', 'Amused']}
2024-06-26 11:36:59,079 - INFO - Directories: BASE_DIR=./, DATA_DIR=./emov-DB, PROCESSED_DATA_DIR=./processed_data, MODEL_DIR=./saved_models
2024-06-26 11:36:59,084 - INFO - EmotionalSpeechDataset class defined
2024-06-26 11:36:59,088 - INFO - EmotionalTacotron2 class defined
2024-06-26 11:36:59,088 - INFO - Loading pre-trained Tacotron2 model
Using cache found in C:\Users\vpved/.cache\torch\hub\NVIDIA_DeepLearningExamples_torchhub
2024-06-26 11:37:01,620 - DEBUG - bytecode dump:
>          0	NOP(arg=None, lineno=1039)
           2	RESUME(arg=0, lineno=1039)


2024-06-26 11:37:01,725 - DEBUG - bytecode dump:
>          0	NOP(arg=None, lineno=1045)
           2	RESUME(arg=0, lineno=1045)
           4	LOAD_FAST(arg=0, lineno=1048)
           6	LOAD_CONST(arg=1, lineno=1048)
           8	BINARY_SUBSCR(arg=None, lineno=1048)
          12	LOAD_FAST(arg=0, lineno=1048)
          14	LOAD_CONST(arg=2, lineno=1048)
          16	BINARY_SUBSCR(arg=None, lineno=1048)
          20	COMPARE_OP(arg=2, lineno=1048)
          24	LOAD_FAST(arg=0, lineno=1048)
          26	LOAD_CONST(arg=1, lineno=1048)
          28	BINARY_SUBSCR(arg=None, lineno=1048)
          32	LOAD_FAST(arg=0, lineno=1048)
          34	LOAD_CONST(arg=3, lineno=1048)
          36	BINARY_SUBSCR(arg=None, lineno=1048)
          40	COMPARE_OP(arg=26, lineno=1048)
          44	BINARY_OP(arg=1, lineno=1048)
          48	RETURN_VALUE(arg=None, lineno=1048)
2024-06-26 11:37:01,727 - DEBUG - pending: deque([State(pc_initial=0 nstack_initial=0)])
2024-06-26 11:37:01,727 - DEBUG - stack: []
2024-06-2