# Import Libraries

In [1]:
import torch
import torch.nn as nn
import torchaudio
import numpy as np
import os
from sklearn.model_selection import train_test_split
#from google.colab import drive

# Google Drive and Device Configurations

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#drive.mount('/content/drive')

# Hyperparameters

In [3]:
input_dim = 120  # 40 MFCC + 40 delta + 40 delta-delta
hidden_dim = 256
output_dim = 40  # Original MFCC dimension
num_layers = 2
num_epochs = 100
batch_size = 32
learning_rate = 0.001

# RVC Model

In [4]:
class RVCModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(RVCModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x, lengths):
        packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed)
        output, _ = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        out = self.linear(output)
        return out

# Data Preparation

In [5]:
def extract_features(wav_file):
    waveform, sample_rate = torchaudio.load(wav_file)
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=40)(waveform)
    delta = torchaudio.functional.compute_deltas(mfcc)
    delta2 = torchaudio.functional.compute_deltas(delta)
    features = torch.cat([mfcc, delta, delta2], dim=1)
    return features.squeeze(0).T.numpy()  # (time, features)

def prepare_data(audio_folder):
    features_list = []
    lengths = []
    for file in os.listdir(audio_folder):
        if file.endswith('.wav'):
            wav_path = os.path.join(audio_folder, file)
            features = extract_features(wav_path)
            features_list.append(features)
            lengths.append(features.shape[0])

    # Pad sequences to the same length
    max_len = max(lengths)
    padded_features = [np.pad(f, ((0, max_len - f.shape[0]), (0, 0)), mode='constant') for f in features_list]

    # Convert to PyTorch tensors
    features_tensor = torch.tensor(np.array(padded_features), dtype=torch.float32)
    lengths_tensor = torch.tensor(lengths, dtype=torch.long)
    return features_tensor, lengths_tensor

# Training the RVC Model

In [6]:
def train_model(source_features, source_lengths, target_features, target_lengths):
    # Split data into train and test sets
    train_source, test_source, train_target, test_target, train_source_lengths, test_source_lengths = train_test_split(
        source_features, target_features, source_lengths, test_size=0.2, random_state=42)

    model = RVCModel(input_dim, hidden_dim, output_dim, num_layers).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.MSELoss()

    n_batches = len(train_source) // batch_size

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size

            batch_source = train_source[start_idx:end_idx].to(device)
            batch_target = train_target[start_idx:end_idx, :, :output_dim].to(device)  # Only use the first 40 features
            batch_lengths = train_source_lengths[start_idx:end_idx]

            optimizer.zero_grad()
            output = model(batch_source, batch_lengths)

            # Ensure output and target have the same time steps
            min_time_steps = min(output.size(1), batch_target.size(1))
            output = output[:, :min_time_steps, :]
            batch_target = batch_target[:, :min_time_steps, :]

            loss = loss_fn(output, batch_target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / n_batches
        if epoch % 10 == 0:
            print(f'Epoch: {epoch}, Average Loss: {avg_loss:.4f}')

        # Evaluate on test set
        model.eval()
        with torch.no_grad():
            test_output = model(test_source.to(device), test_source_lengths)
            test_target = test_target[:, :, :output_dim].to(device)  # Only use the first 40 features
            min_time_steps = min(test_output.size(1), test_target.size(1))
            test_output = test_output[:, :min_time_steps, :]
            test_target = test_target[:, :min_time_steps, :]
            test_loss = loss_fn(test_output, test_target)
            print(f'Test Loss: {test_loss.item():.4f}')

    return model

# Voice conversion

In [7]:
def convert_voice(model, source_features, source_lengths):
    model.eval()
    with torch.no_grad():
        converted_features = model(source_features.to(device), source_lengths)
    return converted_features

# Data Preparation

In [8]:
# Main execution
source_folder = 'C:\\Users\\User\\Downloads\\obama_dataset\\source_data'
target_folder = 'C:\\Users\\User\\Downloads\\obama_dataset\\target_data'
# output_folder = 'path/to/output_folder'

# Prepare data
source_features, source_lengths = prepare_data(source_folder)
target_features, target_lengths = prepare_data(target_folder)

print("Source features shape:", source_features.shape)
print("Target features shape:", target_features.shape)
print("Source lengths shape:", source_lengths.shape)
print("Target lengths shape:", target_lengths.shape)



Source features shape: torch.Size([137, 5289, 120])
Target features shape: torch.Size([137, 4734, 120])
Source lengths shape: torch.Size([137])
Target lengths shape: torch.Size([137])


# Train the Model

In [9]:
# Train the model
rvc_model = train_model(source_features, source_lengths, target_features, target_lengths)

drive_save_path = 'C:\\Users\\User\\Downloads\\obama_dataset\\rvc_model.pth'

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(drive_save_path), exist_ok=True)

# Save the model
torch.save(rvc_model.state_dict(), drive_save_path)

# Convert voice (example)
sample_source = source_features[0].unsqueeze(0)  # Add batch dimension
sample_length = source_lengths[0].unsqueeze(0)
converted_features = convert_voice(rvc_model, sample_source, sample_length)

print("Voice conversion completed.")

Epoch: 0, Average Loss: 344.5980
Test Loss: 374.4282
Test Loss: 368.6375
Test Loss: 363.9091
Test Loss: 360.2078
Test Loss: 357.2200
Test Loss: 354.7082
Test Loss: 352.4995
Test Loss: 350.4926
Test Loss: 348.6634
Test Loss: 346.9780
Epoch: 10, Average Loss: 315.9451
Test Loss: 345.4282
Test Loss: 343.9987
Test Loss: 342.6696
Test Loss: 341.4384
Test Loss: 340.2976
Test Loss: 339.2351
Test Loss: 338.2326
Test Loss: 337.2787
Test Loss: 336.3721
Test Loss: 335.5155
Epoch: 20, Average Loss: 305.6469
Test Loss: 334.7023
Test Loss: 333.9244
Test Loss: 333.1805
Test Loss: 332.4727
Test Loss: 331.7946
Test Loss: 331.1429
Test Loss: 330.5148
Test Loss: 329.9167
Test Loss: 329.3419
Test Loss: 328.7873
Epoch: 30, Average Loss: 299.4725
Test Loss: 328.2542
Test Loss: 327.7427
Test Loss: 327.2535
Test Loss: 326.7849
Test Loss: 326.3374
Test Loss: 325.9100
Test Loss: 325.5022
Test Loss: 325.1134
Test Loss: 324.7434
Test Loss: 324.3911
Epoch: 40, Average Loss: 295.6133
Test Loss: 324.0556
Test Loss: 

# Convert Voice

In [None]:
audio_file = "C:\\Users\\User\\Downloads\\audio1.wav\\"

