<a href="https://colab.research.google.com/github/Kojojoko/Comic/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import librosa

# Folder that holds the extracted .wav recordings.
dataset_path = 'wav_dataset/vijaydata'

# Every clip is loaded with a 16 kHz target rate and kept in memory
# as a (samples, sample_rate) pair.
audio_data = []
for filename in os.listdir(dataset_path):
    if not filename.endswith('.wav'):
        continue  # ignore anything that is not a .wav file
    filepath = os.path.join(dataset_path, filename)
    waveform, sr = librosa.load(filepath, sr=16000)
    audio_data.append((waveform, sr))

print(f'Loaded {len(audio_data)} audio files')


Loaded 280 audio files


In [2]:
import torch.nn.functional as F
from torch.utils.data import Dataset
import librosa
import os
import torch

class AudioDataset(Dataset):
    """Fixed-length waveforms read from the ``.wav`` files of one folder.

    Args:
        folder_path: Directory that directly contains the ``.wav`` files.
        fixed_length: Number of samples every returned waveform has. Shorter
            clips are zero-padded at the end, longer clips are truncated.
            Defaults to 5 seconds at 16 kHz.
        sample_rate: Target rate (Hz) passed to ``librosa.load``. Defaults to
            16000, the value that used to be hard-coded.
    """

    def __init__(self, folder_path, fixed_length=16000*5, sample_rate=16000):
        self.folder_path = folder_path
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping stable between runs and machines.
        self.files = sorted(f for f in os.listdir(folder_path) if f.endswith('.wav'))
        self.fixed_length = fixed_length
        self.sample_rate = sample_rate

    def __len__(self):
        """Number of ``.wav`` files found in the folder."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return it as a tensor of ``fixed_length`` samples."""
        file_path = os.path.join(self.folder_path, self.files[idx])
        waveform, _ = librosa.load(file_path, sr=self.sample_rate)
        waveform = torch.tensor(waveform)

        # Pad (zeros at the end) or truncate so every item has the same length
        # and the default DataLoader collation can stack them.
        missing = self.fixed_length - len(waveform)
        if missing > 0:
            waveform = F.pad(waveform, (0, missing))
        else:
            waveform = waveform[:self.fixed_length]

        return waveform


In [3]:
import torch
import torch.nn as nn

class SimpleAudioAutoencoder(nn.Module):
    """1-D convolutional autoencoder operating directly on raw waveforms.

    The encoder applies three stride-2 convolutions (1 -> 16 -> 32 -> 64
    channels); the decoder mirrors it with three stride-2 transposed
    convolutions and ends in ``tanh`` so the output lies in [-1, 1].

    Input and output are ``[batch, 1, length]``. The output always has the
    same length as the input, for any ``length >= 1``.
    """

    def __init__(self):
        super().__init__()
        # Encoder: each layer roughly halves the time axis (ceil(L / 2)).
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=15, stride=2, padding=7),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=15, stride=2, padding=7),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=15, stride=2, padding=7),
            nn.ReLU(),
        )
        # Decoder: each layer exactly doubles the time axis.
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(64, 32, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(32, 16, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(16, 1, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.Tanh(),  # Output normalized waveform [-1,1]
        )

    def forward(self, x):
        """Reconstruct ``x`` (``[batch, 1, length]``) and return a tensor of the same shape."""
        input_length = x.size(-1)
        x = self.encoder(x)
        x = self.decoder(x)
        # When length is not a multiple of 8 the encoder rounds up at each
        # stage, so the decoder emits a few extra trailing samples. Crop them
        # so the output can be compared element-wise with the input.
        return x[..., :input_length]


In [4]:
from torch.utils.data import DataLoader


# Five-second clips (5 * 16000 samples), served in shuffled mini-batches of four.
dataset = AudioDataset(folder_path='wav_dataset/vijaydata', fixed_length=5 * 16000)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=4)


In [5]:
import torch.optim as optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleAudioAutoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50

# The autoencoder gets its own checkpoint file. 'your_model.pth' is also
# written by the voice-conversion training loop and loaded into
# VoiceConversionModelWithSkip at inference time; sharing the name would let
# this loop overwrite it with an incompatible state dict.
autoencoder_checkpoint = 'simple_autoencoder.pth'

model.train()  # make the training mode explicit
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in dataloader:
        batch = batch.unsqueeze(1).to(device)  # [batch, 1, length]
        optimizer.zero_grad()
        outputs = model(batch)
        # Plain reconstruction objective: the target is the input itself.
        loss = criterion(outputs, batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Mean per-batch loss for the epoch, then checkpoint.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader):.4f}")
    torch.save(model.state_dict(), autoencoder_checkpoint)
    print("Model saved!")


Epoch 1/50, Loss: 0.0008
Model saved!
Epoch 2/50, Loss: 0.0003
Model saved!
Epoch 3/50, Loss: 0.0002
Model saved!
Epoch 4/50, Loss: 0.0001
Model saved!
Epoch 5/50, Loss: 0.0001
Model saved!
Epoch 6/50, Loss: 0.0001
Model saved!
Epoch 7/50, Loss: 0.0001
Model saved!
Epoch 8/50, Loss: 0.0001
Model saved!
Epoch 9/50, Loss: 0.0000
Model saved!
Epoch 10/50, Loss: 0.0001
Model saved!
Epoch 11/50, Loss: 0.0000
Model saved!
Epoch 12/50, Loss: 0.0000
Model saved!
Epoch 13/50, Loss: 0.0000
Model saved!
Epoch 14/50, Loss: 0.0000
Model saved!
Epoch 15/50, Loss: 0.0000
Model saved!
Epoch 16/50, Loss: 0.0000
Model saved!
Epoch 17/50, Loss: 0.0000
Model saved!
Epoch 18/50, Loss: 0.0000
Model saved!
Epoch 19/50, Loss: 0.0000
Model saved!
Epoch 20/50, Loss: 0.0000
Model saved!
Epoch 21/50, Loss: 0.0000
Model saved!
Epoch 22/50, Loss: 0.0000
Model saved!
Epoch 23/50, Loss: 0.0000
Model saved!
Epoch 24/50, Loss: 0.0000
Model saved!
Epoch 25/50, Loss: 0.0000
Model saved!
Epoch 26/50, Loss: 0.0000
Model sa

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VoiceConversionModelWithSkip(nn.Module):
    """Waveform encoder/decoder with skip connections and a speaker embedding.

    Three stride-2 convolutions encode the waveform, a 1x1 convolution forms
    the bottleneck, and a learned speaker embedding is broadcast along time
    and concatenated to it. Three stride-2 transposed convolutions decode,
    each receiving the matching encoder activation as a skip connection.

    Args:
        num_speakers: Number of rows in the speaker embedding table.
        embedding_dim: Size of each speaker embedding vector.
        bottleneck_dim: Number of channels produced by the bottleneck conv.

    NOTE(review): ``e3`` is concatenated unchanged next to the bottleneck, so
    the decoder may be able to reconstruct the input without relying on the
    speaker embedding -- TODO confirm this is intended for voice conversion.
    """

    # Three stride-2 stages: the time axis must be a multiple of 2**3 for the
    # decoder activations to line up with the encoder skips.
    _LENGTH_MULTIPLE = 8

    def __init__(self, num_speakers, embedding_dim, bottleneck_dim):
        super().__init__()
        # Encoder layers
        self.enc1 = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=15, stride=2, padding=7),
            nn.ReLU()
        )
        self.enc2 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=15, stride=2, padding=7),
            nn.ReLU()
        )
        self.enc3 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=15, stride=2, padding=7),
            nn.ReLU()
        )

        self.bottleneck = nn.Conv1d(128, bottleneck_dim, kernel_size=1)

        self.speaker_embedding = nn.Embedding(num_speakers, embedding_dim)

        # Decoder layers (note input channels are bigger due to skip concat)
        self.dec1 = nn.Sequential(
            nn.ConvTranspose1d(bottleneck_dim + embedding_dim + 128, 128, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.ReLU()
        )
        self.dec2 = nn.Sequential(
            nn.ConvTranspose1d(128 + 64, 64, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.ReLU()
        )
        self.dec3 = nn.Sequential(
            nn.ConvTranspose1d(64 + 32, 32, kernel_size=15, stride=2, padding=7, output_padding=1),
            nn.ReLU()
        )

        self.final_conv = nn.Conv1d(32, 1, kernel_size=1)

    def forward(self, x, speaker_id):
        """Run the model.

        Args:
            x: Waveform batch of shape ``[batch, 1, length]``.
            speaker_id: Integer tensor of shape ``[batch]`` indexing the
                speaker embedding table.

        Returns:
            Tensor of shape ``[batch, 1, length]`` (same length as ``x``).
        """
        input_length = x.size(-1)
        # Right-pad with zeros to a multiple of 8; otherwise the encoder's
        # rounding makes the skip tensors one sample shorter than the decoder
        # activations and torch.cat fails. A no-op for aligned lengths.
        pad = (-input_length) % self._LENGTH_MULTIPLE
        if pad:
            x = F.pad(x, (0, pad))

        # Encoder
        e1 = self.enc1(x)    # [batch, 32, L/2]
        e2 = self.enc2(e1)   # [batch, 64, L/4]
        e3 = self.enc3(e2)   # [batch, 128, L/8]

        bottleneck = self.bottleneck(e3)  # [batch, bottleneck_dim, L/8]

        spk_embed = self.speaker_embedding(speaker_id)  # [batch, embedding_dim]
        # Repeat the speaker vector at every time step of the bottleneck.
        spk_embed = spk_embed.unsqueeze(2).expand(-1, -1, bottleneck.size(2))  # [batch, embedding_dim, L/8]

        combined = torch.cat([bottleneck, spk_embed, e3], dim=1)  # concat skip from e3

        d1 = self.dec1(combined)   # [batch, 128, L/4]
        d1 = torch.cat([d1, e2], dim=1)  # concat skip from e2

        d2 = self.dec2(d1)         # [batch, 64, L/2]
        d2 = torch.cat([d2, e1], dim=1)  # concat skip from e1

        d3 = self.dec3(d2)         # [batch, 32, L]

        out = self.final_conv(d3)  # [batch, 1, L]
        # Drop the samples that correspond to the alignment padding.
        return out[..., :input_length]


In [7]:
import torch.optim as optim
import torch.nn as nn

# Assume device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Two-speaker model: 16-dimensional speaker embedding, 128-channel bottleneck.
model = VoiceConversionModelWithSkip(num_speakers=2, embedding_dim=16, bottleneck_dim=128).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()
num_epochs = 50

for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_waveform in dataloader:
        # Add the channel axis: [batch, length] -> [batch, 1, length].
        batch_waveform = batch_waveform.unsqueeze(1).to(device)
        batch_size = batch_waveform.shape[0]

        # Placeholder labels: each item gets a random speaker id (0 or 1).
        # Replace with the real speaker ids of the recordings!
        speaker_ids = torch.randint(low=0, high=2, size=(batch_size,), device=device)

        optimizer.zero_grad()
        outputs = model(batch_waveform, speaker_ids)
        # Reconstruction objective: the target is the input waveform.
        loss = criterion(outputs, batch_waveform)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean per-batch loss, then checkpoint after every epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader):.6f}")
    torch.save(model.state_dict(), 'your_model.pth')
    print("Model saved!")



Epoch 1/50, Loss: 0.000832
Model saved!
Epoch 2/50, Loss: 0.000076
Model saved!
Epoch 3/50, Loss: 0.000019
Model saved!
Epoch 4/50, Loss: 0.000010
Model saved!
Epoch 5/50, Loss: 0.000007
Model saved!
Epoch 6/50, Loss: 0.000006
Model saved!
Epoch 7/50, Loss: 0.000005
Model saved!
Epoch 8/50, Loss: 0.000006
Model saved!
Epoch 9/50, Loss: 0.000003
Model saved!
Epoch 10/50, Loss: 0.000003
Model saved!
Epoch 11/50, Loss: 0.000003
Model saved!
Epoch 12/50, Loss: 0.000003
Model saved!
Epoch 13/50, Loss: 0.000003
Model saved!
Epoch 14/50, Loss: 0.000002
Model saved!
Epoch 15/50, Loss: 0.000005
Model saved!
Epoch 16/50, Loss: 0.000002
Model saved!
Epoch 17/50, Loss: 0.000002
Model saved!
Epoch 18/50, Loss: 0.000002
Model saved!
Epoch 19/50, Loss: 0.000002
Model saved!
Epoch 20/50, Loss: 0.000004
Model saved!
Epoch 21/50, Loss: 0.000002
Model saved!
Epoch 22/50, Loss: 0.000002
Model saved!
Epoch 23/50, Loss: 0.000002
Model saved!
Epoch 24/50, Loss: 0.000002
Model saved!
Epoch 25/50, Loss: 0.0000

In [8]:
import torch
import torch.nn.functional as F

def smooth_audio(waveform, kernel_size=7):
    """
    Apply a simple moving average low-pass filter along the last axis.

    waveform: Tensor of shape [length], [batch, length] or [batch, 1, length]
    kernel_size: window size for smoothing (positive integer, odd or even)

    Returns the smoothed waveform with exactly the same shape and dtype as
    the input. Samples beyond the edges are treated as zeros, so the first
    and last few output samples are attenuated.

    Raises ValueError if kernel_size is smaller than 1.
    """
    if kernel_size < 1:
        raise ValueError("kernel_size must be a positive integer")

    original_shape = waveform.shape
    if waveform.dim() == 1:
        signal = waveform.unsqueeze(0).unsqueeze(0)  # [1,1,length]
    elif waveform.dim() == 2:
        signal = waveform.unsqueeze(1)  # [batch,1,length]
    else:
        signal = waveform

    # Build the kernel in the signal's dtype so float64 input is accepted too.
    kernel = torch.ones(1, 1, kernel_size, device=signal.device, dtype=signal.dtype) / kernel_size

    # Explicit (possibly asymmetric) padding keeps the output length equal to
    # the input length for even kernel sizes as well; for odd sizes this is
    # identical to symmetric padding of kernel_size // 2.
    left = (kernel_size - 1) // 2
    right = kernel_size // 2
    smoothed = F.conv1d(F.pad(signal, (left, right)), kernel)

    # Restore the caller's shape instead of squeeze(), which would also drop
    # a batch or channel axis of size one.
    return smoothed.reshape(original_shape)


In [9]:
def normalize_audio(waveform):
    """Scale ``waveform`` so that its largest absolute sample equals 1.

    An empty tensor or an all-zero tensor is returned unchanged. The input
    tensor is never modified; a new tensor is returned when scaling happens.
    """
    # max() of an empty tensor raises, so handle that case up front.
    if waveform.numel() == 0:
        return waveform
    max_val = waveform.abs().max()
    if max_val > 0:
        waveform = waveform / max_val
    return waveform


In [10]:
def remove_spikes(waveform, threshold=0.3):
    """Replace samples that follow a large jump by the mean of their neighbours.

    waveform: 1D tensor. It is modified IN PLACE and also returned.
    threshold: minimum absolute difference between consecutive samples for
        the later sample to be treated as a spike.

    Only interior samples (index 1 .. length-2) are repaired, because both
    neighbours are needed. Repairs run left to right, so a repaired sample is
    used when the following sample is repaired.
    """
    diff = torch.abs(waveform[1:] - waveform[:-1])
    # diff[i] compares samples i and i+1; +1 points at the later sample.
    spikes = (diff > threshold).nonzero(as_tuple=True)[0] + 1
    last = waveform.size(0) - 1
    # tolist() yields plain ints, avoiding per-element tensor comparisons.
    for idx in spikes.tolist():
        # Index 1 is a valid interior sample (its left neighbour is index 0).
        if 0 < idx < last:
            waveform[idx] = (waveform[idx - 1] + waveform[idx + 1]) / 2
    return waveform


In [11]:
def fade_in_out(waveform, sample_rate, fade_duration_ms=50):
    """Apply a linear fade-in and fade-out along the last (time) axis.

    waveform: tensor of shape [length] or [..., length]. It is modified
        IN PLACE and also returned.
    sample_rate: samples per second, used to convert the duration to samples.
    fade_duration_ms: length of each fade in milliseconds.

    The fade is clamped to the signal length, and a fade of zero samples
    leaves the waveform untouched.
    """
    total_len = waveform.size(-1)
    fade_len = min(int(sample_rate * fade_duration_ms / 1000), total_len)
    if fade_len <= 0:
        # waveform[-0:] would select the whole signal, so bail out early.
        return waveform

    # Create the ramps on the waveform's device (and dtype, for float inputs)
    # so CUDA and float64 tensors work.
    ramp_kwargs = {'device': waveform.device}
    if waveform.is_floating_point():
        ramp_kwargs['dtype'] = waveform.dtype
    fade_in = torch.linspace(0, 1, fade_len, **ramp_kwargs)
    fade_out = torch.linspace(1, 0, fade_len, **ramp_kwargs)

    # Index the last axis so multi-channel input is faded in time, not
    # across channels.
    waveform[..., :fade_len] *= fade_in
    waveform[..., -fade_len:] *= fade_out
    return waveform


In [19]:
import torchaudio

def save_waveform(waveform, sample_rate, filename):
    """Write a waveform to ``filename`` with ``torchaudio.save``.

    waveform: tensor of shape [length] (mono) or [channels, length].
    sample_rate: sample rate passed through to ``torchaudio.save``.
    filename: destination path.
    """
    # torchaudio.save is given a 2-D [channels, length] tensor: only a bare
    # 1-D signal needs a channel axis added. Multi-channel input is passed
    # through as is (adding another axis would make it 3-D).
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    # Detach and move to CPU so tensors taken straight from a model on the
    # GPU can be saved as well.
    torchaudio.save(filename, waveform.detach().cpu(), sample_rate)


In [17]:
import torch
import torchaudio
import torch.nn.functional as F

def chunk_audio(waveform, chunk_size):
    """
    Cut a [channels, length] waveform into consecutive pieces of chunk_size
    samples each.

    The final piece is zero-padded on the right when the length is not an
    exact multiple of chunk_size. Returns a list of [channels, chunk_size]
    tensors (an empty list for a zero-length waveform).
    """
    _, n_samples = waveform.shape
    pieces = []
    for offset in range(0, n_samples, chunk_size):
        # Slicing past the end is clamped, so the last piece may be short.
        piece = waveform[:, offset:offset + chunk_size]
        shortfall = chunk_size - piece.shape[1]
        pieces.append(F.pad(piece, (0, shortfall)) if shortfall > 0 else piece)
    return pieces

def process_chunk(model, chunk, speaker_id, device):
    """
    Convert a single [channels, length] chunk with the model.

    Each channel is sent through the model on its own as a [1, 1, length]
    batch together with the target speaker id; the per-channel results are
    moved back to the CPU and stacked into a [channels, length] tensor.
    The model is switched to eval mode and gradients are disabled.
    """
    def _convert(channel_wave):
        # Add batch and channel axes, run the model, strip them again.
        net_in = channel_wave.unsqueeze(0).unsqueeze(0).to(device)
        target = torch.tensor([speaker_id], device=device)
        return model(net_in, target).squeeze(0).squeeze(0).cpu()

    model.eval()
    with torch.no_grad():
        results = [_convert(chunk[index]) for index in range(chunk.size(0))]
    return torch.stack(results)

def process_song(model, waveform, speaker_id, device, chunk_size=16000*5):
    """
    Full pipeline: chunk audio, convert each chunk, concat back.

    waveform: [channels, length]
    speaker_id: integer id of the target speaker.
    chunk_size: number of samples converted per model call.

    Returns a CPU tensor [channels, length] with the same length as the
    input: the zero-padding added to fill the last chunk is trimmed off
    again so the converted audio keeps the original duration.
    """
    original_length = waveform.size(1)
    if original_length == 0:
        # Nothing to convert; torch.cat would fail on an empty chunk list.
        return waveform.cpu().clone()

    converted_chunks = [
        process_chunk(model, chunk, speaker_id, device)
        for chunk in chunk_audio(waveform, chunk_size)
    ]
    return torch.cat(converted_chunks, dim=1)[:, :original_length]

# --- Usage example ---

# Load the input audio ('ssd.wav'; replace with your own file)
waveform, sample_rate = torchaudio.load('ssd.wav')  # [channels, length]

# Resample to 16 kHz, the rate the training data was loaded at
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)
    sample_rate = 16000

model=VoiceConversionModelWithSkip(num_speakers=2, embedding_dim=16, bottleneck_dim=128).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime still loads on a CPU-only one.
model.load_state_dict(torch.load('your_model.pth', map_location=device))
model.eval()


# Choose speaker id for target voice
speaker_id = 1

# Process full song through model
converted_waveform = process_song(model, waveform, speaker_id, device)

# Normalize to prevent clipping
max_val = converted_waveform.abs().max()
if max_val > 0:
    converted_waveform = converted_waveform / max_val

# Save output
torchaudio.save('converted_song.wav', converted_waveform, sample_rate)
print("Saved converted_song.wav")


Saved converted_song.wav


In [None]:
sample_rate = 16000
model.eval()  # inference mode for the reconstruction check

with torch.no_grad():
    # Only the first batch is needed; the loop exits right after it.
    for batch in dataloader:
        batch = batch.unsqueeze(1).to(device)  # [batch, 1, length]

        # Placeholder speaker ids (random 0/1); use real ids when available.
        speaker_ids = torch.randint(low=0, high=2, size=(batch.size(0),), device=device)

        outputs = model(batch, speaker_ids)

        # Write the first clip of the batch and its model output side by side.
        for clip, out_name in ((batch[0], 'original.wav'), (outputs[0], 'reconstructed.wav')):
            save_waveform(clip.cpu(), sample_rate, out_name)
        print("Saved original.wav and reconstructed.wav")
        break


In [20]:
from IPython.display import Audio

# original.wav and reconstructed.wav are the pair written by the evaluation
# cell; converted_song.wav is the separately converted 'ssd.wav'. Each file
# is shown under its own label so the reconstruction is compared with the
# clip it was actually produced from.
print("Original audio:")
display(Audio('original.wav'))

print("Reconstructed audio:")
display(Audio('reconstructed.wav'))

print("Converted song:")
display(Audio('converted_song.wav'))


Original audio:


Reconstructed audio:
