In [1]:
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
import os

## Preprocessing

In [2]:
""" 
TEST Segmentation

Goal here is to segment a song into 30 second intervals
"""

exp_sr = 3200

def segment_song(file_path, segment_length=30, start_offset=10):
    """
    Segment a song into fixed-length parts.

    :param file_path: Path to the audio file.
    :param segment_length: Length of each segment in seconds.
    :param start_offset: Time in seconds to start segmenting from.
    :return: List of audio segments.
    """
    # Load the full audio file
    audio, sr = librosa.load(file_path, sr=None)
    sample_rate = sr
    print('sample rate:', sr)

    # Calculate start and end sample for the segmentation
    start_sample = int(start_offset * sr)
    end_sample = int(len(audio))

    # Segment length in samples
    segment_sample_length = segment_length * sr

    # Split the audio into segments
    segments = []
    for start in range(start_sample, end_sample, segment_sample_length):
        end = start + segment_sample_length
        # Check if the segment is shorter than the desired length, discard if necessary
        if end <= end_sample:
            segment = audio[start:end]
            segments.append(segment)
        else:
            # Optional: Handle the last segment if it's shorter than the desired length
            # For example, you can discard it or pad it
            pass

    return segments

# Example usage
file_path = 'data\\A Bigger Fear - A GlitchTale Soundtrack (Commission).m4a'
_, sr = librosa.load(file_path, sr=None)
segments = segment_song(file_path)

expected_duration = 30  # in seconds
for i, segment in enumerate(segments):
    duration = len(segment) / sr  # 'sr' is the sample rate
    if duration >= expected_duration:
        print(f"Segment {i} duration: {duration} seconds - OK")
    else:
        print(f"Segment {i} duration: {duration} seconds - Too Short")
        
# Find number of channels
audio, _ = librosa.load(file_path, sr=None, mono=False)
num_channels = audio.shape[0] if audio.ndim > 1 else 1

print(f"The audio file has {num_channels} channel(s).")

  _, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


sample rate: 48000
Segment 0 duration: 30.0 seconds - OK
Segment 1 duration: 30.0 seconds - OK
Segment 2 duration: 30.0 seconds - OK
Segment 3 duration: 30.0 seconds - OK
Segment 4 duration: 30.0 seconds - OK
Segment 5 duration: 30.0 seconds - OK
Segment 6 duration: 30.0 seconds - OK
The audio file has 2 channel(s).


  audio, _ = librosa.load(file_path, sr=None, mono=False)


In [3]:
""" 
DO Segmentation
"""

def segment_audio(file_path, segment_length=30, start_offset=10, sr=exp_sr):
    # Load and segment the audio file as before
    audio, _ = librosa.load(file_path, sr=sr)
    segments = []
    start_sample = int(start_offset * sr)
    segment_samples = segment_length * sr

    for start in range(start_sample, len(audio), segment_samples):
        end = start + segment_samples
        if end <= len(audio):
            segments.append(audio[start:end])
    return segments

def process_directory(directory_path, output_directory=None):
    for filename in os.listdir(directory_path):
        if filename.endswith('.m4a'):  # Check for m4a files
            file_path = os.path.join(directory_path, filename)
            segments = segment_audio(file_path)

            if output_directory:
                save_segments(segments, filename, output_directory)

def save_segments(segments, original_filename, output_directory, sr=exp_sr):
    base_filename = os.path.splitext(original_filename)[0]
    for i, segment in enumerate(segments):
        output_filename = f"{base_filename}_segment_{i}.wav"  # Change format if needed
        output_path = os.path.join(output_directory, output_filename)
        sf.write(output_path, segment, sr)

# Example usage
process_directory('C:\\Hrita\\Code\\music_generator\\data', 'C:\\Hrita\\Code\\music_generator\\data_segments')

  audio, _ = librosa.load(file_path, sr=sr)


## Model


In [3]:
from diffusers import UNet1DModel
from sklearn.model_selection import train_test_split
# os imported

import torch
import torch.nn as nn 
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchaudio  # For audio processing


In [4]:
""" 
Train-Test split
"""

data_dir = 'data_segments/'

files = [os.path.join(data_dir, file) for file in os.listdir(data_dir)]

# Split dataset into training and testing sets
train_files, test_files = train_test_split(files, test_size=0.2)  # 20% for testing

In [5]:
"""  
Dataset
"""

class AudioDataset(Dataset):
    def __init__(self, file_paths):
        self.file_paths = file_paths
    
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        filepath = self.file_paths[idx]
        waveform, sample_rate = torchaudio.load(filepath)
        
        return waveform, sample_rate
    
# Create train and test datasets
train_dataset = AudioDataset(train_files)
test_dataset = AudioDataset(test_files)

In [6]:
""" 
Dataloaders
"""
train_loader = DataLoader(train_dataset, batch_size=1, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle = False)

In [7]:
""" 
Model: class diffusers.UNet1DModel
"""

sample_rate = exp_sr
duration_of_segment = 30    # seconds
sample_size = sample_rate * duration_of_segment

model = UNet1DModel(
    sample_size=sample_size,        # Size of your input samples
    in_channels=1,           # Number of input channels, e.g., 1 for mono audio
    out_channels=1,          # Number of output channels
    # layers_per_block=(2, 2, 2, 2),  # Layers in each downsampling/upsampling block
    down_block_types=("DownBlock1D", "AttnDownBlock1D", "DownBlock1D"),
    up_block_types=("UpBlock1D", "AttnUpBlock1D", "UpBlock1D"),
    block_out_channels=[32, 64, 128]
    # other configuration parameters if needed
)

# optimizer and loss funciton
device = 'cuda'
loss_function = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.to(device)

UNet1DModel(
  (time_proj): GaussianFourierProjection()
  (down_blocks): ModuleList(
    (0): DownBlock1D(
      (down): Downsample1d()
      (resnets): ModuleList(
        (0): ResConvBlock(
          (conv_skip): Conv1d(1, 32, kernel_size=(1,), stride=(1,), bias=False)
          (conv_1): Conv1d(1, 32, kernel_size=(5,), stride=(1,), padding=(2,))
          (group_norm_1): GroupNorm(1, 32, eps=1e-05, affine=True)
          (gelu_1): GELU(approximate='none')
          (conv_2): Conv1d(32, 32, kernel_size=(5,), stride=(1,), padding=(2,))
          (group_norm_2): GroupNorm(1, 32, eps=1e-05, affine=True)
          (gelu_2): GELU(approximate='none')
        )
        (1-2): 2 x ResConvBlock(
          (conv_1): Conv1d(32, 32, kernel_size=(5,), stride=(1,), padding=(2,))
          (group_norm_1): GroupNorm(1, 32, eps=1e-05, affine=True)
          (gelu_1): GELU(approximate='none')
          (conv_2): Conv1d(32, 32, kernel_size=(5,), stride=(1,), padding=(2,))
          (group_norm_2): Grou

In [8]:
""" 
Training loop
"""
def add_noise(data, noise_level):
    return data + noise_level * torch.randn_like(data)

def noise_schedule(timestep, num_timesteps):
    # Example of a linear noise schedule
    return timestep / num_timesteps

num_timesteps = 1000 
num_epochs = 10  

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    train_loss = 0.0

    for batch in train_loader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)
        
        print(inputs.shape)
        # In a diffusion model, we typically select a random timestep for each batch
        timestep = torch.randint(0, num_timesteps, (1,), device=device).item()
        noise_level = noise_schedule(timestep, num_timesteps)
        noisy_inputs = add_noise(inputs, noise_level).to(device)
        print(noisy_inputs.shape)

        # Forward pass
        outputs = model(noisy_inputs, torch.tensor([timestep], device=device), return_dict = False)
        loss = loss_function(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Calculate average training loss for the epoch
    train_loss /= len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}")

    # Validation phase
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():  # No gradients needed for validation
        for batch in test_loader:
            inputs, targets = batch
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            val_loss += loss.item()

    val_loss /= len(test_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

torch.Size([1, 1, 96000])
torch.Size([1, 1, 96000])


AttributeError: 'tuple' object has no attribute 'size'