In [1]:
import os
# from tqdm import tqdm
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F  
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler

In [3]:
import utils

In [4]:
# define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
print(torch.cuda.is_available())

True


In [6]:
DATA_DIR = './fma/data/fma_small'

tracks = utils.load('fma/data/fma_metadata/tracks.csv')
features = utils.load('fma/data/fma_metadata/features.csv')
echonest = utils.load('fma/data/fma_metadata/echonest.csv')

subset = tracks.index[tracks['set', 'subset'] <= 'small']

assert subset.isin(tracks.index).all()
assert subset.isin(features.index).all()

features_all = features.join(echonest, how='inner').sort_index(axis=1)
print('Not enough Echonest features: {}'.format(features_all.shape))

tracks = tracks.loc[subset]
features_all = features.loc[subset]

tracks.shape, features_all.shape

train = tracks.index[tracks['set', 'split'] == 'training']
val = tracks.index[tracks['set', 'split'] == 'validation']
test = tracks.index[tracks['set', 'split'] == 'test']

Not enough Echonest features: (13129, 767)


In [7]:
labels_onehot = LabelBinarizer().fit_transform(tracks['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=tracks.index)

In [8]:
def mono_to_stereo(waveform):
    # reshape the waveform from a 1D tensor to a 2D tensor with 2 columns
    waveform = waveform.view(-1, 1)
    # repeat the waveform along the columns to create a stereo signal
    waveform = waveform.repeat(1, 2)
    return waveform.T

def stereo_to_mono(waveform):
    if waveform.dim() == 2:
        return torch.mean(waveform, dim=0)
    elif waveform.dim() == 1:
        return waveform
    else:
        raise ValueError("Input must be 1D or 2D tensor")

In [9]:
len(train)/4

1600.0

In [18]:

class FMADataset(Dataset):
    def __init__(self, data_dir, track_ids, precomputed=True, subsampling=True, sampling_rate=22050):
        self.data_dir = data_dir
        self.track_ids = track_ids
        self.sampling_rate = sampling_rate
        self.max_length = 750000
        self.subsampling = subsampling
        self.precomputed = precomputed
        # create the Resample transform
        self.resample = torchaudio.transforms.Resample(44100, sampling_rate)
        
        
    def __getitem__(self, index):
        
        tid = self.track_ids[index]
        
    
        if not self.precomputed:
            # load the MP3 file
            filepath = utils.get_audio_path(self.data_dir, tid)
            try:
                waveform, sample_rate = torchaudio.load(filepath)
            except RuntimeError:
                return self.__getitem__(index + 1)

            waveform = stereo_to_mono(waveform) 

            # resample the waveform to the desired sample rate using the Resample transform
            waveform = self.resample(waveform)
        else:
            
            try:
                # load pre-computed waveform resampled to 22050
                filepath = os.path.join('./fma/data/waveforms/'+"{:06d}".format(tid)+'.pt')
                waveform = torch.load(filepath)
            except:
                return self.__getitem__(index + 1)
            

        
        # get label
        label = torch.from_numpy(labels_onehot.loc[tid].values).float()
        
        # subsampling
        if self.subsampling:
            # set the length of the subsamples and the overlap
            subsample_length = self.sampling_rate * 5  # 10 seconds
            overlap = int(subsample_length * 0.25)  # 50% overlap
            subsamples = []
            shift = subsample_length - overlap
            for i in range(0, waveform.size(0) - subsample_length + 1, shift):
                subsample = waveform[i:(i + subsample_length)]
                subsamples.append(subsample)
                #if len(subsamples) == 4: break
                if len(subsamples) == 7: break
            
            # ignore samples that have less than 30s (it should not be the case but there is a 15s sample in validation dataset we dont know why)
            if len(subsamples) != 7: return self.__getitem__(index + 1)
        
            # return the subsamples
            return subsamples, label
        else:
            # padding
            padding = self.max_length - waveform.shape[0]
            padding_tensor = torch.zeros((padding, waveform.shape[1]))
            waveform = torch.cat((waveform, padding_tensor), dim=0)

        
        return waveform, label

    def __len__(self):
        return int(len(self.track_ids)/2) # using ONLY 50% of the dataset for faster training

In [None]:
class DenseBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=4, stride, padding):
        super(DenseBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, bias=False)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels+in_channels, kernel_size, bias=False)
        self.bn2 = nn.BatchNorm1d(out_channels)
        final = out_channels + 2 * in_channels
        self.conv_trans = nn.Conv1d(final, final//2, kernel_size=1, bias=False)
        self.pool = nn.AvgPool1d(kernel_size=2, stride=2)

    def forward(self, x):
        

        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu(out)
        
        out = torch.cat([out, x], dim=1)
        
        out = self.conv2(out)
        out = self.bn2(out)
        out = F.relu(out)
        
        out = torch.cat([out, x], dim=1)
        out = self.conv_trans(out)
        out = self.pool(out)
        
        return out
    

In [19]:

class Res1DLayer(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding):
        super(Res1DLayer, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size, stride, padding)
        self.bn2 = nn.BatchNorm1d(out_channels)

        # Projection shortcut
        self.projection = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = F.leaky_relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        
        # Add the projection shortcut
        identity = self.projection(identity)
        out += identity

        out = F.leaky_relu(out)

        return out
    
    
class ResNet1D(nn.Module):
    def __init__(self, input_size, num_classes=10):
        super(ResNet1D, self).__init__()
        
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=3, stride=3, padding=3, bias=False)
     
        self.layer1 = nn.Sequential(
            Res1DLayer(128, 128, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=9, stride=9, padding=1),
            Res1DLayer(128, 128, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=9, stride=9, padding=1)
        )
        
        self.layer2 = nn.Sequential(
            Res1DLayer(128, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=9, stride=9, padding=1),
            Res1DLayer(256, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=9, stride=9, padding=1)
        )
        
        self.layer3 = nn.Sequential(
            Res1DLayer(256, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=3, stride=3, padding=1),
            Res1DLayer(256, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=3, stride=3, padding=1),
            Res1DLayer(256, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=3, stride=3, padding=1),
            Res1DLayer(256, 256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=3, stride=3, padding=1)
        )
        
        self.layer4 = nn.Sequential(
            Res1DLayer(256, 512, kernel_size=3, stride=1, padding=1),
            nn.MaxPool1d(kernel_size=3, stride=3, padding=1),
            nn.Conv1d(512, 512, kernel_size=1, stride=1, padding=0)
        )
        
#         self.avgpool = nn.AdaptiveAvgPool1d(1)
        # The output layer has ten units and a sigmoid activation function, followed by a dropout of 0.5
        self.fc = nn.Linear(1024, num_classes)
        #self.sigmoid = nn.Sigmoid()
        #self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = self.conv1(x)
        #x = self.bn1(x)
        x = self.layer1(x)
        x = self.layer2(x)
        #x = self.layer3(x)
        x = self.layer4(x)
        x = x.view(x.size(0), -1)  # flatten the tensor
        #x = self.avgpool(x)
        #x = self.dropout(x)
        x = self.fc(x)
        #x = self.sigmoid(x)
        #x = self.dropout(x)
        return x


In [12]:
"""conv1  pool1 conv2 pool2 conv3 pool3 conv4 conv5

#f     32   32   64   64   128   128  256   1401
f-size  64  8    32   8    16    8     8     16
stride  2   8     2   8    2     8     2     12
padding 32  0    16   0    8     0     4      4
dim   220,050 27,506 13,782 1,722 862 432 217 54"""

class SoundNet1D(nn.Module):
    def __init__(self, input_size, num_classes=8):
        super(SoundNet1D, self).__init__()
     
        self.group1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=32, kernel_size=64, stride=2, padding=32, bias=False),
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=8, stride=8, padding=0),
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=32, stride=2, padding=16),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=8, stride=8, padding=0)
        )
    
        self.group2 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=16, stride=2, padding=8),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=8, stride=8, padding=0),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=8, stride=2, padding=4),
            nn.Conv1d(in_channels=256, out_channels=1401, kernel_size=16, stride=12, padding=4)

        )
        
        # self.avgpool = nn.AdaptiveAvgPool1d(1)
        # The output layer has ten units and a sigmoid activation function, followed by a dropout of 0.5
        self.fc = nn.Linear(2802, num_classes)
        #self.sigmoid = nn.Sigmoid()
        #self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):

        #x = self.bn1(x)
        x = self.group1(x)
        x = self.group2(x)

        x = x.view(x.size(0), -1)  # flatten the tensor
        #x = self.avgpool(x)
        #x = self.dropout(x)
        x = self.fc(x)
        #x = self.sigmoid(x)
        #x = self.dropout(x)
        return x


In [13]:
class SoundNet1DLarge(nn.Module):
    def __init__(self, input_size, num_classes=8):
        super(SoundNet1DLarge, self).__init__()
     
        self.group1 = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=16, kernel_size=64, stride=2, padding=32),
            nn.BatchNorm1d(16),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=8, stride=1, padding=0),
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=32, stride=2, padding=16),
            nn.BatchNorm1d(32),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=8, stride=1, padding=0)
        )
    
        self.group2 = nn.Sequential(
            nn.Conv1d(in_channels=32, out_channels=64, kernel_size=16, stride=2, padding=8),
            nn.BatchNorm1d(64),
            nn.ReLU(True),
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=8, stride=2, padding=4),
            nn.BatchNorm1d(128),
            nn.ReLU(True),
            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=4, stride=2, padding=2),
            nn.BatchNorm1d(256),
            nn.ReLU(True),
            nn.MaxPool1d(kernel_size=4, stride=1, padding=0),
            nn.Conv1d(in_channels=256, out_channels=512, kernel_size=4, stride=2, padding=2),
            nn.BatchNorm1d(512),
            nn.ReLU(True),
            nn.Conv1d(in_channels=512, out_channels=1024, kernel_size=4, stride=2, padding=2),
            nn.BatchNorm1d(1024),
            nn.ReLU(True)
        )
        self.conv81 = nn.Conv1d(in_channels=1024, out_channels=1000, kernel_size=8, stride=2, padding=0)
        self.conv82 = nn.Conv1d(in_channels=1024, out_channels=401, kernel_size=8, stride=2, padding=0)
        # self.avgpool = nn.AdaptiveAvgPool1d(1)
        # The output layer has ten units and a sigmoid activation function, followed by a dropout of 0.5
        self.fc = nn.Linear(1401, num_classes)
        #self.sigmoid = nn.Sigmoid()
        #self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):

        #x = self.bn1(x)
        x = self.group1(x)
        x = self.group2(x)
        x1 = self.conv81(x)
        x2 = self.conv82(x)
        x = torch.cat((x1, x2), dim=1)
        x = x.view(x.size(0), -1)  # flatten the tensor
        #x = self.avgpool(x)
        #x = self.dropout(x)
        x = self.fc(x)
        #x = self.sigmoid(x)
        #x = self.dropout(x)
        return x


In [21]:
BATCH = 8

# create a training dataset and dataloader
dataset = FMADataset(DATA_DIR, train, precomputed=True)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH, shuffle=True)

# create a validation dataset and dataloader
val_dataset = FMADataset(DATA_DIR, val, precomputed=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH, shuffle=True)

    
# create the CNN model
model = ResNet1D(input_size=110250, num_classes=8) # HERE YOU PUT UR NETWORK
model.to(device)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()

# Adam optimizer
optimizer = torch.optim.Adam(model.parameters())


# Lee 2017
# Define the optimizer
#optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9, nesterov=True)

# Define the scheduler
#scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.2, patience=5)



num_epochs = 10
i = 0
running_loss = 0.0


import time
t2 = 0
# train the model
for epoch in range(num_epochs):
    # evaluate the model on the training dataset
    train_correct = 0
    train_total = 0
    
    for subsamples, label in dataloader:
        
        label = label.to(device)
        train_label = torch.argmax(label, dim=1)
        for waveform in subsamples:
            #t2 = time.time()
            # clear the gradients
            optimizer.zero_grad()
            waveform = waveform.to(device)
            # forward pass
            waveform = waveform.squeeze(0)  
            waveform = waveform.unsqueeze(1)
            
            output = model(waveform)
            
            loss = loss_fn(output, label)

            # backward pass
            loss.backward()
            optimizer.step()  
            
            # Update the learning rate
            # scheduler.step(loss)
            
            _, train_predicted = torch.max(output.data, 1)
            train_total += train_label.size(0)
            train_correct += (train_predicted == train_label).sum().item()
            # print statistics
            i += 1
            running_loss += loss.item()
            #print(t2 - time.time())
        
           
    print('[%d, %5d subsamples] Training loss: %.3f' % (epoch + 1, i*BATCH, running_loss / len(dataloader)))
    running_loss = 0            
    # evaluate the model on the validation dataset
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for val_subsamples, val_label in val_dataloader:
            val_label = val_label.to(device)
            val_label = torch.argmax(val_label, dim=1)
            for val_waveform in val_subsamples:
                val_waveform = val_waveform.squeeze(0)
                val_waveform = val_waveform.unsqueeze(1)
                val_waveform = val_waveform.to(device)
                val_output = model(val_waveform)
                val_loss += loss_fn(val_output, val_label).item()
                _, val_predicted = torch.max(val_output.data, 1)
                val_total += val_label.size(0)
                val_correct += (val_predicted == val_label).sum().item()


    print('Validation Loss: {:.4f} | Validation Accuracy: {:.4f} | Training Accuracy: {:.4f}'.format(val_loss / len(val_dataloader), val_correct / val_total, train_correct / train_total))
print('Finished Training')


# subsampling time:  5.8650970458984375e-05
# waveform load time:  0.0013015270233154297
# subsampling time:  5.793571472167969e-05
# waveform comp time:  0.052204132080078125

[1, 22400 subsamples] Training loss: 13.670
Validation Loss: 14.5356 | Validation Accuracy: 0.1904 | Training Accuracy: 0.2520
[2, 44800 subsamples] Training loss: 13.537
Validation Loss: 14.2705 | Validation Accuracy: 0.2511 | Training Accuracy: 0.2508
[3, 67200 subsamples] Training loss: 13.464
Validation Loss: 14.0717 | Validation Accuracy: 0.1796 | Training Accuracy: 0.2523
[4, 89600 subsamples] Training loss: 13.429
Validation Loss: 13.1604 | Validation Accuracy: 0.2693 | Training Accuracy: 0.2549
[5, 112000 subsamples] Training loss: 13.191
Validation Loss: 18.8018 | Validation Accuracy: 0.1300 | Training Accuracy: 0.2681


KeyboardInterrupt: 

In [1]:
model_name = "SoundNetLarge_2022-01-25_batchsize_32_epochs_10_opt_Adam_LR_0.0001_acc_0."
torch.save(model, model_name + '.pt')

NameError: name 'torch' is not defined

In [None]:
#model_name = "ResNet_2022-01-23_batchsize_40_epochs_5_opt_Adam_LR_0.001_acc_"
#model_name = "SoundNet_2022-01-25_batchsize_32_epochs_10_opt_Adam_LR_0.0001_acc_0.3987"
#model_name = "SoundNet_2022-01-25_batchsize_32_epochs_10_20_opt_Adam_LR_0.0001_acc_0.4519"
#model_name = "ResNet_2022-01-23_batchsize_40_epochs_10_opt_Adam_LR_0.001"

SoundNet 20 epochs BATCH 32 <br>
Validation Loss: 7.2820 | Validation Accuracy: 0.4159 | Training Accuracy: 0.4429 <br>
[9, 230400 subsamples] Training loss: 6.039
Validation Loss: 6.7605 | Validation Accuracy: 0.4103 | Training Accuracy: 0.4553 <br>
[10, 256000 subsamples] Training loss: 6.025
Validation Loss: 7.0612 | Validation Accuracy: 0.3950 | Training Accuracy: 0.4519 <br>

In [36]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [16]:
model = torch.load(model_name+'.pt')
#model.state_dict()

In [None]:

# create a Mp3Dataset from a directory of MP3 files
dataset = FMADataset(DATA_DIR, train)

# create a DataLoader from the FMADataset
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

    
# create the CNN model
model = ResNet1D(input_size=750000, num_classes=8)

# define the loss function and the optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

num_epochs = 10
i = 0
running_loss = 0.0
# train the model
for epoch in range(num_epochs):
    for waveform, label in dataloader:
        # clear the gradients
        optimizer.zero_grad()

        # forward pass
#         waveform = waveform.unsqueeze(0)  # add a batch dimension
#         print(waveform.shape)

        # extract the first channel
        first_channel = waveform[:, :, 0]

        # reshape the first channel to add an additional dimension for the channel dimension
        first_channel = first_channel.unsqueeze(-1)
        
        output = model(first_channel)

        loss = loss_fn(output, label)

        # backward pass
        loss.backward()
        optimizer.step()
        # print statistics
        i += 1
        running_loss += loss.item()
        if i % 10 == 9:    # print every 100 samples
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0


print('Finished Training')

## Testing - subsampling

In [None]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# create a dataset object for testing
test_dataset = FMADataset(DATA_DIR, test)
batch_size = 8
# create a data loader to load the dataset
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True)



# test the model
model.eval()
model.to(device)
correct = 0
total = 0
with torch.no_grad(): # don't need to track, calculate or save the gradients in the model
    for subsamples, labels in test_loader:
        labels = labels.to(device)
        labels = torch.argmax(labels, dim=1)
        batch_size = labels.size(0) # we reupdate the batch size because the last batch can be incomplete.
        subsample_outputs = {i: [] for i in range(batch_size)}
        for waveform in subsamples:
            waveform = waveform.squeeze(0)  
            waveform = waveform.unsqueeze(-1)
            waveform = waveform.to(device)
            outputs = model(waveform)
            predicted = torch.argmax(outputs.data, dim=1).cpu()
            
            for j in range(batch_size):
                subsample_outputs[j].append(predicted[j]) 
        for j in range(batch_size):
            # count the occurrences of each class
            counts = np.bincount(subsample_outputs[j])
            # Find the class with the highest count
            aggregate_prediction = np.argmax(counts)
            correct += (aggregate_prediction == labels[j])
        total += labels.size(0)
        
    
        print(f"CORRECT #  {correct}")

print('Accuracy of the network on the test samples: %d %%' % (100 * correct / total))

## Testing - full sample

In [None]:
# create a dataset object for testing
test_dataset = FMADataset(DATA_DIR, test)

# create a data loader to load the dataset
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

# test the model
model.eval()
model.to(device)
correct = 0
total = 0
with torch.no_grad(): # don't need to track, calculate or save the gradients in the model
    for data in test_loader:
        # get the inputs
        audio, labels = data
        # wrap them in a torch Variable
        audio, labels = audio.to(device), labels.to(device)

        # forward + backward + optimize
        outputs = model(audio)
        _, predicted = torch.max(outputs.data, 1)
        labels = torch.argmax(labels, dim=1)
        predicted = torch.argmax(outputs.data, dim=1)
        print(labels)
        print(predicted)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        print(f"CORRECT #  {correct}")

print('Accuracy of the network on the test samples: %d %%' % (100 * correct / total))

In [19]:
import os
import torch
import torchaudio

data_dir = './fma/data/fma_small'
output_dir = './fma/data/waveforms'
sampling_rate = 22_050

# create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
    
resample = torchaudio.transforms.Resample(44100, sampling_rate)
    
# loop through all MP3 files in the data directory
for root, dirs, files in os.walk(data_dir):
    for filename in files:
        if filename.endswith('.mp3'):
            filepath = os.path.join(root, filename)
            try: 
                waveform, sample_rate = torchaudio.load(filepath)
                waveform = stereo_to_mono(waveform) 

                # resample the waveform to the desired sample rate using the Resample transform
                waveform = resample(waveform)

                # save the spectrogram to the output directory
                output_file = os.path.join(output_dir, filename[:-4] + '.pt')
                torch.save(waveform, output_file)
            except:
                continue

## Remarks on implementations - draft

### There are two types of samples: mono and stereo - we need to convert mono to stereo when feeding the CNN

An audio channel refers to a single track of audio. The number of channels in an audio file determines the number of separate audio tracks that are mixed together to form the final audio.

A mono audio file has a single channel, which means that all the audio is mixed together into one single track. This means that if you play a mono audio file, the same audio will come out of both the left and right speakers (or headphones) and it will sound the same regardless of the stereo or mono setup.

A stereo audio file, on the other hand, has two channels - a left channel and a right channel. These two channels carry separate audio tracks that are mixed together to create the final audio. When played back on a stereo setup, each channel will be played through its corresponding speaker or headphone and this way, the stereo audio creates a sense of space and directionality.

So, for example, a stereo audio recording of a live concert will have different audio captured by different microphone positioned in different positions in the concert hall, and when it is played back, it creates the sense of being there in the concert hall.

It is worth noting that there are also audio file format with more than 2 channels, such as 5.1 or 7.1 surround sound audio.


### Downsampling

 we downsample the audio signals to a lower sample rate to reduce the data size or to simplify the processing of the signal. Downsampling can be useful for tasks such as speech recognition or audio classification, where the lower frequencies of the signal are more important than the higher frequencies.