In [4]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

In [5]:
class AudioUtil():
  # ----------------------------
  # Load an audio file. Return the signal as a tensor and the sample rate
  # ----------------------------
  @staticmethod
  def open(audio_file):
    sig, sr = torchaudio.load(audio_file)
    return (sig, sr)

  @staticmethod
  def rechannel(aud, new_channel):
    sig, sr = aud

    if (sig.shape[0] == new_channel):
      # Nothing to do
      return aud

    if (new_channel == 1):
      # Convert from stereo to mono by selecting only the first channel
      resig = sig[:1, :]
    else:
      # Convert from mono to stereo by duplicating the first channel
      resig = torch.cat([sig, sig])

    return ((resig, sr))

  @staticmethod
  def resample(aud, newsr):
    sig, sr = aud

    if (sr == newsr):
      # Nothing to do
      return aud

    num_channels = sig.shape[0]
    # Resample first channel
    resig = torchaudio.transforms.Resample(sr, newsr)(sig[:1,:])
    if (num_channels > 1):
      # Resample the second channel and merge both channels
      retwo = torchaudio.transforms.Resample(sr, newsr)(sig[1:,:])
      resig = torch.cat([resig, retwo])

    return ((resig, newsr))

  @staticmethod
  def pad_trunc(aud, max_ms):
    sig, sr = aud
    num_rows, sig_len = sig.shape
    max_len = sr//1000 * max_ms

    if (sig_len > max_len):
      # Truncate the signal to the given length
      sig = sig[:,:max_len]

    elif (sig_len < max_len):
      # Length of padding to add at the beginning and end of the signal
      pad_begin_len = random.randint(0, max_len - sig_len)
      pad_end_len = max_len - sig_len - pad_begin_len

      # Pad with 0s
      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      sig = torch.cat((pad_begin, sig, pad_end), 1)
      
    return (sig, sr)

  @staticmethod
  def time_shift(aud, shift_limit):
    sig,sr = aud
    _, sig_len = sig.shape
    shift_amt = int(random.random() * shift_limit * sig_len)
    return (sig.roll(shift_amt), sr)

  @staticmethod
  def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
    sig,sr = aud
    top_db = 80

    # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
    spec = torchaudio.transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

    # Convert to decibels
    spec = torchaudio.transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  @staticmethod
  def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = torchaudio.transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = torchaudio.transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

In [48]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
  def __init__(self, df, data_path):
    self.df = df
    self.data_path = str(data_path)
    self.duration = 4000
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4
            
  # ----------------------------
  # Number of items in dataset
  # ----------------------------
  def __len__(self):
    return len(self.df)    
    
  # ----------------------------
  # Get i'th item in dataset
  # ----------------------------
  def __getitem__(self, idx):
    # Absolute file path of the audio file - concatenate the audio directory with
    # the relative path
    audio_file = self.data_path + "/smalldata" + "/wavs" + self.df.loc[idx, 'folder'] + "/"+ self.df.loc[idx, 'fileName']
    # Get the Class ID
    labels = {
        "acousticness" : self.df.loc[idx, 'acousticness'],
        "danceability" : self.df.loc[idx, 'danceability'],
        "energy" : self.df.loc[idx, 'energy'],
        "instrumentalness" : self.df.loc[idx, 'instrumentalness'],
        "liveness" : self.df.loc[idx, 'liveness'],
        "speechiness" : self.df.loc[idx, 'speechiness'],
        "tempo" : self.df.loc[idx, 'tempo'],
        "valence" : self.df.loc[idx, 'valence']
    }


    aud = AudioUtil.open(audio_file)
    # Some sounds have a higher sample rate, or fewer channels compared to the
    # majority. So make all sounds have the same number of channels and same 
    # sample rate. Unless the sample rate is the same, the pad_trunc will still
    # result in arrays of different lengths, even though the sound duration is
    # the same.
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return [aug_sgram, labels]

In [148]:
import pandas as pd
datafile = pd.read_csv("./spotify_data3.csv")
# datafile = datafile.loc[datafile["folder"] == "/000"]
datafile.head()
len(datafile)

13123

In [149]:
from torch.utils.data import random_split

myds = SoundDS(datafile, ".")

In [150]:
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])
num_items
# myds.__getitem__(13122)

13123

In [151]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=8, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=8, shuffle=False)

In [53]:
acousticDict = {"0": 0.0, "1" : 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
danceabilityDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
energyDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
instrumentalnessDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
livenessDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
speechinessDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
valenceDict = {"0": 0.0, "1": 0.1, "2": 0.2, "3": 0.3, "4": 0.4, "5": 0.5, "6": 0.6, "7": 0.7, "8": 0.8, "9": 0.9, "10": 1.0}
tempo = {}
for i in range(250):
    tempo[f"{i}"] = i

In [152]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time
from torchvision import datasets, transforms, models
from torch.nn import init

In [209]:
class MultilabelClassifier(nn.Module):
    def __init__(self):
        super().__init__()
#         self.resnet = models.resnet50(pretrained=True)
#         self.model_wo_fc = nn.Sequential(*(list(self.resnet.children())[:-1]))
    
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
        self.relu1 = nn.ReLU()
#         self.relu1 = nn.LeakyReLU()

        self.bn1 = nn.BatchNorm2d(8)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
#         self.relu2 = nn.LeakyReLU()
        self.bn2 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Second Convolution Block
        self.conv3 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
#         self.relu3 = nn.LeakyReLU()
        self.bn3 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Second Convolution Block
        self.conv4 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
#         self.relu4 = nn.LeakyReLU()
        self.bn4 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]
        
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        
        self.lin = nn.Linear(in_features=64, out_features=8)
        self.sig = nn.Sigmoid()

        self.conv = nn.Sequential(*conv_layers)
        



    def forward(self, x):
#         x = self.model_wo_fc(x)
#         x = torch.flatten(x, 1)
        x = self.conv(x)
        x = self.ap(x)
        x = x.view(x.shape[0], -1)
#         x = torch.flatten(x,1)
        x = self.lin(x)
        x = self.sig(x)
#         print(x.shape)
        
        return x

In [210]:
labelList = ["acousticness", "danceability", "energy",
             "instrumentalness", "liveness", "speechiness", "tempo", "valence"]
def extractLabels(labels, index):
    newLabels = []
    for label in labelList:
        newLabels.append(labels[label][index])
    return torch.Tensor(newLabels)

In [211]:

def criterion(loss_func,outputs,labels):
  losses = 0
  lossList = []
  for i, key in enumerate(outputs):
    newLoss = loss_func(outputs[i], 
      (extractLabels(labels,i)).to(device))
    losses += newLoss
    lossList.append(newLoss)
#     losses += loss_func(outputs[i], 
#               ((labels[labelList[i]]).type(torch.LongTensor)).to(device))
  return [losses,lossList]

In [212]:
def training(model,device,lr_rate,epochs,train_loader):
  num_epochs = epochs
  losses = []
  checkpoint_losses = []

  optimizer = torch.optim.Adam(model.parameters(), lr=lr_rate)
  n_total_steps = len(train_loader)

  loss_func = nn.CrossEntropyLoss()
#   loss_func = nn.MSELoss

  for epoch in range(num_epochs):
     for i, data in enumerate(train_loader):
        inputs = data[0].to(device)
        labels = data[1]

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s
        
        optimizer.zero_grad()

        outputs = model(inputs)

        loss = criterion(loss_func, outputs, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % (int(n_total_steps/1)) == 0:
            checkpoint_loss = torch.tensor(losses).mean().item()
            checkpoint_losses.append(checkpoint_loss)
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {checkpoint_loss:.4f}')
  return checkpoint_losses

In [213]:
def checkAccuracy(model, testloader, criterion, loss_func):
    totalLosses = [0,0,0,0,0,0,0,0]
    testLen = len(testloader)
    with torch.no_grad():
        for data in testloader:
            inputs = data[0].to(device)
            labels = data[1]
            
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s
            
            outputs = model(inputs)
            lossList = criterion(loss_func, outputs, labels)[1]
            
            for i in range(len(lossList)):
                totalLosses[i] += lossList[i] 
    totalLoss = 0
    for index, loss in enumerate(totalLosses):
        print(f"{labelList[index]} loss: {(loss/testLen):.2f}")
        totalLoss += loss
    
    print(f"Average loss: {totalLoss/testLen:.2f}")
            

In [214]:
def training2(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
#   criterion = nn.CrossEntropyLoss()
  loss_func = nn.CrossEntropyLoss()
#   loss_func = nn.MSELoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    print(f"Epoch {epoch}")
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0
    count = 1

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs = data[0].to(device)
        labels = data[1]

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
#         print(outputs)
        loss = criterion(loss_func, outputs, labels)[0]
        
#         loss = loss_func(outputs,labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()
    
        if count % 40 == 0:
            print(f"average loss: {running_loss / count}")
        count += 1

    
    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    checkAccuracy(model, val_dl, criterion, loss_func)
#     acc = correct_prediction/total_prediction
#     print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')

In [215]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

device 

'cpu'

In [188]:
model = MultilabelClassifier()
model.to(device)

MultilabelClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=8, bias=True)
  (sig): Sigmoid()
  (conv): Sequential(
    (0): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
    (2): BatchNo

In [189]:
import time

In [190]:
start = time.time()
epochs = 5
training2(model, train_dl, 5)
finish = time.time()
print(f"Train time {finish-start} seconds, {(finish-start)/epochs} per epoch")

Epoch 0
average loss: 1733.2946075439454
average loss: 1532.2121719360352
average loss: 1465.2283905029296
average loss: 1428.7839904785155
average loss: 1405.4399346923828
average loss: 1393.5009552001952
average loss: 1381.9992292131697
average loss: 1372.7504692077637
average loss: 1366.4614491780599
average loss: 1358.3598561096192
average loss: 1352.5009769786489
average loss: 1350.5669338226319
average loss: 1347.29114649846
average loss: 1344.6089251926967
average loss: 1338.6746614583333
average loss: 1335.6765615463257
average loss: 1334.4349849925322
average loss: 1333.2247375488282
average loss: 1331.946075600072
average loss: 1331.097087097168
average loss: 1330.8899068196615
average loss: 1330.7230298822576
average loss: 1330.4940752775772
average loss: 1330.3597940444947
average loss: 1328.7473377685546
average loss: 1327.3492342435397
average loss: 1327.2256421124493
average loss: 1327.340642111642
average loss: 1326.2127123602506
average loss: 1324.2954753621418
average

KeyboardInterrupt: 

In [None]:
#training(model, device, 0.001, 2, train_dl)

In [207]:
model2 = MultilabelClassifier()
model2.to(device)

MultilabelClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): LeakyReLU(negative_slope=0.01)
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): LeakyReLU(negative_slope=0.01)
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): LeakyReLU(negative_slope=0.01)
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): LeakyReLU(negative_slope=0.01)
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=8, bias=True)
  (sig): Sigmoid()
  (conv): Sequential(
    (0): 

In [208]:
start = time.time()
epochs = 5
training2(model2, train_dl, 3)
finish = time.time()
print(f"Train time {finish-start} seconds, {(finish-start)/epochs} per epoch")

Epoch 0
average loss: 2156.933822631836
average loss: 2137.98702545166
average loss: 2130.705706787109
average loss: 2120.044928741455
average loss: 2107.0176068115234
average loss: 2095.4641174316407
average loss: 2079.7478363037108
average loss: 2061.73191947937
average loss: 2037.861258951823
average loss: 2009.325751953125
average loss: 1986.8490628329191
average loss: 1963.3611854553224
average loss: 1942.2881368783803
average loss: 1917.5614663260324
average loss: 1891.9995166015624
average loss: 1868.1356231689454
average loss: 1838.0979346780216
average loss: 1812.7252888997396
average loss: 1789.740015130294
average loss: 1766.4146389770508
average loss: 1744.0600792294456
average loss: 1724.1626124988902
average loss: 1706.7006610372791
average loss: 1689.8837933858235
average loss: 1675.0576717529298
average loss: 1660.7224140460676
average loss: 1647.564229329427
average loss: 1636.2161660330637
average loss: 1625.7045904751483
average loss: 1614.019960530599
average loss: 

KeyboardInterrupt: 

In [160]:
model3 = MultilabelClassifier()
model3.to(device)

MultilabelClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=8, bias=True)
  (conv): Sequential(
    (0): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, 

In [161]:
start = time.time()
epochs = 5
training2(model3, train_dl, 3)
finish = time.time()
print(f"Train time {finish-start} seconds, {(finish-start)/epochs} per epoch")

Epoch 0
average loss: 2129.8644287109373
average loss: 2139.5966674804686
average loss: 2154.4096995035807
average loss: 2142.975350189209
average loss: 2132.8676416015624
average loss: 2123.6351572672525
average loss: 2090.849804251535
average loss: 2043.552516555786
average loss: 1980.822171359592
average loss: 1903.4624281311035
average loss: 1811.627336952903
average loss: 1710.4257383346558
average loss: 1608.3401161193847
average loss: 1513.399100630624
average loss: 1427.5774856567382
average loss: 1350.408284020424
average loss: 1281.485315883861
average loss: 1219.5851495107015
average loss: 1164.0594557711952
average loss: 1113.7658982086182
average loss: 1068.3740258716402
average loss: 1027.0556077263573
average loss: 989.1242160299549
average loss: 954.3683097998302
average loss: 922.3778828125
average loss: 892.8368297430185
average loss: 865.4958880813034
average loss: 840.1496646608625
average loss: 816.5703138219899
average loss: 794.5285026105245
average loss: 773.987

KeyboardInterrupt: 

In [191]:
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False

In [192]:
model_ft = models.resnet50(pretrained=True)
# set_parameter_requires_grad(model_ft, False)
num_ftrs = model_ft.fc.in_features
model_ft.fc = nn.Linear(num_ftrs, 8)



In [193]:
start = time.time()
epochs = 5
training2(model_ft, train_dl, 3)
finish = time.time()
print(f"Train time {finish-start} seconds, {(finish-start)/epochs} per epoch")

Epoch 0


RuntimeError: Given groups=1, weight of size [64, 3, 7, 7], expected input[8, 2, 64, 344] to have 3 channels, but got 2 channels instead

In [96]:
from operator import add
res_list = list(map(add, [3,1], [4,6]))
res_list

[7, 7]