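**dasheng-base does not support variable-length audio inputs, so each recording is split into fixed-length, overlapping chunks before it is passed to the feature extractor.**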

In [1]:
from dasheng_model.feature_extraction_dasheng import DashengFeatureExtractor
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Audio
import numpy as np
import torch
import json
import evaluate

In [2]:
# Read the shared project settings once; the file handle is only needed for parsing.
with open("../config.json", mode = "r") as f:
    data = json.load(f)

# Audio settings used by the chunking code further down.
# SEGMENT_LEN and OVERLAP_LEN are multiplied by SAMPLING_RATE there,
# so they are presumably expressed in seconds (confirm against config.json).
SAMPLING_RATE = data["sampling_rate"]
SEGMENT_LEN = data["segment_length"]
OVERLAP_LEN = data["overlap_length"]

In [3]:
# Feature extractor loaded from the "mispeech/dasheng-base" checkpoint; the
# chunking function below calls it on every audio chunk to build "input_values".
extractor = DashengFeatureExtractor.from_pretrained("mispeech/dasheng-base")



In [4]:
# Load the training split and encode the species name column as class labels
# so that the label names can be read from dataset.features afterwards.
dataset = load_dataset(
    "Saads/xecanto_birds",
    split = "train"
).class_encode_column("common_name")

Resolving data files:   0%|          | 0/11032 [00:00<?, ?it/s]

In [5]:
# Build the two lookup tables handed to the model config: class name -> id and
# id -> class name. Ids are stored as strings, matching the original notebook.
labels = dataset.features["common_name"].names
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

In [6]:
# Hold out 20% of the recordings for evaluation. The split happens before
# chunking, so all chunks of one recording land on the same side.
# A fixed seed (same value as the later shuffle) keeps the partition, and
# therefore the reported metrics, reproducible across Restart & Run All.
dataset = dataset.train_test_split(test_size = 0.2, seed = 42)

In [7]:
# Metadata columns that are not used for training; only the audio and the
# species label ("common_name") are kept.
unused_columns = [
    "primary_label", "secondary_labels", "scientific_name",
    "author", "license", "rating", "type",
    "latitude", "longitude", "url",
]
dataset = dataset.remove_columns(unused_columns)
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'common_name'],
        num_rows: 8824
    })
    test: Dataset({
        features: ['audio', 'common_name'],
        num_rows: 2207
    })
})

In [8]:
# Peek at the first three training rows (decoded audio plus label).
dataset["train"][:3]

{'audio': [{'path': '/users/labnet5/gr5/abahari/.cache/huggingface/hub/datasets--Saads--xecanto_birds/snapshots/a0cca0425468f94c84acd14479327b43c3c06084/mabeat1/XC591667.ogg',
   'array': array([-3.12357679e-06,  9.21771857e-07,  1.82094045e-05, ...,
           2.39509536e-05,  9.46432010e-06, -1.19023462e-05]),
   'sampling_rate': 32000},
  {'path': '/users/labnet5/gr5/abahari/.cache/huggingface/hub/datasets--Saads--xecanto_birds/snapshots/a0cca0425468f94c84acd14479327b43c3c06084/abethr1/XC363503.ogg',
   'array': array([ 1.38384030e-06, -1.17889140e-05,  1.05290583e-05, ...,
          -1.04965176e-04, -1.32110901e-04, -3.10554635e-04]),
   'sampling_rate': 32000},
  {'path': '/users/labnet5/gr5/abahari/.cache/huggingface/hub/datasets--Saads--xecanto_birds/snapshots/a0cca0425468f94c84acd14479327b43c3c06084/eaywag1/XC118267.ogg',
   'array': array([-3.12964548e-05, -2.68409913e-05, -9.95137452e-06, ...,
           2.18505065e-05,  1.52069197e-05, -2.19970061e-05]),
   'sampling_rate': 

In [9]:
def extract_chunked_audio(audio_array, chunk_length = SEGMENT_LEN, overlap = OVERLAP_LEN):
    """Split a waveform into fixed-length overlapping chunks and featurize each one.

    Args:
        audio_array: Sequence of audio samples at SAMPLING_RATE. It is sliced,
            measured with len() and padded along a single axis, so a 1-D
            array is assumed.
        chunk_length: Length of every chunk, in seconds.
        overlap: Overlap between consecutive chunks, in seconds.

    Returns:
        A list with one entry per chunk: the extractor's "input_values" with
        the leading axis squeezed away. Audio shorter than one chunk yields a
        single zero-padded chunk; empty audio yields an empty list.

    Raises:
        ValueError: If the chunk length is not positive, or if the overlap is
            not smaller than the chunk length (the window would never advance).
    """
    # Work in whole samples: slicing and padding need ints even when the
    # configured durations are floats.
    chunk_samples = int(round(chunk_length * SAMPLING_RATE))
    overlap_samples = int(round(overlap * SAMPLING_RATE))
    if chunk_samples <= 0:
        raise ValueError("chunk_length must correspond to at least one sample")
    hop = chunk_samples - overlap_samples
    if hop <= 0:
        # Without this guard the loop below would never terminate.
        raise ValueError("overlap must be smaller than chunk_length")

    def _featurize(samples):
        # Run the feature extractor on exactly one chunk worth of samples.
        return extractor(
            samples,
            sampling_rate = SAMPLING_RATE,
            max_length = chunk_samples,
            truncation = True
        )["input_values"].squeeze(0)

    total = len(audio_array)
    chunks = []
    start = 0
    covered_end = 0  # end index (exclusive) of the last full chunk emitted
    while start + chunk_samples <= total:
        covered_end = start + chunk_samples
        chunks.append(_featurize(audio_array[start:covered_end]))
        start += hop

    # Emit a zero-padded tail only when it holds samples that no full chunk
    # has covered yet; otherwise it would just repeat the overlap region of
    # the previous chunk followed by silence.
    if start < total and total > covered_end:
        tail = audio_array[start:]
        chunks.append(_featurize(np.pad(tail, (0, chunk_samples - len(tail)))))

    return chunks

In [10]:
def preprocess(row):
    """Turn one recording into its list of chunk features plus matching labels.

    Args:
        row: A dataset row providing row["audio"]["array"] and row["common_name"].

    Returns:
        A dict with "input_values" (the chunk features) and "common_name"
        (the row's label repeated once per chunk).
    """
    chunk_features = extract_chunked_audio(row["audio"]["array"])
    return {
        "input_values": chunk_features,
        "common_name": [row["common_name"] for _ in chunk_features],
    }

In [11]:
def concate(batch):
    """Flatten a batch of per-recording lists into one entry per chunk.

    Args:
        batch: A dict whose "input_values" and "common_name" values are lists
            of lists (one inner list per recording).

    Returns:
        A dict with "concate_input_values" and "chunked_common_name", each the
        concatenation of the corresponding inner lists in their original order.
    """
    flat_inputs = []
    for recording_chunks in batch["input_values"]:
        flat_inputs.extend(recording_chunks)

    flat_labels = []
    for recording_labels in batch["common_name"]:
        flat_labels.extend(recording_labels)

    return {
        "concate_input_values": flat_inputs,
        "chunked_common_name": flat_labels,
    }

In [12]:
# Have the audio column decoded at the configured sampling rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate = SAMPLING_RATE))

# Stage 1 (row-wise): each recording becomes a list of chunk features and a
# list with its label repeated once per chunk.
dataset = dataset.map(
    preprocess,
    batched = False,
    remove_columns = "audio",
    num_proc = 16,
    writer_batch_size = 500 #200
)

# Stage 2 (batched): flatten those lists so that every chunk is its own row.
dataset = dataset.map(
    concate,
    batched = True,
    batch_size = 16,
    remove_columns = ["input_values", "common_name"],
    num_proc = 16,
    writer_batch_size = 500 #100
)

# Sanity check: length of the first training chunk's feature along its first axis.
len(dataset["train"][0]["concate_input_values"])

Map (num_proc=16):   0%|          | 0/8824 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/2207 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/8824 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/2207 [00:00<?, ? examples/s]

64

In [13]:
# Give the flattened columns the names passed on to the Trainer ("input_values"
# and "label"), then shuffle with a fixed seed.
dataset = (
    dataset
    .rename_column("concate_input_values", "input_values")
    .rename_column("chunked_common_name", "label")
    .shuffle(seed = 42)
)

In [14]:
# dataset["train"][0]

In [15]:
from dasheng_model.modeling_dasheng import DashengModel

# One output unit per bird species.
num_classes = len(id2label)

model = DashengModel.from_pretrained(
    "mispeech/dasheng-base",
    outputdim = num_classes,
    num_labels = num_classes,
    label2id = label2id,
    id2label = id2label
)

# Freeze the encoder (presumably leaving only the newly initialised output
# layer trainable - confirm in modeling_dasheng) and select the loss by name.
model.freeze_encoder()
model.config.loss = "CrossEntropyLoss"



config.json:   0%|          | 0.00/391 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/342M [00:00<?, ?B/s]

Some weights of DashengModel were not initialized from the model checkpoint at mispeech/dasheng-base and are newly initialized: ['outputlayer.0.bias', 'outputlayer.1.weight', 'outputlayer.1.bias', 'outputlayer.0.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Accuracy metric loaded through the `evaluate` library; compute_metrics calls
# its .compute() on the predicted and reference class ids.
accuracy = evaluate.load("accuracy")

In [17]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `.predictions` and `.label_ids`. The first
            element of `.predictions` is used as the per-class scores
            (the original author notes the `[0]` was needed for this model).

    Returns:
        Whatever `accuracy.compute` returns for the arg-max predictions.
    """
    class_scores = eval_pred.predictions[0]
    predicted_ids = np.argmax(class_scores, axis = 1)
    return accuracy.compute(
        references = eval_pred.label_ids,
        predictions = predicted_ids
    )

In [18]:
training_args = TrainingArguments(
    output_dir = "checkpoints-10-2",
    # Optimisation schedule
    num_train_epochs = 10,
    learning_rate = 5e-4,
    warmup_ratio = 0,
    fp16 = True,
    # Batching and data loading
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    # gradient_accumulation_steps = 4,
    eval_accumulation_steps = 2,
    dataloader_num_workers = 16,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best eval_loss when training finishes.
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    logging_steps = 10
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = dataset["train"],
    eval_dataset = dataset["test"],
    compute_metrics = compute_metrics,
    tokenizer = extractor
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,4.9955,5.070867,0.198428
2,4.908,5.021171,0.255893
3,4.8656,5.000422,0.277951
4,4.8516,4.989946,0.294708
5,4.8548,4.983718,0.298495
6,4.8068,4.978,0.305027
7,4.7724,4.973799,0.315062
8,4.7657,4.97104,0.323488
9,4.7641,4.969545,0.320174
10,4.7806,4.969994,0.320553


TrainOutput(global_step=26510, training_loss=4.851824414725575, metrics={'train_runtime': 8147.1207, 'train_samples_per_second': 52.044, 'train_steps_per_second': 3.254, 'total_flos': 1.3959849454373224e+19, 'train_loss': 4.851824414725575, 'epoch': 10.0})