In [1]:
%run local_functions.py
from local_functions import *


import numpy as np
import math
import glob

import IPython.display as ipd

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

import torch
from torch import nn
import matplotlib.pyplot as plt


from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix


from scipy.special import expit, logit

import os
import ffmpeg
from math import ceil

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from datasets import Dataset, DatasetDict
from transformers import EvalPrediction
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification


plt.style.use("dark_background")

pd.set_option("display.max_columns", 2500)
pd.set_option("display.max_rows", 50)

plt.style.use("dark_background")

%load_ext lab_black

2023-09-12 10:38:49.436433: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-12 10:38:49.562047: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Split into 5 sec clips

In [6]:
audio_files = glob.glob("audio_files/**/*.mp3", recursive=True)

In [13]:

def split_audio(input_file, output_dir, clip_duration=5):
    input_file = os.path.abspath(input_file)
    output_dir = os.path.abspath(output_dir)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the duration of the input audio file
    probe = ffmpeg.probe(
        input_file, v="error", select_streams="a:0", show_entries="format=duration"
    )
    duration = float(probe["format"]["duration"])

    # Calculate the number of clips needed
    num_clips = ceil(duration / clip_duration)

    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(input_file))[0]

    # Split the audio into clips
    for i in range(num_clips):
        start_time = i * clip_duration
        end_time = min((i + 1) * clip_duration, duration)

        # Generate a unique filename for each clip
        clip_filename = (
            f"{base_filename}_clip_{i + 1}.mp3"  # Change the extension if needed
        )
        output_file = os.path.join(output_dir, clip_filename)

        ffmpeg.input(input_file, ss=start_time, to=end_time).output(output_file).run(
            overwrite_output=True
        )

    print(f"{input_file} split into {num_clips} clips")

In [16]:
# for file in audio_files:
#    artist = file.split("/")[1].split()[0]

#    if __name__ == "__main__":
#        input_audio = file  # Replace with your input audio file
#        output_directory = os.path.join(
#            "split_train_files", artist
#        )  # Construct the output directory path

#       split_audio(input_audio, output_directory)

# Build and Train 

In [2]:
files = get_random_files_from_subdirs("split_train_files", num_files=100)
labels = [file.split()[0].split("/")[-1] for file in files]
df = pd.DataFrame({"file": files, "label": labels})

In [3]:
df["features"] = df["file"].apply(file_to_librosa_features, args=(160000,))
df.drop("file", axis=1, inplace=True)

In [4]:
encoded_df = pd.get_dummies(df["label"], columns=["label"], prefix="", prefix_sep="")
encoded_df = encoded_df.astype(bool)
df = pd.concat([df["features"], encoded_df], axis=1)

In [5]:
df.features[0].shape

(800000,)

In [6]:
train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert the split DataFrames into Datasets
train = Dataset.from_pandas(train_df, split="train")
valid = Dataset.from_pandas(valid_df, split="validation")
test = Dataset.from_pandas(test_df, split="test")

dataset = DatasetDict({"train": train, "validation": valid, "test": test})

In [7]:
labels = [
    label
    for label in dataset["train"].features.keys()
    if label not in ["features", "__index_level_0__"]
]
id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

In [8]:
extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

In [9]:
def preprocess_data(examples):
    # take a batch of texts
    features = examples["features"]
    # encode them
    encoding = extractor(features, sampling_rate=16000, return_tensors="pt")
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(features), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]

    encoding["labels"] = labels_matrix.tolist()

    return encoding

In [10]:
encoded_dataset = dataset.map(
    preprocess_data, batched=True, remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

In [12]:
encoded_dataset.set_format("torch")

In [11]:
encoded_dataset

NameError: name 'encoded_dataset' is not defined

In [36]:
encoded_dataset["train"]["input_values"][2]

tensor([[-0.9753, -1.2468, -0.8700,  ..., -1.2776, -1.2776, -1.2776],
        [-1.1615, -1.0551, -0.6783,  ..., -0.7615, -1.2640, -1.2776],
        [-0.5504, -0.9755, -0.5987,  ..., -0.4456, -1.0860, -1.2776],
        ...,
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670],
        [ 0.4670,  0.4670,  0.4670,  ...,  0.4670,  0.4670,  0.4670]])

In [20]:

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([12]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
batch_size = 2

args = TrainingArguments(
    f"ast-finetuned-ks",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-8,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # push_to_hub=True,
)

In [23]:
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(y_true, y_pred, average="micro")
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics_2(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=extractor,
    compute_metrics=compute_metrics_2,
)

In [37]:
trainer.train()

  0%|          | 0/525 [00:00<?, ?it/s]

{'loss': 0.7336, 'learning_rate': 5.660377358490566e-09, 'epoch': 0.1}
{'loss': 0.7342, 'learning_rate': 1.1320754716981132e-08, 'epoch': 0.19}
{'loss': 0.7403, 'learning_rate': 1.6981132075471695e-08, 'epoch': 0.29}
{'loss': 0.7204, 'learning_rate': 2.2641509433962263e-08, 'epoch': 0.38}
{'loss': 0.724, 'learning_rate': 2.830188679245283e-08, 'epoch': 0.48}
{'loss': 0.7154, 'learning_rate': 2.955508474576271e-08, 'epoch': 0.57}
{'loss': 0.7285, 'learning_rate': 2.8919491525423726e-08, 'epoch': 0.67}
{'loss': 0.7285, 'learning_rate': 2.8283898305084745e-08, 'epoch': 0.76}
{'loss': 0.7118, 'learning_rate': 2.764830508474576e-08, 'epoch': 0.86}
{'loss': 0.7167, 'learning_rate': 2.701271186440678e-08, 'epoch': 0.95}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.7002573609352112, 'eval_f1': 0.1608040201005025, 'eval_roc_auc': 0.5348484848484848, 'eval_accuracy': 0.0, 'eval_runtime': 8.0687, 'eval_samples_per_second': 22.308, 'eval_steps_per_second': 11.154, 'epoch': 1.0}
{'loss': 0.6997, 'learning_rate': 2.6377118644067792e-08, 'epoch': 1.05}
{'loss': 0.7068, 'learning_rate': 2.574152542372881e-08, 'epoch': 1.14}
{'loss': 0.7109, 'learning_rate': 2.5105932203389827e-08, 'epoch': 1.24}
{'loss': 0.6887, 'learning_rate': 2.4470338983050847e-08, 'epoch': 1.33}
{'loss': 0.682, 'learning_rate': 2.3834745762711862e-08, 'epoch': 1.43}
{'loss': 0.6779, 'learning_rate': 2.319915254237288e-08, 'epoch': 1.52}
{'loss': 0.7022, 'learning_rate': 2.2563559322033897e-08, 'epoch': 1.62}
{'loss': 0.6851, 'learning_rate': 2.1927966101694913e-08, 'epoch': 1.71}
{'loss': 0.6814, 'learning_rate': 2.129237288135593e-08, 'epoch': 1.81}
{'loss': 0.6788, 'learning_rate': 2.0656779661016948e-08, 'epoch': 1.9}
{'loss': 0.6697, 'learning_rate': 2.0021186440

  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.6683839559555054, 'eval_f1': 0.16431095406360427, 'eval_roc_auc': 0.5414141414141413, 'eval_accuracy': 0.0, 'eval_runtime': 7.9207, 'eval_samples_per_second': 22.725, 'eval_steps_per_second': 11.363, 'epoch': 2.0}
{'loss': 0.6735, 'learning_rate': 1.9385593220338983e-08, 'epoch': 2.1}
{'loss': 0.6552, 'learning_rate': 1.875e-08, 'epoch': 2.19}
{'loss': 0.668, 'learning_rate': 1.8114406779661015e-08, 'epoch': 2.29}
{'loss': 0.665, 'learning_rate': 1.747881355932203e-08, 'epoch': 2.38}
{'loss': 0.6903, 'learning_rate': 1.6843220338983047e-08, 'epoch': 2.48}
{'loss': 0.6568, 'learning_rate': 1.6207627118644066e-08, 'epoch': 2.57}
{'loss': 0.6533, 'learning_rate': 1.5572033898305082e-08, 'epoch': 2.67}
{'loss': 0.6652, 'learning_rate': 1.49364406779661e-08, 'epoch': 2.76}
{'loss': 0.652, 'learning_rate': 1.4300847457627117e-08, 'epoch': 2.86}
{'loss': 0.6478, 'learning_rate': 1.3665254237288134e-08, 'epoch': 2.95}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.6462792158126831, 'eval_f1': 0.1541318477251625, 'eval_roc_auc': 0.5250000000000001, 'eval_accuracy': 0.0, 'eval_runtime': 7.9183, 'eval_samples_per_second': 22.732, 'eval_steps_per_second': 11.366, 'epoch': 3.0}
{'loss': 0.6421, 'learning_rate': 1.3029661016949152e-08, 'epoch': 3.05}
{'loss': 0.6236, 'learning_rate': 1.239406779661017e-08, 'epoch': 3.14}
{'loss': 0.6585, 'learning_rate': 1.1758474576271185e-08, 'epoch': 3.24}
{'loss': 0.644, 'learning_rate': 1.1122881355932203e-08, 'epoch': 3.33}
{'loss': 0.647, 'learning_rate': 1.048728813559322e-08, 'epoch': 3.43}
{'loss': 0.6455, 'learning_rate': 9.851694915254236e-09, 'epoch': 3.52}
{'loss': 0.6479, 'learning_rate': 9.216101694915254e-09, 'epoch': 3.62}
{'loss': 0.6418, 'learning_rate': 8.580508474576271e-09, 'epoch': 3.71}
{'loss': 0.6501, 'learning_rate': 7.944915254237289e-09, 'epoch': 3.81}
{'loss': 0.6415, 'learning_rate': 7.3093220338983044e-09, 'epoch': 3.9}
{'loss': 0.6432, 'learning_rate': 6.67372881355932

  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.6333296895027161, 'eval_f1': 0.14942528735632182, 'eval_roc_auc': 0.5181818181818182, 'eval_accuracy': 0.0, 'eval_runtime': 7.927, 'eval_samples_per_second': 22.707, 'eval_steps_per_second': 11.354, 'epoch': 4.0}
{'loss': 0.6285, 'learning_rate': 6.038135593220338e-09, 'epoch': 4.1}
{'loss': 0.6335, 'learning_rate': 5.402542372881355e-09, 'epoch': 4.19}
{'loss': 0.6431, 'learning_rate': 4.766949152542372e-09, 'epoch': 4.29}
{'loss': 0.6405, 'learning_rate': 4.1313559322033895e-09, 'epoch': 4.38}
{'loss': 0.6385, 'learning_rate': 3.495762711864406e-09, 'epoch': 4.48}
{'loss': 0.6227, 'learning_rate': 2.8601694915254233e-09, 'epoch': 4.57}
{'loss': 0.6395, 'learning_rate': 2.2245762711864404e-09, 'epoch': 4.67}
{'loss': 0.6371, 'learning_rate': 1.5889830508474575e-09, 'epoch': 4.76}
{'loss': 0.6403, 'learning_rate': 9.533898305084744e-10, 'epoch': 4.86}
{'loss': 0.6307, 'learning_rate': 3.177966101694915e-10, 'epoch': 4.95}


  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.6290994882583618, 'eval_f1': 0.1475728155339806, 'eval_roc_auc': 0.5156565656565657, 'eval_accuracy': 0.0, 'eval_runtime': 7.917, 'eval_samples_per_second': 22.736, 'eval_steps_per_second': 11.368, 'epoch': 5.0}
{'train_runtime': 578.6124, 'train_samples_per_second': 7.259, 'train_steps_per_second': 0.907, 'train_loss': 0.6708842518216088, 'epoch': 5.0}


TrainOutput(global_step=525, training_loss=0.6708842518216088, metrics={'train_runtime': 578.6124, 'train_samples_per_second': 7.259, 'train_steps_per_second': 0.907, 'train_loss': 0.6708842518216088, 'epoch': 5.0})

In [38]:
trainer.evaluate()

  0%|          | 0/90 [00:00<?, ?it/s]

{'eval_loss': 0.7002573609352112,
 'eval_f1': 0.1608040201005025,
 'eval_roc_auc': 0.5348484848484848,
 'eval_accuracy': 0.0,
 'eval_runtime': 7.8671,
 'eval_samples_per_second': 22.88,
 'eval_steps_per_second': 11.44,
 'epoch': 5.0}

# Eval 

In [30]:
model = AutoModelForAudioClassification.from_pretrained(
    "ast-finetuned-ks/checkpoint-525",
    num_labels=len(id2label),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

extractor = AutoFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

trainer = Trainer(model=model, tokenizer=extractor)

In [31]:
files = get_random_files_from_subdirs("split_train_files", num_files=100)
labels = [file.split()[0].split("/")[-1] for file in files]
df_test = pd.DataFrame({"file": files, "label": labels})
df_test["features"] = df_test["file"].apply(file_to_librosa_features)
df_test.drop("file", axis=1, inplace=True)

df_test = df_test.sample(frac=1).reset_index(drop=True)

In [32]:
df_test

Unnamed: 0,label,features
0,Gould,"[-4.8818765e-07, -9.2718983e-07, -6.4910273e-0..."
1,Schiff,"[-0.008452152, -0.017701644, -0.018430043, -0...."
2,Horowitz,"[-7.096678e-07, -1.4826655e-06, -1.5776604e-06..."
3,Richter,"[-1.6298145e-09, 1.3969839e-09, -1.7462298e-09..."
4,Crochet,"[-2.7939677e-09, -1.3969839e-09, -1.3969839e-0..."
...,...,...
1195,Pogorelich,"[-5.005859e-09, -6.4028427e-09, -2.3283064e-09..."
1196,Pogorelich,"[-9.313226e-10, -1.2805685e-09, -5.820766e-10,..."
1197,Gould,"[1.7369166e-07, 3.091991e-07, 2.5704503e-07, 2..."
1198,Schiff,"[-1.6778418e-10, -1.03521386e-10, -3.929264e-1..."


In [33]:
def inference(initial_features, trainer, extractor, id2label, CONFIDENCE_THRESHOLD=0.5):
    encoding = extractor(initial_features, sampling_rate=16000, return_tensors="pt")
    encoding = {k: v.to(trainer.model.device) for k, v in encoding.items()}

    outputs = trainer.model(**encoding)

    logits = outputs.logits

    # apply sigmoid + threshold
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    predictions = np.zeros(probs.shape)

    predictions[np.where(probs >= CONFIDENCE_THRESHOLD)] = 1
    # turn predicted id's into actual label names
    predicted_labels = [
        id2label[idx] for idx, label in enumerate(predictions) if label == 1.0
    ]

    return predicted_labels

In [34]:
ROWS_TO_EVALUATE = 500
CONFIDENCE_THRESHOLD = 0.6

df_test["predicted_label"] = df_test["features"][0:ROWS_TO_EVALUATE].apply(
    inference, args=(trainer, extractor, id2label, CONFIDENCE_THRESHOLD)
)

df_test["correct"] = df_test.apply(
    lambda row: int(str(row["label"]) in str(row["predicted_label"])), axis=1
)

df_test["top_n_preds"] = df_test["features"][0:ROWS_TO_EVALUATE].apply(
    n_most_likely_classes, args=(trainer, extractor, id2label, 5)
)

In [35]:
df_test[0:ROWS_TO_EVALUATE]

Unnamed: 0,label,features,predicted_label,correct,top_n_preds
0,Gould,"[-4.8818765e-07, -9.2718983e-07, -6.4910273e-0...","[Horowitz, Richter, Tureck]",0,"{'Tureck': 0.654, 'Richter': 0.639, 'Horowitz'..."
1,Schiff,"[-0.008452152, -0.017701644, -0.018430043, -0....","[Horowitz, Tureck]",0,"{'Horowitz': 0.661, 'Tureck': 0.611, 'Schiff':..."
2,Horowitz,"[-7.096678e-07, -1.4826655e-06, -1.5776604e-06...","[Horowitz, Ishizaka]",1,"{'Horowitz': 0.699, 'Ishizaka': 0.657, 'Richte..."
3,Richter,"[-1.6298145e-09, 1.3969839e-09, -1.7462298e-09...",[Horowitz],0,"{'Horowitz': 0.753, 'Tureck': 0.57, 'Richter':..."
4,Crochet,"[-2.7939677e-09, -1.3969839e-09, -1.3969839e-0...","[Richter, Tureck]",0,"{'Tureck': 0.638, 'Richter': 0.621, 'Horowitz'..."
...,...,...,...,...,...
495,Richter,"[-8.940697e-08, 2.6077032e-08, -4.7683716e-07,...",[Horowitz],0,"{'Horowitz': 0.674, 'Tureck': 0.6, 'Ishizaka':..."
496,Gould,"[-1.6746654e-10, -7.866863e-11, -1.3521877e-10...","[Horowitz, Tureck]",0,"{'Tureck': 0.666, 'Horowitz': 0.652, 'Ishizaka..."
497,Tureck,"[-3.434252e-08, -1.14087015e-08, -3.934838e-08...",[Richter],0,"{'Richter': 0.615, 'Moravec': 0.581, 'Ishizaka..."
498,Gould,"[-2.0256266e-07, -3.259629e-07, -2.9057264e-07...","[Horowitz, Ishizaka, Tureck]",0,"{'Horowitz': 0.644, 'Tureck': 0.64, 'Ishizaka'..."


In [36]:
df_test[0:ROWS_TO_EVALUATE].correct.sum() / ROWS_TO_EVALUATE

0.18