<a href="https://colab.research.google.com/github/Gunnika/Audio_Processing/blob/main/Music_Genre_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets[audio]

In [None]:
!pip install transformers[torch]

In [None]:
!pip install accelerate -U

In [None]:
!pip install evaluate

## Loading and exploring dataset
GTZAN is a dataset for musical genre classification of audio signals. The dataset consists of 1,000 audio tracks, each 30 seconds long. It contains 10 genres, each represented by 100 tracks. The tracks are all 22,050 Hz mono 16-bit audio files in WAV format. The genres are: blues, classical, country, disco, hiphop, jazz, metal, pop, reggae, and rock.

In [None]:
from datasets import load_dataset
# The GTZAN repository is loaded through a dataset script, so `datasets` asks for
# explicit consent to run it; passing `trust_remote_code=True` gives that consent
# up front instead of relying on a warning/prompt at load time.
gtzan = load_dataset("marsyas/gtzan", "all", trust_remote_code=True)
gtzan

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 999
    })
})

In [None]:
# Hold out 10% of the tracks for evaluation, with a fixed seed for reproducibility.
# The guard keeps this cell idempotent: `gtzan` is overwritten by the split, so
# re-running the cell without it would split the already-reduced train split again.
if "test" not in gtzan:
    gtzan = gtzan["train"].train_test_split(seed=42, shuffle=True, test_size=0.1)
gtzan

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 899
    })
    test: Dataset({
        features: ['file', 'audio', 'genre'],
        num_rows: 100
    })
})

In [None]:
# Number of samples in the first training clip's waveform.
first_clip = gtzan["train"][0]["audio"]
first_clip["array"].shape

(661504,)

In [None]:
id2label_fn = gtzan["train"].features["genre"].int2str

# Collect unique genre labels.
# `Dataset.unique` reads only the integer label column; looping over the dataset
# rows instead would decode every audio clip just to look at its label.
unique_genres = set(gtzan["train"].unique("genre"))

# Map integer labels to their corresponding names (sorted by id for a stable order)
genre_names = [id2label_fn(genre_label) for genre_label in sorted(unique_genres)]

# Print all types of genres in the dataset
print("All types of genres in the dataset:")
for genre_name in genre_names:
    print(genre_name)


All types of genres in the dataset:
blues
classical
country
disco
hiphop
jazz
metal
pop
reggae
rock


## Feature Extraction for Audio Spectrogram Transformer

In [None]:
from transformers import AutoFeatureExtractor

# Hub checkpoint reused throughout the notebook, first for its preprocessing
# configuration and later for its weights.
model_id = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_id)



In [None]:
from datasets import Audio

sampling_rate = feature_extractor.sampling_rate

# GTZAN clips are stored at 22,050 Hz, while the feature extractor works at
# `sampling_rate` Hz. Re-cast the audio column so every clip is resampled to the
# extractor's rate when it is accessed; without this, 22.05 kHz samples would be
# handed to the extractor labelled with the wrong rate.
# Re-running the cell is harmless: the cast always targets the same rate.
gtzan = gtzan.cast_column("audio", Audio(sampling_rate=sampling_rate))
sampling_rate

16000

In [None]:
# Run the extractor on one training clip to inspect what it produces.
first_waveform = gtzan["train"][0]["audio"]["array"]
inputs = feature_extractor(
    first_waveform,
    sampling_rate=sampling_rate,
    return_tensors="pt",
)

In [None]:
# List the entries the feature extractor returned.
input_keys = list(inputs.keys())
print(f"inputs keys: {input_keys}")

inputs keys: ['input_values']


In [None]:
# Display the extractor output computed for the single example clip.
inputs

{'input_values': tensor([[[ 0.2897,  0.0298,  0.4067,  ...,  0.7563,  0.8342,  0.5735],
         [ 0.2441, -0.1166,  0.2602,  ...,  0.7163,  0.7613,  0.5688],
         [ 0.2622, -0.0075,  0.3694,  ...,  0.6727,  0.6937,  0.5928],
         ...,
         [ 0.4629,  0.2080,  0.5849,  ...,  0.7233,  0.7433,  0.7789],
         [ 0.3562,  0.0682,  0.4450,  ...,  0.6267,  0.6270,  0.5913],
         [ 0.3906,  0.0230,  0.3998,  ...,  0.5871,  0.8071,  0.7353]]])}

## Preprocessing the dataset with the feature extractor

In [None]:
import torch

def preprocess_function(examples):
    """Turn a batch of audio examples into feature-extractor outputs.

    Args:
        examples: Batch of dataset rows; its "audio" column is read as a
            sequence of dicts that each hold the waveform under "array".

    Returns:
        The feature extractor's output for the whole batch, as PyTorch tensors.
    """
    waveforms = []
    for audio in examples["audio"]:
        waveforms.append(audio["array"])

    # The waveforms are declared to be at the extractor's own sampling rate;
    # nothing in this function resamples them.
    return feature_extractor(
        waveforms,
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
    )

In [None]:
# Apply preprocess_function to both splits in batches of 100 examples, dropping
# the raw "audio" and "file" columns from the result.
map_kwargs = dict(
    batched=True,
    batch_size=100,
    num_proc=1,
    remove_columns=["audio", "file"],
)
gtzan_encoded = gtzan.map(preprocess_function, **map_kwargs)


In [None]:
# Rename the 'genre' column to 'label'.
# Guarded so the cell can be re-run: once the column has been renamed, a second
# rename would fail because 'genre' no longer exists.
if "genre" in gtzan_encoded["train"].column_names:
    gtzan_encoded = gtzan_encoded.rename_column("genre", "label")

In [None]:
# Display the encoded splits with their columns and row counts.
gtzan_encoded

DatasetDict({
    train: Dataset({
        features: ['label', 'input_values'],
        num_rows: 899
    })
    test: Dataset({
        features: ['label', 'input_values'],
        num_rows: 100
    })
})

In [None]:
# Shape of the encoded training split as (rows, columns).
gtzan_encoded["train"].shape

(899, 2)

In [None]:
# Shape of the encoded test split as (rows, columns).
gtzan_encoded["test"].shape

(100, 2)

In [None]:
label_names = gtzan_encoded["train"].features["label"].names

# Keys are stringified class indices; values are the genre names.
id2label = {}
for class_index in range(len(label_names)):
    id2label[str(class_index)] = id2label_fn(class_index)

# Inverse lookup: genre name -> stringified class index.
label2id = dict((name, index) for index, name in id2label.items())

id2label["7"]

'pop'

## Loading the model and setting training hyperparameters

In [None]:
from transformers import ASTForAudioClassification
# Reuse `model_id` instead of repeating the checkpoint name as a second literal.
# NOTE: no label mapping is passed here, so this is the checkpoint as published;
# `model` is reassigned in the fine-tuning section below before training starts.
model = ASTForAudioClassification.from_pretrained(model_id, low_cpu_mem_usage=True)



In [None]:
from transformers import TrainingArguments

model_name = model_id.split("/")[-1]
batch_size = 4
gradient_accumulation_steps = 1
num_train_epochs = 10

training_args = TrainingArguments(
    f"{model_name}-finetuned-gtzan",  # output directory for checkpoints and logs
    evaluation_strategy="epoch",
    save_strategy="epoch",  # matches the evaluation schedule, as needed to reload the best checkpoint
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    warmup_ratio=0.1,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # the key returned by compute_metrics
    # Mixed precision needs a CUDA device; fall back to full precision when none
    # is available so this cell also runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # NOTE(review): pushing to the Hub needs an authenticated session, and the
    # login cell currently sits after training — presumably it has to be run
    # before the Trainer is created; confirm and reorder if so.
    push_to_hub=True,
)

## Finetuning AST for the dataset

In [None]:
from transformers import AutoModelForAudioClassification

# One output class per genre.
num_labels = len(id2label)

# The checkpoint's classifier has a different number of outputs, so size
# mismatches are tolerated and the classification layer is created at the new size.
model = AutoModelForAudioClassification.from_pretrained(
    model_id,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
    ignore_mismatched_sizes=True,
)

Some weights of ASTForAudioClassification were not initialized from the model checkpoint at MIT/ast-finetuned-audioset-10-10-0.4593 and are newly initialized because the shapes did not match:
- classifier.dense.bias: found shape torch.Size([527]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.dense.weight: found shape torch.Size([527, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Setting up evaluation and training the model

In [None]:
import evaluate
import numpy as np

metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score predictions with the accuracy metric.

    Args:
        eval_pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference labels).

    Returns:
        The result of `metric.compute` for the arg-max class of every example.
    """
    class_scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(class_scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

In [None]:
import torch
# Release cached GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [None]:
from transformers import Trainer

# The feature extractor is handed over through the `tokenizer` argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=gtzan_encoded["train"],
    eval_dataset=gtzan_encoded["test"],
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2788,0.45331,0.88
2,0.3838,1.080045,0.75
3,0.3945,0.944553,0.76
4,0.0219,0.624278,0.89
5,0.0005,0.483073,0.91
6,0.0,0.62618,0.88
7,0.0001,0.482654,0.93
8,0.0,0.479399,0.93
9,0.0,0.481364,0.92


Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}
Non-default generation parameters: {'max_length': 1024}


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2788,0.45331,0.88
2,0.3838,1.080045,0.75
3,0.3945,0.944553,0.76
4,0.0219,0.624278,0.89
5,0.0005,0.483073,0.91
6,0.0,0.62618,0.88
7,0.0001,0.482654,0.93
8,0.0,0.479399,0.93
9,0.0,0.481364,0.92
10,0.0,0.48352,0.92


Non-default generation parameters: {'max_length': 1024}


TrainOutput(global_step=2250, training_loss=0.2043787725369071, metrics={'train_runtime': 1868.9857, 'train_samples_per_second': 4.81, 'train_steps_per_second': 1.204, 'total_flos': 6.122220759574118e+17, 'train_loss': 0.2043787725369071, 'epoch': 10.0})

## Uploading model checkpoints to the Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

# Authenticate this session with the Hugging Face Hub.
# NOTE(review): the training arguments above already set push_to_hub=True, so this
# login is presumably needed before the Trainer is created — confirm and consider
# moving this cell ahead of training.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Metadata passed along with the final upload.
kwargs = dict(
    dataset_tags="marsyas/gtzan",
    dataset="GTZAN",
    model_name=f"{model_name}-finetuned-gtzan",
    finetuned_from=model_id,
    tasks="audio-classification",
)

trainer.push_to_hub(**kwargs)

Non-default generation parameters: {'max_length': 1024}


events.out.tfevents.1714710127.07e0d8020a89.4612.5:   0%|          | 0.00/129k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Gunnika/ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan/commit/c7659a6697e5defeb11bb20ceaec3a8d659d20e0', commit_message='End of training', commit_description='', oid='c7659a6697e5defeb11bb20ceaec3a8d659d20e0', pr_url=None, pr_revision=None, pr_num=None)

## Loading the model from the Hugging Face Hub & building a demo with Gradio

In [None]:
from transformers import pipeline

# Repository the fine-tuned checkpoint was pushed to.
hub_model_id = "Gunnika/ast-finetuned-audioset-10-10-0.4593-finetuned-gtzan"
pipe = pipeline("audio-classification", model=hub_model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/26.9k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

In [None]:
def classify_audio(filepath):
    """Classify an audio file and return a score for each predicted label.

    Args:
        filepath: Path of the audio file to classify. May be empty (None or "")
            when no clip was supplied, e.g. the demo form submitted without audio.

    Returns:
        A dict mapping each predicted label to its score; empty when no file
        path was given.
    """
    # Nothing to classify: return an empty result instead of passing an empty
    # value on to the pipeline.
    if not filepath:
        return {}

    preds = pipe(filepath)
    return {p["label"]: p["score"] for p in preds}

In [None]:
!pip install gradio --upgrade

In [None]:
import gradio as gr

# The audio component hands classify_audio a file path; the label component
# renders the returned label -> score mapping.
audio_input = gr.Audio(type="filepath")
genre_output = gr.Label()
demo = gr.Interface(fn=classify_audio, inputs=audio_input, outputs=genre_output)

# debug=True keeps this cell running so errors and logs show up in its output.
demo.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://3728487775ecc0eff5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 270, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1847, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1433, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
    r