# **Music Genre Classification using HuBERT**: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units
**HuBERT** is an innovative self-supervised learning method developed for extracting high-quality speech representations from raw audio data. The model was released in June 2021 by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, and Abdelrahman Mohamed.

The key idea is to leverage masked prediction of hidden units to enable the model to learn meaningful and robust representations of speech without requiring labeled data. This approach has been shown to be highly effective for various downstream tasks in speech processing, such as automatic speech recognition (ASR), speaker identification, and more. Learn more about HuBERT [here.](https://arxiv.org/pdf/2106.07447)

In this notebook, we fine-tune HuBERT for the **music genre classification** task.

In [None]:
# Mount Google Drive; the dataset and the model output directory used later
# in this notebook both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

# Session environment: UTF-8 locale, and both Hugging Face caches
# (models and datasets) pointed at /content/cache.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/content/cache
%env HF_DATASETS_CACHE=/content/cache
# NOTE(review): presumably set to make CUDA kernel launches synchronous for
# easier debugging; this usually slows training — confirm it is still wanted.
%env CUDA_LAUNCH_BLOCKING=1

# Third-party packages installed into the Colab runtime.
# NOTE(review): versions are unpinned, so a fresh run may resolve to newer
# releases than the ones this notebook was originally executed with.
!pip install evaluate
#!pip install git+https://github.com/huggingface/datasets.git
#!pip install git+https://github.com/huggingface/transformers.git
# NOTE(review): jiwer is not referenced in the visible cells — confirm it is needed.
!pip install jiwer
!pip install torchaudio
!pip install librosa
!pip install transformers[torch]

from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification
from transformers import TrainingArguments
from transformers import Trainer

import evaluate
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
import torchaudio
from sklearn.model_selection import train_test_split
import os
import sys

Mounted at /content/drive
env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/content/cache
env: HF_DATASETS_CACHE=/content/cache
env: CUDA_LAUNCH_BLOCKING=1
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m737.7 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194



In [None]:
# One record (dict) is appended here for every audio clip that gets selected.
data = []

# Upper bound on the number of clips kept for any single genre.
target_count = 120

# Running tally of clips kept per genre. The keys are looked up with the
# name of each file's parent folder, so they must match the folder names
# exactly (including spellings such as 'reagge').
genre_counts = dict.fromkeys(
    [
        'house', 'R&B, Soul', 'trap', 'reagge', 'rock', 'jazz', 'pop', 'metal', 'rap',
        'latin', 'future bass', 'dnb', 'classical', 'dubstep', 'blues', 'folk',
        'country', 'hardstyle',
    ],
    0,
)

def should_select(label):
    """Tell whether another clip of genre ``label`` may still be kept.

    Reads the module-level ``genre_counts`` tally and ``target_count`` cap.

    Args:
        label: Genre name; must be a key of ``genre_counts``.

    Returns:
        True while fewer than ``target_count`` clips of this genre have
        been counted, False otherwise.

    Raises:
        KeyError: If ``label`` is not a key of ``genre_counts``.
    """
    kept_so_far = genre_counts[label]
    return kept_so_far < target_count

# Scan the dataset folder and keep at most `target_count` clips per genre.
# The paths are sorted first: raw glob order depends on the filesystem, so
# without sorting the set of kept clips (and therefore the seeded train/test
# split built from `df`) could differ from run to run.
for path in tqdm(sorted(Path("/content/drive/MyDrive/data").glob("**/*.wav"))):
    # The name of the containing folder is used as the genre label.
    label = path.parent.name

    if should_select(label):
        data.append({
            # `stem` drops only the final extension, so file names that
            # themselves contain dots are not truncated.
            "name": path.stem,
            "path": path,
            "genre": label
        })

        genre_counts[label] += 1

df = pd.DataFrame(data)

print("Labels: ", df["genre"].unique())
print()
# Last expression of the cell: number of selected clips per genre.
df.groupby("genre").count()[["path"]]

2167it [00:00, 15213.97it/s]

Labels:  ['country' 'dnb' 'house' 'jazz' 'hardstyle' 'classical' 'future bass'
 'blues' 'folk' 'dubstep' 'metal' 'reagge' 'latin' 'rock' 'pop' 'trap'
 'R&B, Soul' 'rap']






Unnamed: 0_level_0,path
genre,Unnamed: 1_level_1
"R&B, Soul",120
blues,120
classical,120
country,120
dnb,120
dubstep,120
folk,120
future bass,120
hardstyle,120
house,120


In [None]:
save_path = "/content/drive/MyDrive/data"

# Stratified 80/20 split so every genre keeps the same share in both parts;
# the fixed random_state makes the split repeatable for a given `df`.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["genre"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Persist both parts as tab-separated files next to the audio data.
train_df.to_csv(f"{save_path}/train.csv", sep="\t", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", sep="\t", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

# Built from `save_path` so the files read back are always the ones just written.
data_files = {
    "train": f"{save_path}/train.csv",
    "validation": f"{save_path}/test.csv",
}

# Only `load_dataset` is imported: `load_metric` was unused in this notebook
# (the metric is loaded through `evaluate`) and is no longer available in
# recent `datasets` releases, where importing it raises ImportError.
from datasets import load_dataset

dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

print(train_dataset)
print(eval_dataset)

# Column holding the audio file path, and column holding the genre label.
input_column = "path"
output_column = "genre"

# The position of a genre in `label_list` is its integer class id.
label_list = train_dataset.unique(output_column)
label_list.sort(reverse=True)  # Let's sort it for determinism
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes: {label_list}")

(1728, 3)
(432, 3)


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['name', 'path', 'genre'],
    num_rows: 1728
})
Dataset({
    features: ['name', 'path', 'genre'],
    num_rows: 432
})
A classification problem with 18 classes: ['trap', 'rock', 'reagge', 'rap', 'pop', 'metal', 'latin', 'jazz', 'house', 'hardstyle', 'future bass', 'folk', 'dubstep', 'dnb', 'country', 'classical', 'blues', 'R&B, Soul']


In [None]:
# Identifier of the pretrained checkpoint; reused later when the
# classification model itself is loaded.
model = "SeyedAli/Musical-genres-Classification-Hubert-V1"

# Feature extractor matching the checkpoint, configured to normalise its
# input and to return an attention mask.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model,
    do_normalize=True,
    return_attention_mask=True,
)

# Sampling rate the extractor is configured for.
sampling_rate = feature_extractor.sampling_rate
print(f'Musical-genres-Classification-Hubert-V1 Sampling Rate: {sampling_rate} Hz')

Musical-genres-Classification-Hubert-V1 Sampling Rate: 16000 Hz


In [None]:
import librosa
import torch

# Show the first training row (name, path, genre) as a quick look at the data.
first_sample = train_dataset[0]
print(first_sample)

def speech_file_to_array_fn(path, sampling_rate=16000):
    """Load the audio file at ``path`` and return its samples.

    Args:
        path: Location of the audio file, handed to ``librosa.load``.
        sampling_rate: Forwarded to ``librosa.load`` as ``sr``
            (defaults to 16000).

    Returns:
        The audio array produced by ``librosa.load``; the sample rate that
        ``librosa.load`` returns alongside it is discarded.
    """
    return librosa.load(path, sr=sampling_rate)[0]

# Run the first training clip through the feature extractor.
audio = speech_file_to_array_fn(first_sample['path'])
inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt", padding=True)

# Sanity check only (nothing is normalised here): measure the mean and
# variance of the extractor output, which was created with do_normalize=True.
mean = inputs['input_values'].mean()
variance = inputs['input_values'].var()

print(f"Mean: {mean.item()}")
print(f"Variance: {variance.item()}")


{'name': 'Féa - Shades of Blue (Official Video)_3', 'path': '/content/drive/MyDrive/data/house/Féa - Shades of Blue (Official Video)_3.wav', 'genre': 'house'}
Mean: -4.597505043335559e-09
Variance: 1.000001311302185


In [None]:
# Longest clip length, in seconds, kept per example: preprocess_function
# multiplies it by the sampling rate to obtain the truncation length.
max_duration = 30.0 # 30 seconds

def label_to_id(label, label_list):
    """Translate a label into its position within ``label_list``.

    Args:
        label: The label to translate.
        label_list: Sequence of known labels; the index of a label is its id.

    Returns:
        ``label`` itself when ``label_list`` is empty, the index of
        ``label`` in ``label_list`` when it is present, and -1 otherwise.
    """
    # With no known labels there is nothing to map against.
    if len(label_list) == 0:
        return label

    if label in label_list:
        return label_list.index(label)

    # Unknown label.
    return -1

def preprocess_function(examples):
    """Convert a batch of dataset rows into model inputs plus integer labels.

    Intended for ``Dataset.map(..., batched=True)``: ``examples`` is indexed
    by column name and each column holds one value per row in the batch.

    Args:
        examples: Batch with an ``input_column`` column of audio file paths
            and an ``output_column`` column of genre names.

    Returns:
        The feature extractor's output for the batch, with an extra
        ``"labels"`` entry holding one class id per row (as produced by
        ``label_to_id`` against ``label_list``).
    """
    # Decode every clip at the rate the extractor is configured for, so the
    # rate declared to the extractor below always matches the actual audio,
    # rather than relying on the loader's default matching it.
    target_rate = feature_extractor.sampling_rate

    audio_list = [
        speech_file_to_array_fn(path, sampling_rate=target_rate)
        for path in examples[input_column]
    ]
    target_list = [label_to_id(label, label_list) for label in examples[output_column]]

    # Preprocessing audio inputs: clips longer than `max_duration` seconds
    # are truncated to that length.
    inputs = feature_extractor(audio_list,
                               sampling_rate = target_rate,
                               max_length = int(target_rate * max_duration),
                               truncation = True,
                               return_attention_mask = True)
    inputs["labels"] = target_list

    return inputs

In [None]:
# Apply the same preprocessing to both splits (train first, then eval):
# batches of 100 rows, spread over 4 worker processes.
train_dataset, eval_dataset = (
    split.map(preprocess_function, batched=True, batch_size=100, num_proc=4)
    for split in (train_dataset, eval_dataset)
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/1728 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/432 [00:00<?, ? examples/s]

In [None]:
# Load the checkpoint with a classification head sized for this dataset's
# genres; a genre's id is its position in `label_list`.
# ignore_mismatched_sizes=True is passed so that a shape mismatch between the
# checkpoint weights and this configuration (presumably the classifier head,
# trained for a different label set) does not abort loading.
hubert_model = AutoModelForAudioClassification.from_pretrained(
    model,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/1.85k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/94.8M [00:00<?, ?B/s]

Some weights of the model checkpoint at SeyedAli/Musical-genres-Classification-Hubert-V1 were not used when initializing HubertForSequenceClassification: ['hubert.encoder.pos_conv_embed.conv.weight_g', 'hubert.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing HubertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of HubertForSequenceClassification were not initialized from the model checkpoint at SeyedAli/Musical-genres-Classification-Hubert-V1 and are newly initialized: ['hubert.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'hubert

In [None]:
# Spot-check one preprocessed row: its integer class id next to its genre name.
idx = 0
sample_row = train_dataset[idx]
print(f"Training labels: {sample_row['labels']} - {sample_row['genre']}")

Training labels: 8 - house


In [None]:
# Directory on Drive that receives checkpoints and the final model.
model_output_dir = "/content/drive/MyDrive/hubert genre/"

from transformers import TrainingArguments

training_args = TrainingArguments(
    # --- Checkpointing: save every 600 steps, keep at most 2 checkpoints ---
    output_dir=model_output_dir,
    save_strategy='steps',
    save_steps=600,
    save_total_limit=2,

    # --- Evaluation and logging: every 300 steps ---
    evaluation_strategy='steps',
    eval_steps=300,
    logging_steps=300,
    report_to='none',
    # NOTE(review): load_best_model_at_end is left disabled below, so this
    # setting presumably has no effect on which weights are kept — confirm.
    metric_for_best_model='accuracy',
    #load_best_model_at_end = True,

    # --- Optimisation ---
    learning_rate=5e-5,
    warmup_ratio=0.1,
    num_train_epochs=7,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,

    # --- Runtime ---
    fp16=True,
    seed=42,
)



In [None]:
# Accuracy metric object, loaded once and reused on every evaluation pass.
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Score one evaluation pass with the accuracy metric.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one
            row per example) and ``label_ids`` (the reference class ids).

    Returns:
        Whatever ``metric.compute`` returns for the predicted versus
        reference class ids.
    """
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(references=eval_pred.label_ids, predictions=predicted_ids)

In [None]:
# Bundle the model, training configuration, both preprocessed splits, the
# accuracy callback and the feature extractor into a single Trainer.
trainer = Trainer(
    model=hubert_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=feature_extractor,
)

In [None]:
# Run fine-tuning with the configuration held by `trainer`; as the last
# expression of the cell, the returned training summary is displayed.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
300,2.5845,2.157084,0.409722
600,1.9116,1.602522,0.486111
900,1.529,1.340546,0.597222


Step,Training Loss,Validation Loss,Accuracy
300,2.5845,2.157084,0.409722
600,1.9116,1.602522,0.486111
900,1.529,1.340546,0.597222
1200,1.1049,1.184507,0.625
1500,0.8368,1.092867,0.666667
1800,0.587,0.964095,0.701389
2100,0.3451,0.883491,0.75
2400,0.234,0.829888,0.766204
2700,0.1526,0.835317,0.775463
3000,0.1137,0.809304,0.782407


TrainOutput(global_step=3024, training_loss=0.93317037410837, metrics={'train_runtime': 9475.3365, 'train_samples_per_second': 1.277, 'train_steps_per_second': 0.319, 'total_flos': 8.2539667574784e+17, 'train_loss': 0.93317037410837, 'epoch': 7.0})

In [None]:
# Write the fine-tuned model and the feature extractor configuration to the
# same Drive directory, so both can be found in one place.
trainer.save_model(model_output_dir)
feature_extractor.save_pretrained(model_output_dir)

['/content/drive/MyDrive/hubert genre/preprocessor_config.json']