**Wav2Vec 2.0** is a pretrained model for Automatic Speech Recognition (ASR) and was released in [September 2020](https://ai.facebook.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/) by Alexei Baevski, Michael Auli, and Alex Conneau. Soon after the superior performance of Wav2Vec2 was demonstrated on the English ASR dataset LibriSpeech, *Facebook AI* presented XLSR-Wav2Vec2 (click [here](https://arxiv.org/abs/2006.13979)). XLSR stands for *cross-lingual speech representations* and refers to XLSR-Wav2Vec2's ability to learn speech representations that are useful across multiple languages.

Similar to Wav2Vec2, XLSR-Wav2Vec2 learns powerful speech representations from hundreds of thousands of hours of unlabeled speech in more than 50 languages. Similar to [BERT's masked language modeling](http://jalammar.github.io/illustrated-bert/), the model learns contextualized speech representations by randomly masking feature vectors before passing them to a transformer network.

![wav2vec2_structure](https://raw.githubusercontent.com/patrickvonplaten/scientific_images/master/xlsr_wav2vec2.png)

The authors show for the first time that massively pretraining an ASR model on cross-lingual unlabeled speech data, followed by language-specific fine-tuning on very little labeled data, achieves state-of-the-art results. See Tables 1-5 of the official [paper](https://arxiv.org/pdf/2006.13979.pdf).

In [None]:
%%capture

!pip install git+https://github.com/huggingface/datasets.git
!pip install git+https://github.com/huggingface/transformers.git
!pip install torchaudio
!pip install librosa

# Monitor the training process
# !pip install wandb

In [None]:
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/kaggle/working/cache
%env HF_DATASETS_CACHE=/kaggle/working/cache
%env CUDA_LAUNCH_BLOCKING=1

## Set Variables

In [None]:
dataset_path = '../input/gtzan-dataset-music-genre-classification/Data'

In [None]:
# File names that the dataset scan below skips.
# NOTE(review): presumably this clip cannot be decoded in this copy of GTZAN — confirm.
ignorefiles = ['jazz.00054.wav']

In [None]:
# Identifier of an already fine-tuned checkpoint.
# NOTE(review): this name is reassigned in the "Prepare Data for Training" section
# before it is first read, so the value set here does not affect training.
model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-gtzan-music-genres"
# model_name_or_path = "models/wav2vec2-base-100k-gtzan-music-genres"

## Import Libraries

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

import transformers
from transformers import (
    HfArgumentParser,
    TrainingArguments,
    EvalPrediction,
    AutoConfig,
    Wav2Vec2Processor,
    Wav2Vec2FeatureExtractor,
    is_apex_available,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process


import os
import sys
import librosa
import numpy as np
import pandas as pd
import librosa
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from pathlib import Path
from tqdm.auto import tqdm

import torchaudio
from sklearn.model_selection import train_test_split


## wav2vec2 model

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

In [None]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Result container returned by ``Wav2Vec2ForSpeechClassification.forward``.

    Every field defaults to ``None``.
    """

    # Loss value; the model leaves it as None when no labels are passed.
    loss: Optional[torch.FloatTensor] = None
    # Output of the classification head.
    logits: Optional[torch.FloatTensor] = None
    # Copied from the backbone output's ``hidden_states`` attribute.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Copied from the backbone output's ``attentions`` attribute.
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [None]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)

In [None]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Classification head: dropout -> dense -> tanh -> dropout -> out_proj.

    Layer sizes come from ``config.hidden_size`` and ``config.num_labels``;
    the dropout probability comes from ``config.final_dropout``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(width, config.num_labels)

    def forward(self, features, **kwargs):
        """Return one score per label for ``features``.

        Extra keyword arguments are accepted and ignored.
        """
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)


class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Wav2Vec2 backbone followed by pooling and a classification head.

    The first element of the backbone output is reduced along dimension 1 with
    the strategy named by ``config.pooling_mode`` and then passed to
    ``Wav2Vec2ClassificationHead``.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        """Call ``_freeze_parameters()`` on the backbone's feature extractor.

        Presumably this stops gradient updates for that sub-module; the helper
        is defined in the transformers library, not in this file.
        """
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Reduce ``hidden_states`` along dimension 1.

        Args:
            hidden_states: tensor to pool (assumes dimension 1 is the frame
                axis — TODO confirm against the backbone output).
            mode: one of ``"mean"``, ``"sum"`` or ``"max"``.

        Returns:
            The pooled tensor with dimension 1 removed.

        Raises:
            ValueError: if ``mode`` is not one of the supported names.
        """
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            # torch.max with a dim returns (values, indices); keep the values.
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            # ValueError is a subclass of Exception, so existing callers that
            # catch Exception keep working.
            raise ValueError(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        """Run the backbone, pool its first output, classify, and optionally compute a loss.

        Args:
            input_values: forwarded to the backbone as its first argument.
            attention_mask: forwarded to the backbone. It is not used by the
                pooling step, so padded positions take part in the reduction.
            output_attentions: forwarded to the backbone.
            output_hidden_states: forwarded to the backbone.
            return_dict: when ``None``, falls back to ``config.use_return_dict``.
            labels: optional targets; when given, a loss is computed.

        Returns:
            A ``SpeechClassifierOutput`` when ``return_dict`` is truthy,
            otherwise a tuple ``(loss, logits, *extras)`` with ``loss`` omitted
            when no labels were passed.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            # Infer the problem type once, from the label count and label
            # dtype, and store it on the config for later calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Skip the first two backbone outputs and pass the rest through
            # (presumably hidden states / attentions — confirm for the
            # installed transformers version).
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

## Prepare Dataset

### Read directory structure

In [None]:
data = []

# The genre label is the name of the directory that contains the clip, so
# pathlib's ``parent.name`` / ``name`` replace manual separator splitting.
# The glob result is sorted because its order depends on the filesystem;
# without sorting, the seeded train/test split below is not reproducible
# across machines.
for path in sorted(Path(f'{dataset_path}/genres_original').glob("**/*.wav")):
    if path.name in ignorefiles:
        continue

    data.append({
        # "name": path.name,
        "path": path,
        "label": path.parent.name,
    })

In [None]:
df = pd.DataFrame(data)
df.head()

In [None]:
df.groupby("label").count()[["path"]]

### Split train and test set

In [None]:
# Directory where the split CSV files are written.
save_path = "/kaggle/working"

# Stratified 80/20 train/test split with a fixed seed.
# NOTE(review): the next cell repeats the split as train/valid/test and
# rewrites train.csv and test.csv, so the files written here do not survive a
# top-to-bottom run — confirm whether this cell is still needed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["label"])

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df.to_csv(f"{save_path}/train.csv", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", encoding="utf-8", index=False)


print(train_df.shape)
print(test_df.shape)

In [None]:
# Hold out 20% of the data (stratified by label), then split that hold-out in
# half into validation and test sets, again stratified and with the same seed.
train_df, rest_df = train_test_split(df, test_size=0.2, random_state=101, stratify=df["label"])
valid_df, test_df = train_test_split(rest_df, test_size=0.5, random_state=101, stratify=rest_df["label"])

# Drop the original row indices so each frame is numbered from 0.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Persist the three splits as CSV files under save_path.
train_df.to_csv(f"{save_path}/train.csv", encoding="utf-8", index=False)
valid_df.to_csv(f"{save_path}/valid.csv", encoding="utf-8", index=False)
test_df.to_csv(f"{save_path}/test.csv", encoding="utf-8", index=False)


print(train_df.shape)
print(valid_df.shape)
print(test_df.shape)

## Prepare Data for Training

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# Evaluate during training on the dedicated validation split. test.csv is
# loaded again in the "Evaluation" section, so using it here would report
# final metrics on data already used to monitor training.
data_files = {
    "train": f"{save_path}/train.csv",
    "validation": f"{save_path}/valid.csv",
    # "test": f"{save_path}/test.csv",
}

dataset = load_dataset("csv", data_files=data_files)

In [None]:
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

In [None]:
# We need to specify the input and output column
input_column = "path"
output_column = "label"

In [None]:
# Collect the distinct labels that occur in the training split.
label_list = train_dataset.unique(output_column)
label_list.sort()  # sorted so that label ids are assigned deterministically
num_labels = len(label_list)
print(f"A classification problem with {num_labels} classes:\n {label_list}")

In [None]:
from transformers import AutoConfig, Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [None]:
# Checkpoint that the config, feature extractor and model below are loaded from.
# This replaces the value assigned in the "Set Variables" section.
model_name_or_path = "facebook/wav2vec2-base-100k-voxpopuli"
# Pooling strategy stored on the config; the model supports 'mean', 'sum' and 'max'.
pooling_mode = "mean"

In [None]:
# Build the model config with label <-> id maps derived from the sorted label list.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="wav2vec2_clf",
)
# The classification model reads this custom attribute in its constructor.
config.pooling_mode = pooling_mode

In [None]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
target_sampling_rate = feature_extractor.sampling_rate
print(f"The target sampling rate: {target_sampling_rate}")

## Preprocess Data

In [None]:
def speech_file_to_array_fn(path):
    """Load an audio file as a 1-D NumPy array at ``target_sampling_rate``.

    Args:
        path: location of the audio file, passed to ``torchaudio.load``.

    Returns:
        The resampled waveform as a NumPy array. Multi-channel audio is
        averaged to a single channel first.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # torchaudio.load returns (channels, frames) by default. squeeze() only
    # removes a channel axis of size 1, so average multi-channel audio here;
    # mono input is left untouched.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

def label_to_id(label, label_list):
    """Translate ``label`` into its position within ``label_list``.

    Returns ``label`` unchanged when ``label_list`` is empty, and ``-1`` when
    the list is non-empty but does not contain ``label``.
    """
    if len(label_list) == 0:
        return label
    if label not in label_list:
        return -1
    return label_list.index(label)

def preprocess_function(examples):
    """Turn a batch of rows into feature-extractor output plus label ids.

    Reads audio paths from ``examples[input_column]`` and label names from
    ``examples[output_column]``; the label ids are stored under ``"labels"``.
    """
    waveforms = []
    for audio_path in examples[input_column]:
        waveforms.append(speech_file_to_array_fn(audio_path))

    label_ids = [label_to_id(name, label_list) for name in examples[output_column]]

    encoded = feature_extractor(waveforms, sampling_rate=target_sampling_rate)
    encoded["labels"] = label_ids
    return encoded

In [None]:
# Run preprocess_function over both splits in batches of 10 rows, so each call
# receives a dict of column lists rather than a single row.
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    # num_proc=4
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=10,
    batched=True,
    # num_proc=4
)

## Set Up Trainer

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import torch

import transformers
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Collate examples into a padded batch with a ``labels`` tensor.

    ``max_length_labels`` and ``pad_to_multiple_of_labels`` are accepted but
    not read by ``__call__``.
    """

    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_values`` of ``features`` and attach their labels."""
        inputs = [{"input_values": example["input_values"]} for example in features]
        labels = [example["labels"] for example in features]

        # Integer labels become a long tensor; anything else becomes float.
        label_dtype = torch.float
        if isinstance(labels[0], int):
            label_dtype = torch.long

        batch = self.feature_extractor.pad(
            inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch["labels"] = torch.tensor(labels, dtype=label_dtype)
        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

In [None]:
is_regression = False

In [None]:
import numpy as np
from transformers import EvalPrediction

In [None]:
def compute_metrics(p: EvalPrediction):
    """Return ``{"mse": ...}`` when ``is_regression`` is set, else ``{"accuracy": ...}``.

    When ``p.predictions`` is a tuple, only its first element is used.
    """
    raw = p.predictions
    if isinstance(raw, tuple):
        raw = raw[0]

    if is_regression:
        preds = np.squeeze(raw)
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}

    preds = np.argmax(raw, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [None]:
# Instantiate the classifier from the pretrained checkpoint, using the config
# built above (label maps and pooling_mode).
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)

In [None]:
model.freeze_feature_extractor()

In [None]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """``Trainer`` subclass with a hand-written ``training_step``.

    NOTE(review): the step reads ``self.use_amp``, ``self.use_apex``,
    ``self.scaler`` and ``self.deepspeed``, and uses ``autocast`` / ``amp``,
    which are only imported conditionally above — confirm that these exist for
    the installed transformers and torch versions.
    """

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """Run one forward and backward pass and return the detached loss.

        Args:
            model: module to train; it is switched to training mode first.
            inputs: batch passed through ``self._prepare_inputs`` and then to
                ``self.compute_loss``.

        Returns:
            The loss, detached, after division by
            ``gradient_accumulation_steps`` when that setting is above 1.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        # Compute the loss inside autocast only when native AMP is enabled.
        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        # Scale the loss down so accumulated gradients match one large batch.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # Choose the backward path: native AMP scaler, apex, deepspeed, or plain.
        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()


In [None]:
from transformers import TrainingArguments

# Training configuration. An earlier TrainingArguments(...) assignment in this
# cell was overwritten by this one before being used (and pointed at an
# unrelated output directory), so it has been removed.
training_args = TrainingArguments(
    # _n_gpu=1,
    adafactor=False,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    bf16=False,
    bf16_full_eval=False,
    dataloader_drop_last=False,
    dataloader_num_workers=0,
    dataloader_pin_memory=True,
    ddp_bucket_cap_mb=None,
    ddp_find_unused_parameters=None,
    debug=[],
    deepspeed=None,
    disable_tqdm=False,
    do_eval=True,
    do_predict=True,
    do_train=True,
    eval_accumulation_steps=None,
    eval_steps=500,
    evaluation_strategy="steps",
    fp16=True,
    fp16_full_eval=False,
    gradient_accumulation_steps=2,
    gradient_checkpointing=False,
    greater_is_better=None,
    group_by_length=False,
    ignore_data_skip=False,
    label_names=None,
    label_smoothing_factor=0.0,
    learning_rate=0.0001,
    load_best_model_at_end=False,
    local_rank=-1,
    max_grad_norm=1.0,
    max_steps=-1,
    metric_for_best_model=None,
    no_cuda=False,
    num_train_epochs=20.0,
    optim="adamw_hf",
    # Checkpoints are written under save_path.
    output_dir=f"{save_path}/wav2vec2-base-100k-voxpopuli-gtzan-music",
    overwrite_output_dir=True,
    past_index=-1,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    prediction_loss_only=False,
    remove_unused_columns=True,
    report_to=['tensorboard'],
    resume_from_checkpoint=None,
    save_on_each_node=False,
    save_steps=100,
    save_total_limit=2,
    seed=42,
    sharded_ddp=[],
    skip_memory_metrics=True,
    tf32=None,
    tpu_metrics_debug=False,
    tpu_num_cores=None,
    use_legacy_prediction_loop=False,
    warmup_ratio=0.0,
    warmup_steps=2000,
    weight_decay=0.0,
    xpu_backend=None,
)

In [None]:
# Wire the model, collator, metrics and datasets into the custom trainer.
# NOTE(review): the feature extractor is passed as `tokenizer`, presumably so
# it is saved with each checkpoint (the Evaluation section loads it from the
# checkpoint directory) — confirm.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=feature_extractor,
)

## Train

In [None]:
torch.cuda.empty_cache()

In [None]:
trainer.train()

## Evaluation

In [None]:
import numpy as np
import librosa
import torchaudio
from datasets import load_dataset, load_metric
from sklearn.metrics import classification_report

In [None]:
# Load the held-out test split written by the split cell.
# NOTE(review): the path is relative, whereas the CSV was written under
# save_path ("/kaggle/working"); this only resolves when that is the current
# working directory — confirm.
test_dataset = load_dataset("csv", data_files={"test": "./test.csv"})["test"]
test_dataset

In [None]:
# Checkpoint to evaluate. The second assignment wins: evaluation uses the
# local training checkpoint, and the hub identifier on the first line is
# overwritten before it is read.
model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-gtzan-music-genres"
model_name_or_path = "./wav2vec2-base-100k-voxpopuli-gtzan-music/checkpoint-7900"

In [None]:
# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Reload config, feature extractor and model weights from the chosen checkpoint.
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
# NOTE(review): the helpers below read feature_extractor.sampling_rate
# directly, so this global looks unused — confirm.
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [None]:
def speech_file_to_array_fn(batch):
    """Load ``batch["path"]``, resample it, and store the result in ``batch["speech"]``.

    The target rate is ``feature_extractor.sampling_rate``. Returns the same
    ``batch`` mapping with the new key added.
    """
    waveform, source_rate = torchaudio.load(batch["path"])
    samples = np.asarray(waveform.squeeze().numpy())
    batch["speech"] = librosa.resample(
        samples,
        orig_sr=source_rate,
        target_sr=feature_extractor.sampling_rate,
    )
    return batch


def predict(batch):
    """Classify ``batch["speech"]`` and store the argmax ids in ``batch["predicted"]``.

    Inputs are padded by the feature extractor and moved to ``device``; the
    forward pass runs without gradient tracking.
    """
    encoded = feature_extractor(
        batch["speech"],
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
        padding=True,
    )
    model_inputs = encoded.input_values.to(device)

    with torch.no_grad():
        scores = model(model_inputs).logits

    batch["predicted"] = torch.argmax(scores, dim=-1).detach().cpu().numpy()
    return batch

In [None]:
test_dataset = test_dataset.map(speech_file_to_array_fn)

In [None]:
result = test_dataset.map(predict, batched=True, batch_size=8)

In [None]:
label_names = [config.id2label[i] for i in range(config.num_labels)]

In [None]:
# Convert the ground-truth label names to ids with the checkpoint's label2id
# map so that they are comparable with the predicted ids.
y_true = [config.label2id[name] for name in result["label"]]
y_pred = result["predicted"]

# Quick sanity check on the first few pairs.
print(y_true[:5])
print(y_pred[:5])

In [None]:
print(classification_report(y_true, y_pred, target_names=label_names))

## Clean Up

In [None]:
!rm -rf cache