In [1]:
import torchaudio
import torch

In [2]:
import os
#from deep_utils.utils.lr_scheduler_utils.warmup import warmup_cosine
from datasets import load_dataset, Audio
from transformers import AutoFeatureExtractor
from transformers import AutoModelForAudioClassification, TrainingArguments #, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def dump_pickle(file_path: str, file, mode: str = "wb"):
    import pickle

    with open(file_path, mode=mode) as f:
        pickle.dump(file, f)


def load_pickle(file_path: str, mode: str = "rb", encoding=""):
    import pickle

    with open(file_path, mode=mode) as f:
        return pickle.load(f, encoding=encoding)


In [4]:
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
feature_extractor

Downloading (…)rocessor_config.json: 100%|██████████| 159/159 [00:00<00:00, 43.9kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.84k/1.84k [00:00<00:00, 626kB/s]


Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [None]:
import pandas as pd
train = pd.read_csv('../data/cv-valid-train.csv')
test = pd.read_csv('../data/cv-valid-test.csv')

In [None]:
train = train.drop(['text', 'up_votes', 'down_votes', 'age', 'accent', 'duration'], axis=1)
train = train[train.gender != 'other']
train = train.rename(columns={'filename':'audio_path', 'gender':'label'})
train = train.dropna()
test = test.drop(['text', 'up_votes', 'down_votes', 'age', 'accent', 'duration'], axis=1)
test = test[test.gender != 'other']
test = test.rename(columns={'filename':'audio_path', 'gender':'label'})
test = test.dropna()

In [None]:
x = len(train) - 2 * len(train[train.label == 'female'])
train = train.drop(train[train['label'] == 'male'].sample(x).index)

In [None]:
x = len(test) - 2 * len(test[test.label == 'female'])
test = test.drop(test[test['label'] == 'male'].sample(x).index)

In [None]:
test.to_csv('../data/processed_test.csv')
train.to_csv('../data/processed_train.csv')

In [5]:
train_path = "../data/processed_train.csv"
test_path = '../data/processed_test.csv'
dataset = load_dataset('csv', data_files={'train': train_path,
                                          'test': test_path})
dataset = dataset.cast_column("audio_path", Audio(sampling_rate=16_000))
dataset["train"][0]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ba4f4949eb2ec7fd/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 17225.07it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 1310.31it/s]
                                                        

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ba4f4949eb2ec7fd/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 517.05it/s]


{'Unnamed: 0': 5,
 'audio_path': {'path': 'cv-valid-train/sample-000005.mp3',
  'array': array([ 5.49967413e-15, -7.54565261e-15,  4.40981961e-15, ...,
         -2.54557634e-08,  3.04820867e-08,  2.05002413e-08]),
  'sampling_rate': 16000},
 'label': 'female'}

Remember: pip install soundfile==0.12.1

In [6]:
import random
import IPython.display as ipd
import librosa
index = random.randint(0, len(dataset['train']))

path = dataset['train'][index]['audio_path']['path']
waveform, sr = librosa.load(path)
text = dataset['train'][index]['label']
print(text)
ipd.Audio(waveform, rate=sr, autoplay=True)

male


In [None]:
labels = set(dataset["train"]['label'])
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
label2id

{'female': '0', 'male': '1'}

In [None]:
# save label2id to be used in test
os.makedirs("results/best", exist_ok=True)
dump_pickle("results/best/label2id.pkl", label2id)

In [None]:
def preprocess_function(examples):
    audio_arrays = [x["array"] for x in examples["audio_path"]]
    inputs = feature_extractor(
        audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
    )
    label = [int(label2id[x]) for x in examples["label"]]
    inputs["label"] = label
    return inputs

In [None]:
encoded_dataset = dataset.map(preprocess_function, remove_columns="audio_path", batched=True)
encoded_dataset['train'][0]

  tensor = as_tensor(value)
                                                                 

{'Unnamed: 0': 5,
 'label': 0,
 'input_values': [0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.00028955531888641417,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.00028955531888641417,
  0.0002895552315749228,
  0.0002895552897825837,
  0.0002895552897825837,
  0.0002895552897825837,
  0.00028955514426343143,
  0.0002895552897825837,
  0.00028955526067875326,
  0.000289555115159601,
  0.0002895552897825837,
  0.0002895552315749228,
  0.0002895554935093969,
  0.00028955540619790554,
  0.0002895553479902446,
  0.0002895550278481096,
  0.00028955531888641417,
  0.0002895552897825837,
  0.0002895550860557705,
  0.000289555115159601,
  0.00028955464949831367,
  0.0002895547659136355,
  0.0002895552315749228,
  0.00028955412562936544,
  0.0002895543002523482,
  0.000289555144263431

In [None]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")
    precision = precision_score(labels, predictions, average="weighted")

    return {"accuracy": acc, "f1-score": f1, "recall-score": recall, "precision-score": precision}

## TRAIN

In [None]:
import math
import torch
from transformers import EarlyStoppingCallback
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

train_bs = 64 
epochs = 25
lr = 5e-5
lrf = lr
output_dir = "./results"
total_steps = int((np.ceil(encoded_dataset["train"].num_rows / train_bs) * epochs))

num_labels = len(id2label)
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
)

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=epochs,
    report_to="tensorboard",
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model='loss',
    per_device_train_batch_size = train_bs,
    per_device_eval_batch_size = 64,
    logging_steps=1,
)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, warmup_cosine(100,
                                                                       max_lr=lr,
                                                                       total_steps=total_steps,
                                                                       optimizer_lr=lr,
                                                                       min_lr=1e-6))
# reduce lr with a cosine annealing if total_steps is set to total_steps
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)

trainer.train()
trainer.save_model(os.path.join(output_dir, "best"))

Downloading pytorch_model.bin: 100%|██████████| 380M/380M [00:05<00:00, 70.8MB/s] 
Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.codevectors', 'quantizer.weight_proj.weight', 'quantizer.weight_proj.bias', 'project_q.weight', 'project_q.bias', 'project_hid.weight', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-

Epoch,Training Loss,Validation Loss


In [None]:
torch.save(model, '/data/audio-classification-pytorch/wav2vec2/results/model1.pt')

In [None]:
dump_pickle("results/best/id2label.pkl", id2label)

In [7]:
label2id = load_pickle('/data/audio-classification-pytorch/wav2vec2/results/best/label2id.pkl')
id2label = load_pickle('/data/audio-classification-pytorch/wav2vec2/results/best/id2label.pkl')

In [8]:
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base", num_labels=len(label2id), label2id=label2id, id2label=id2label
)

Downloading pytorch_model.bin: 100%|██████████| 380M/380M [00:03<00:00, 111MB/s]  
Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2ForSequenceClassification: ['quantizer.weight_proj.weight', 'project_hid.bias', 'quantizer.codevectors', 'quantizer.weight_proj.bias', 'project_hid.weight', 'project_q.bias', 'project_q.weight']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-

In [9]:
import torchaudio
import torch

In [10]:
checkpoint = torch.load('/data/audio-classification-pytorch/wav2vec2/results/best/pytorch_model.bin')

In [11]:
model.load_state_dict(checkpoint)

<All keys matched successfully>

## TEST

In [12]:
import torchaudio
import torch
import librosa
device = "cpu"
model = model.to(device)
path = dataset['train'][index]['audio_path']['path']
waveform, sr = librosa.load(path)
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = torchaudio.transforms.Resample(sr, 16_000)(waveform)
inputs = feature_extractor(waveform, sampling_rate=feature_extractor.sampling_rate,
                           max_length=16000, truncation=True)
tensor = torch.tensor(inputs['input_values'][0]).to(device)
with torch.no_grad():
    output = model(tensor)
    logits = output['logits'][0]
    label_id = torch.argmax(logits).item()
label_name = id2label[str(label_id)]

print(f"{label_name}, real: {dataset['train'][index]['label']}")

male, real: male


In [15]:
import torchaudio
import torch
import librosa
device = "cpu"
model = model.to(device)
correct = 0
for i, test in enumerate(dataset['test']):
    path = test['audio_path']['path']
    waveform, sr = librosa.load(path)
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = torchaudio.transforms.Resample(sr, 16_000)(waveform)
    inputs = feature_extractor(waveform, sampling_rate=feature_extractor.sampling_rate,
                            max_length=16000, truncation=True)
    tensor = torch.tensor(inputs['input_values'][0]).to(device)
    with torch.no_grad():
        output = model(tensor)
        logits = output['logits'][0]
        label_id = torch.argmax(logits).item()
    label_name = id2label[str(label_id)]
    if label_name == test['label']:
        correct += 1

NameError: name 'dataset' is not defined

In [None]:
correct / len(dataset['test'])

In [13]:
import gradio as gr

In [19]:
import torchaudio
import torch
import librosa
device = "cpu"
model = model.to(device)
path = dataset['train'][index]['audio_path']['path']
waveform, sr = librosa.load(path)
waveform = torch.from_numpy(waveform).unsqueeze(0)
waveform = torchaudio.transforms.Resample(sr, 16_000)(waveform)
inputs = feature_extractor(waveform, sampling_rate=feature_extractor.sampling_rate,
                           max_length=16000, truncation=True)
tensor = torch.tensor(inputs['input_values'][0]).to(device)
with torch.no_grad():
    output = model(tensor)
    logits = output['logits'][0]
    label_id = torch.argmax(logits).item()
label_name = id2label[str(label_id)]

print(f"{label_name}, real: {dataset['train'][index]['label']}")

[-6.9784808e-20 -1.9044174e-19 -6.4183735e-20 ... -5.9214358e-06
 -1.0099855e-06  0.0000000e+00]
female, real: female


In [1]:
import gradio as gr

def greet(name):
    return "Hello!"

demo = gr.Interface(fn=greet, inputs="mic", outputs="text")
    
if __name__ == "__main__":
    demo.launch() 

  from .autonotebook import tqdm as notebook_tqdm


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/gradio/processing_utils.py", line 142, in audio_from_file
    audio = AudioSegment.from_file(filename)
  File "/usr/local/lib/python3.8/dist-packages/pydub/audio_segment.py", line 728, in from_file
    info = mediainfo_json(orig_file, read_ahead_limit=read_ahead_limit)
  File "/usr/local/lib/python3.8/dist-packages/pydub/utils.py", line 274, in mediainfo_json
    res = Popen(command, stdin=stdin_parameter, stdout=PIPE, stderr=PIPE)
  File "/usr/lib/python3.8/subprocess.py", line 858, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/usr/lib/python3.8/subprocess.py", line 1704, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'ffprobe'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packa