In [1]:
! nvidia-smi

Mon Mar 25 08:45:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:AF:00.0 Off |                    0 |
| N/A   43C    P0              58W / 300W |      0MiB / 32768MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
pip -q install torch transformers datasets librosa noisereduce accelerate evaluate -U

Note: you may need to restart the kernel to use updated packages.


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset, load_metric
import soundfile as sf
import random
import librosa
import noisereduce
from IPython.display import Audio, display
from transformers import AutoFeatureExtractor, pipeline
from transformers import AutoModelForAudioClassification, Trainer, TrainingArguments
import evaluate
from accelerate import Accelerator
import torch

2024-03-25 13:35:10.388687: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget.
# NOTE(review): presumably required for the Hub pushes configured later in this
# notebook (push_to_hub=True) — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Keep a single metadata row per class id so every label maps to one intent name.
unique_dataset = pd.read_csv('metadata_train.csv').drop_duplicates(subset=['label'])
label_to_intent_mapping = {
    label: intent
    for label, intent in zip(unique_dataset['label'], unique_dataset['intent'])
}

# id2label is ordered by class id; label2id is its inverse (intent name -> class id).
id2label = {
    label: label_to_intent_mapping[label]
    for label in sorted(label_to_intent_mapping)
}
label2id = dict(zip(id2label.values(), id2label.keys()))

In [4]:
# Show both lookup tables, separated by a 100-character rule.
print(label2id, '-' * 100, id2label, sep='\n')

{'General Weakness': 0, 'Mental Health': 1, 'Physical Health Issues': 2, 'Skin and Sensory Issues': 3}
----------------------------------------------------------------------------------------------------
{0: 'General Weakness', 1: 'Mental Health', 2: 'Physical Health Issues', 3: 'Skin and Sensory Issues'}


#### Creating custom audio datasets

In [5]:
# train_metadata = pd.read_csv('metadata_train.csv').drop(['intent', 'phrase'], axis=1)
# train_metadata.to_csv('recordings/train/metadata.csv', index=False)

In [6]:
# val_metadata = pd.read_csv('metadata_validation.csv').drop(['intent', 'phrase'], axis=1)
# val_metadata.to_csv('recordings/validation/metadata.csv', index=False)

In [7]:
# test_metadata = pd.read_csv('metadata_test.csv').drop(['intent', 'phrase'], axis=1)
# test_metadata.to_csv('recordings/test/metadata.csv', index=False)

In [8]:
# One-time setup, kept for reference only: the dataset was built from the local
# `recordings/` folder and pushed to the Hugging Face Hub, so these two lines do
# not need to be run again.
# dataset = load_dataset('audiofolder', data_dir='recordings/') 
# dataset.push_to_hub('shreyas1104/medical-intent-audio-dataset') 

# Load the dataset from the Hub repository it was pushed to.
dataset = load_dataset('shreyas1104/medical-intent-audio-dataset')

In [9]:
# Training split; display the first record to inspect its layout
# (an `audio` dict with path/array/sampling_rate, plus an integer `label`).
train_dataset = dataset['train']
train_dataset[0]

{'audio': {'path': '1249120_13842059_104469105.wav',
  'array': array([-0.00585938, -0.00439453, -0.00439453, ..., -0.02410889,
         -0.02560425, -0.0211792 ]),
  'sampling_rate': 48000},
 'label': 2}

In [10]:
# Validation split; display the first record. Note that clips are not all
# recorded at the same sampling rate (this one is 44.1 kHz).
validation_dataset = dataset['validation']
validation_dataset[0]

{'audio': {'path': '1249120_44246595_101823153.wav',
  'array': array([ 3.35693359e-04,  3.35693359e-04,  2.74658203e-04, ...,
          0.00000000e+00,  0.00000000e+00, -3.05175781e-05]),
  'sampling_rate': 44100},
 'label': 2}

In [11]:
# Held-out test split; display the first record to inspect its layout.
test_dataset = dataset['test']
test_dataset[0]

{'audio': {'path': '1249120_44142156_100535941.wav',
  'array': array([-0.00067139, -0.0007019 , -0.00064087, ...,  0.00033569,
          0.00027466,  0.00018311]),
  'sampling_rate': 48000},
 'label': 2}

#### Checking a few random examples from train dataset

In [12]:
# Spot-check five randomly drawn training clips (drawn with replacement):
# print the label and waveform metadata, and render an audio player for each.
for _ in range(5):
    sample = train_dataset[random.randrange(len(train_dataset))]
    clip = sample["audio"]
    waveform, rate = clip["array"], clip["sampling_rate"]

    print(f"Label: {sample['label']}")
    print(f"Shape: {waveform.shape}, sampling rate: {rate}")
    display(Audio(waveform, rate=rate))
    print()

Label: 2
Shape: (237568,), sampling rate: 48000



Label: 2
Shape: (323584,), sampling rate: 48000



Label: 2
Shape: (770048,), sampling rate: 192000



Label: 0
Shape: (397312,), sampling rate: 48000



Label: 0
Shape: (122880,), sampling rate: 48000





#### Preprocessing

In [13]:
# Pretrained checkpoint id; reused below for the feature extractor, the
# classification model, and the name of the fine-tuned output model.
model_checkpoint = 'facebook/wav2vec2-conformer-rel-pos-large'

In [14]:
# Load the feature extractor that belongs to the checkpoint and display its
# configuration (it expects 16 kHz input, see `sampling_rate` in the output).
feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [15]:
# Upper bound on clip length; preprocess_function multiplies it by the feature
# extractor's sampling rate to get the truncation length in samples.
max_duration = 10  # seconds

In [16]:
def preprocess_function(examples):
    """Convert a batch of raw audio clips into model inputs.

    Every clip is resampled from its *own* recorded sampling rate to the rate
    expected by ``feature_extractor``, denoised, peak-normalized, scaled to half
    amplitude, and finally passed through ``feature_extractor`` with truncation
    to ``max_duration`` seconds.

    Args:
        examples: Batch supplied by ``Dataset.map(..., batched=True)``.
            ``examples["audio"]`` is a list of dicts that each carry an
            ``"array"`` waveform and its ``"sampling_rate"``.

    Returns:
        The output of ``feature_extractor`` for the processed batch.
    """
    target_sr = feature_extractor.sampling_rate

    processed_audio = []
    for clip in examples["audio"]:
        audio = np.asarray(clip["array"])

        # The recordings do not share a single sampling rate (48 kHz, 44.1 kHz
        # and 192 kHz clips all occur), so resample from the rate stored with
        # each clip instead of assuming 48 kHz.
        audio = librosa.resample(
            audio, orig_sr=clip["sampling_rate"], target_sr=target_sr
        )

        # Noise reduction operates on the already-resampled signal.
        audio = noisereduce.reduce_noise(y=audio, sr=target_sr)

        # Peak-normalize to [-1, 1]. A completely silent (or empty) clip has a
        # peak of 0; skip the division there so it does not turn into NaNs.
        peak = np.max(np.abs(audio)) if audio.size else 0.0
        if peak > 0:
            audio = audio / peak

        # Leave headroom by scaling the peak to 0.5. This is a plain gain
        # change, not true dynamic range compression.
        processed_audio.append(0.5 * audio)

    inputs = feature_extractor(
        processed_audio,
        sampling_rate=target_sr,
        max_length=int(target_sr * max_duration),
        truncation=True,
    )
    return inputs

In [17]:
# Preprocess the training split in batches; the raw `audio` column is dropped
# once the model inputs have been computed.
encoded_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['audio'],
)

In [18]:
# Same preprocessing for the validation split.
encoded_validation = validation_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['audio'],
)

In [19]:
# Same preprocessing for the held-out test split.
encoded_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['audio'],
)

#### Training

In [20]:
# One output class per intent.
num_labels = len(label2id)

# Load the pretrained checkpoint with a classification head sized for our
# labels; the label maps are stored in the model config.
model = AutoModelForAudioClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

Some weights of Wav2Vec2ConformerForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-conformer-rel-pos-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight', 'wav2vec2_conformer.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2_conformer.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Output name: the checkpoint's base name (text after the last "/") plus a task suffix.
model_name = f"{model_checkpoint.rsplit('/', 1)[-1]}-medical-intent"

In [22]:
# Metric objects are loaded once and reused; compute_metrics runs at every
# evaluation, and loading a metric is comparatively expensive.
_METRIC_CACHE = {}


def compute_metrics(eval_pred):
    """Compute accuracy, weighted precision and weighted recall.

    Metrics are loaded through ``evaluate.load`` (the replacement for the
    deprecated ``datasets.load_metric``) and cached across calls.

    Args:
        eval_pred: Object with ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        Dict with the keys ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    for metric_name in ('accuracy', 'precision', 'recall'):
        if metric_name not in _METRIC_CACHE:
            _METRIC_CACHE[metric_name] = evaluate.load(metric_name)

    # The predicted class is the index of the highest score in each row.
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids

    accuracy = _METRIC_CACHE['accuracy'].compute(
        predictions=predictions, references=labels
    )['accuracy']
    # 'weighted' averages the per-class scores by class support.
    precision = _METRIC_CACHE['precision'].compute(
        predictions=predictions, references=labels, average='weighted'
    )['precision']
    recall = _METRIC_CACHE['recall'].compute(
        predictions=predictions, references=labels, average='weighted'
    )['recall']
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall}

In [27]:
# Fine-tuning configuration for the Trainer.
args = TrainingArguments(
    output_dir=model_name,
    # Optimisation schedule.
    num_train_epochs=2,
    learning_rate=1e-4,
    warmup_ratio=0.1,
    # 4 samples per step x 16 accumulation steps = 64 samples per optimizer
    # update on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=10,
    # Upload results to the Hub repository below.
    push_to_hub=True,
    hub_model_id=f"shreyas1104/{model_name}",
)

In [28]:
# Wire the model, configuration, preprocessed splits and metrics together.
# The feature extractor is passed in the `tokenizer` slot.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,
    eval_dataset=encoded_validation,
    tokenizer=feature_extractor,
    compute_metrics=compute_metrics,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [29]:
# Release unused cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
# `Trainer` creates and manages its own `accelerate.Accelerator` internally, and
# `Accelerator.prepare` only handles models, optimizers, dataloaders and
# schedulers — not a `Trainer`. Wrapping the trainer in a second Accelerator is
# therefore redundant, so training is started directly.
trainer.train()

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


In [None]:
# Run evaluation on the `eval_dataset` given to the Trainer (the validation
# split) and display the metrics returned by compute_metrics.
trainer.evaluate()

In [None]:
# Upload the trained model to the Hugging Face Hub.
# NOTE(review): presumably targets the `hub_model_id` set in TrainingArguments — confirm.
trainer.push_to_hub()

#### Evaluation on Test set

In [None]:
# Build an audio-classification pipeline from a fine-tuned model on the Hub.
# NOTE(review): this loads the "-v2" repository, whereas the training run in this
# notebook pushes to f"shreyas1104/{model_name}" (no "-v2" suffix) — confirm
# which model is meant to be evaluated here.
finetuned_model = pipeline('audio-classification', model='shreyas1104/wav2vec2-conformer-rel-pos-large-medical-intent-v2')