# Data2Vec

## Libraries

In [1]:
from datasets import Dataset, Audio, ClassLabel, Features, Value
import pandas as pd
import torch
import torch.nn as nn
import evaluate
import numpy as np
import warnings
from transformers import Trainer, TrainingArguments, TrainerCallback
from audiomentations import Compose, AddGaussianSNR, GainTransition, Gain, ClippingDistortion, TimeStretch, PitchShift
from transformers import AutoConfig, Data2VecAudioForSequenceClassification
from copy import deepcopy
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import gc

2025-01-08 00:30:16.824931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736289016.847422 2175216 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736289016.854046 2175216 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 00:30:16.876649: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Ensure GPU access

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f'Using device: {device}')

Using device: cuda


## Load dataset

In [3]:
# Define class labels
class_labels = ClassLabel(names=["Negative Sentiment", "Positive Sentiment"])

# Define features with audio and label columns
features = Features({
    "audio": Audio(),        # Define the audio feature
    "labels": class_labels,  # Assign the class labels
})

label2id = {
    "Negative Sentiment": 0,
    "Positive Sentiment": 1
}

In [4]:
dataset_base_path = "/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/"

In [5]:
# Load and preprocess the CSV file
df = pd.read_csv(dataset_base_path + "label_edited.csv", 
                 sep='\t', 
                 encoding='utf-8', 
                 header=0)

df = df[['video_id', 'clip_id', 'mode', 'text', 'processed_text']].dropna()

In [6]:
df

Unnamed: 0,video_id,clip_id,mode,text,processed_text
0,03bSnISJMiM,11,train,A LOT OF SAD PARTS,a lot of sad part
1,03bSnISJMiM,10,train,THERE IS SAD PART,there is sad part
2,03bSnISJMiM,13,train,AND ITS A REALLY FUNNY,and it a really funny
3,03bSnISJMiM,12,train,BUT IT WAS REALLY REALLY AWESOME,but it wa really really awesome
4,03bSnISJMiM,1,train,ANYHOW IT WAS REALLY GOOD,anyhow it wa really good
...,...,...,...,...,...
2194,zhpQhgha_KU,30,test,BECAUSE THERE REALLY WASNT ALL THAT MUCH TO IT...,because there really wa not all that much to i...
2195,zhpQhgha_KU,35,test,UM SO IF YOU LIKE TO HEAR A UM LIKE MORE POSIT...,so if you like to hear a like more positive re...
2196,zhpQhgha_KU,34,test,AND SHE REALLY ENJOYED THE FILM,and she really enjoyed the film
2197,zhpQhgha_KU,33,test,IF YOU DO WANNA SEE SOMEBODY WHOS POSSIBLY CRI...,if you do want to see somebody who is possibly...


In [7]:
# This function is used to return the full path of a video in the CMU-MOSI dataset format
def get_audio_path(video_id, clip_id):
    return dataset_base_path + f"Splited/Raw_onlyAudio/{video_id}/{clip_id}.wav"

In [8]:
df['audio_path'] = df.apply(lambda row: get_audio_path(row['video_id'], row['clip_id']), axis=1)

In [9]:
df

Unnamed: 0,video_id,clip_id,mode,text,processed_text,audio_path
0,03bSnISJMiM,11,train,A LOT OF SAD PARTS,a lot of sad part,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
1,03bSnISJMiM,10,train,THERE IS SAD PART,there is sad part,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
2,03bSnISJMiM,13,train,AND ITS A REALLY FUNNY,and it a really funny,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
3,03bSnISJMiM,12,train,BUT IT WAS REALLY REALLY AWESOME,but it wa really really awesome,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
4,03bSnISJMiM,1,train,ANYHOW IT WAS REALLY GOOD,anyhow it wa really good,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
...,...,...,...,...,...,...
2194,zhpQhgha_KU,30,test,BECAUSE THERE REALLY WASNT ALL THAT MUCH TO IT...,because there really wa not all that much to i...,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
2195,zhpQhgha_KU,35,test,UM SO IF YOU LIKE TO HEAR A UM LIKE MORE POSIT...,so if you like to hear a like more positive re...,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
2196,zhpQhgha_KU,34,test,AND SHE REALLY ENJOYED THE FILM,and she really enjoyed the film,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...
2197,zhpQhgha_KU,33,test,IF YOU DO WANNA SEE SOMEBODY WHOS POSSIBLY CRI...,if you do want to see somebody who is possibly...,/home/k/kyparkypar/ondemand/data/sys/myjobs/pr...


#### Get audio paths and labels of the full dataset

In [10]:
# Construct audio file paths and labels
audio_paths = df.apply(lambda row: get_audio_path(row['video_id'], row['clip_id']), axis=1).tolist()
# labels = df['annotation_label'].astype(int).tolist()  # Convert labels to integers if necessary

Assure they are loaded correctly

In [11]:
# labels

In [12]:
audio_paths

['/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/11.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/10.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/13.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/12.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/1.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/3.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/2.wav',
 '/home/k/kyparkypar/ondemand/data/sys/myjobs/projects/default/dataset/CMU-MOSI/Splited/Raw_onlyAudio/03bSnISJMiM/5.wav',
 '/home/k/kyparkypar

## Load feature extractor

In [13]:
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, Data2VecAudioForCTC

# we define which pretrained model we want to use and instantiate a feature extractor
# pretrained_model = "openai/whisper-tiny"
# pretrained_model = "facebook/wav2vec2-base-960h"
processor = AutoProcessor.from_pretrained("facebook/data2vec-audio-base-960h")

tokenizer_config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [14]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='facebook/data2vec-audio-base-960h', vocab_size=32, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	1: AddedToken("<s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False),
	2: AddedToken("</s>", rstrip=True, lstrip=True, single_word=False, normalized=False, special=False)

### View extractor's inputs

In [16]:
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     pretrained_model,
#     torch_dtype=torch.float16,
#     low_cpu_mem_usage=True, 
#     use_safetensors=True
# )

model = Data2VecAudioForCTC.from_pretrained(
    "facebook/data2vec-audio-base-960h",
    # torch_dtype=torch.float16,
    # low_cpu_mem_usage=True, 
    # use_safetensors=True
)

model.to(device)

Data2VecAudioForCTC(
  (data2vec_audio): Data2VecAudioModel(
    (feature_extractor): Data2VecAudioFeatureEncoder(
      (conv_layers): ModuleList(
        (0): Data2VecAudioConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Data2VecAudioConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Data2VecAudioFeatureProjection(
      (layer_norm): LayerNorm((512,),

In [17]:
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float16,
    device=device,
)

In [18]:
import librosa

audio_path = audio_paths[3]

# Define the required sampling rate
SAMPLING_RATE = 16000

# Load the audio and resample it to the required sampling rate
audio, sampling_rate = librosa.load(audio_path, sr=SAMPLING_RATE)

# Convert audio to the format expected by the pipeline
inputs = {
    "raw": audio,  # Correct key name for audio waveform
    "sampling_rate": SAMPLING_RATE,  # Correct key name for sampling rate
}

result = pipe(inputs)

In [19]:
print(result["text"])

IT WAS REALLY REALLY ALESOME


In [20]:
# Function to transcribe audio using the pipeline
def transcribe_audio(audio_path):
    try:
        # Load audio file
        audio, sampling_rate = librosa.load(audio_path, sr=SAMPLING_RATE)
        
        # Prepare inputs for the pipeline
        inputs = {
            "raw": audio,
            "sampling_rate": SAMPLING_RATE,
        }
        
        # Run the pipeline
        result = pipe(inputs)
        
        return result['text']
        
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None

In [21]:
# def transcribe_audio_in_chunks(audio_path, max_duration=10):
#     try:
#         # Load audio file
#         audio, sampling_rate = librosa.load(audio_path, sr=SAMPLING_RATE)
        
#         # Calculate the number of chunks
#         max_samples = max_duration * SAMPLING_RATE
#         total_samples = len(audio)
#         chunks = [audio[i:i + max_samples] for i in range(0, total_samples, max_samples)]
        
#         # Transcribe each chunk
#         transcriptions = []
#         for chunk in chunks:
#             inputs = {
#                 "raw": chunk,
#                 "sampling_rate": SAMPLING_RATE,
#             }
#             result = pipe(inputs)
#             transcriptions.append(result['text'])
        
#         # Combine transcriptions
#         # print(" ".join(transcriptions))
#         return " ".join(transcriptions)
#     except Exception as e:
#         print(f"Error processing {audio_path}: {e}")
#         return None

import time

def transcribe_audio_in_chunks(audio_path, max_duration=10):
    try:
        # Load audio file
        audio, sampling_rate = librosa.load(audio_path, sr=SAMPLING_RATE)
        
        # Calculate the number of chunks
        max_samples = max_duration * SAMPLING_RATE
        total_samples = len(audio)
        chunks = [audio[i:i + max_samples] for i in range(0, total_samples, max_samples)]
        
        # Transcribe each chunk and track inference time (in seconds)
        transcriptions = []
        start_time = time.time()
        for chunk in chunks:
            inputs = {
                "raw": chunk,
                "sampling_rate": SAMPLING_RATE,
            }
            result = pipe(inputs)
            transcriptions.append(result['text'])
        end_time = time.time()
        
        # Calculate inference time
        inference_time = end_time - start_time
        
        # Combine transcriptions
        transcription = " ".join(transcriptions)
        return transcription, inference_time
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None, None

In [22]:
def get_audio_length(audio_path, max_duration=None):
    """
    Calculate the length of an audio file in seconds.

    Parameters:
    audio_path (str): Path to the audio file.

    Returns:
    float: Length of the audio file in seconds.
    """
    try:
        # Load audio file without downsampling to get the original length
        audio, sampling_rate = librosa.load(audio_path, sr=None)  # sr=None preserves original sampling rate
        audio_length = len(audio) / sampling_rate

        if max_duration is not None:
            audio_length = min(audio_length, max_duration)
        
        return audio_length
    except Exception as e:
        print(f"Error calculating audio length for {audio_path}: {e}")
        return None

In [23]:
# df['model_text'] = df['audio_path'].apply(lambda path: transcribe_audio_in_chunks(path))

# Apply function and store results in separate columns
df[['model_text', 'inference_time']] = df['audio_path'].apply(lambda path: pd.Series(transcribe_audio_in_chunks(path)))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [24]:
df['audio_length'] = df['audio_path'].apply(lambda path: get_audio_length(path, max_duration=10))

In [25]:
df[['text', 'model_text', 'inference_time', 'audio_length']]

Unnamed: 0,text,model_text,inference_time,audio_length
0,A LOT OF SAD PARTS,LASHAPARTS,0.082763,2.146395
1,THERE IS SAD PART,EVER SAT A PARCH,0.331756,1.519456
2,AND ITS A REALLY FUNNY,AND IT'S A REALLY FUNNY,0.021338,1.658776
3,BUT IT WAS REALLY REALLY AWESOME,IT WAS REALLY REALLY ALESOME,0.016205,2.703673
4,ANYHOW IT WAS REALLY GOOD,ANYHOW IT WAS REALLY GOOD,0.022179,3.330612
...,...,...,...,...
2194,BECAUSE THERE REALLY WASNT ALL THAT MUCH TO IT...,BECAUSE THERE REALLY WASN'T ALL THAT MUCH TO I...,0.016572,3.539592
2195,UM SO IF YOU LIKE TO HEAR A UM LIKE MORE POSIT...,AN SO FEW LIKE TO HEAR A VERY ELIOR POSITIVE R...,0.019238,6.627846
2196,AND SHE REALLY ENJOYED THE FILM,AND SHE REALLY ENTERED THE FILM,0.015972,1.751655
2197,IF YOU DO WANNA SEE SOMEBODY WHOS POSSIBLY CRI...,YOU DO WANT TO SEE SOMEBODY WHO WAS PASSIVELY ...,0.018585,4.723810


In [26]:
import textPreprocess

In [27]:
df['model_processed_text'] = df['model_text'].apply(lambda x: textPreprocess.sentencePreprocess(x, remove_stopwords=False))

In [28]:
df[['processed_text', 'model_processed_text']]

Unnamed: 0,processed_text,model_processed_text
0,a lot of sad part,lashaparts
1,there is sad part,ever sat a parch
2,and it a really funny,and it is a really funny
3,but it wa really really awesome,it wa really really alesome
4,anyhow it wa really good,anyhow it wa really good
...,...,...
2194,because there really wa not all that much to i...,because there really wa not all that much to i...
2195,so if you like to hear a like more positive re...,an so few like to hear a very elior positive r...
2196,and she really enjoyed the film,and she really entered the film
2197,if you do want to see somebody who is possibly...,you do want to see somebody who wa passively c...


In [29]:
import jiwer

# Word Error Rate (WER): Measures the proportion of word substitutions, deletions, and insertions.
def calculate_wer(reference, hypothesis):
    measures = jiwer.compute_measures(reference, hypothesis)
    
    return measures['wer']

# Character Error Rate (CER): Similar to WER but at the character level.
def calculate_cer(reference, hypothesis):
    cer_value = jiwer.cer(reference, hypothesis)
    
    return cer_value

# The Real-Time Factor (RTF) is calculated as the ratio of the inference time to the audio duration for each clip. The formula is:
# RTF = Inference Time / Audio Duration
# Inference Time is the time taken by the ASR model to process the audio and generate the transcription.
# Audio Duration is the length of the audio clip in seconds.
def calculate_rtf(inference_time, audio_length):
    return inference_time/audio_length

In [30]:
df['wer'] = df.apply(lambda row: calculate_wer(row['processed_text'], row['model_processed_text']), axis=1)
df['cer'] = df.apply(lambda row: calculate_cer(row['processed_text'], row['model_processed_text']), axis=1)
df['rtf'] = df.apply(lambda row: calculate_rtf(row['inference_time'], row['audio_length']), axis=1)

In [31]:
df[['processed_text', 'model_processed_text', 'wer', 'cer', 'rtf', 'inference_time', 'audio_length']]

Unnamed: 0,processed_text,model_processed_text,wer,cer,rtf,inference_time,audio_length
0,a lot of sad part,lashaparts,1.000000,0.705882,0.038559,0.082763,2.146395
1,there is sad part,ever sat a parch,1.000000,0.588235,0.218339,0.331756,1.519456
2,and it a really funny,and it is a really funny,0.200000,0.142857,0.012864,0.021338,1.658776
3,but it wa really really awesome,it wa really really alesome,0.333333,0.161290,0.005994,0.016205,2.703673
4,anyhow it wa really good,anyhow it wa really good,0.000000,0.000000,0.006659,0.022179,3.330612
...,...,...,...,...,...,...,...
2194,because there really wa not all that much to i...,because there really wa not all that much to i...,0.083333,0.018519,0.004682,0.016572,3.539592
2195,so if you like to hear a like more positive re...,an so few like to hear a very elior positive r...,0.421053,0.252874,0.002903,0.019238,6.627846
2196,and she really enjoyed the film,and she really entered the film,0.166667,0.096774,0.009118,0.015972,1.751655
2197,if you do want to see somebody who is possibly...,you do want to see somebody who wa passively c...,0.416667,0.200000,0.003934,0.018585,4.723810


In [32]:
print("Mean WER:", df['wer'].mean())

Mean WER: 0.42043549609961123


In [33]:
print("Mean CER:", df['cer'].mean())

Mean CER: 0.2588298332930254


In [34]:
print("Mean RTF:", df['rtf'].mean())

Mean RTF: 0.007103018987016829


Real-world datasets = Far more challenging environments (e.g., noisy audio, diverse accents), good WER values can range from 10% to 30%, depending on the complexity of the data.

---

A good Mean Real-Time Factor (RTF) value depends on the specific use case, hardware, and the desired latency of the speech transcription system.
- An RTF of 1.0 means the system transcribes audio in real-time (i.e., it takes the same amount of time to transcribe as the duration of the audio).
- An RTF < 1.0 means the system processes audio faster than real-time.
- An RTF > 1.0 means the system is slower than real-time.
RTF ≤ 0.5: Excellent for real-time applications, such as live captioning or interactive systems.
RTF ~ 1.0: Acceptable for real-time or near-real-time processing.
RTF > 1.0: Suitable for offline or batch processing where real-time speed is not critical.

Real-time applications (e.g., live transcription):

A good mean RTF should be ≤ 0.5 to ensure low latency and allow for additional overhead (e.g., UI rendering).
Offline or post-processing applications:

A good mean RTF can be ~1.0 or slightly higher if latency is less critical.
High-performance systems:

With optimized models and hardware (e.g., GPUs), RTF values as low as 0.1–0.3 are achievable.

In [35]:
df.to_csv('./data2vec-base_wer.csv', index=False)