In [1]:
import pandas as pd
import os

# Path to the directory and CSV file using os.path.join for OS independence
base_dir = os.path.join('..', 'data', 'raw')
recordings_dir = os.path.join(base_dir, 'recordings')
csv_file_path = os.path.join(base_dir, 'overview-of-recordings.csv')

# Reading the CSV file with error handling
try:
    data = pd.read_csv(csv_file_path)
    data = data[['file_name', 'phrase']]
except FileNotFoundError:
    print("CSV file not found at the specified path. Please check the path and try again.")
except pd.errors.EmptyDataError:
    print("CSV file is empty. Please check the file content.")
except pd.errors.ParserError:
    print("Error parsing CSV. Please check the CSV format.")

In [2]:
import os
import string

# Check for audio file existence by combining the directory path with the file names
data['file_exists'] = data['file_name'].apply(lambda x: os.path.isfile(os.path.join(recordings_dir, x)))

# Check for invalid transcriptions
data['valid_transcription'] = data['phrase'].apply(lambda x: isinstance(x, str) and x.strip() != "")

# Normalize transcriptions by converting them to lowercase and removing trailing and leading whitespaces
data['phrase'] = data['phrase'].str.lower().str.strip()

# Remove punctuation from transcriptions
data['phrase'] = data['phrase'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))


In [3]:
# Check if there is any file name that does not exist
num_files_not_exist = data['file_exists'].value_counts().get(False, 0)
print("Number of files that do not exist: ", num_files_not_exist)

# Check for invalid transcriptions
num_invalid_transcriptions = data['valid_transcription'].value_counts().get(False, 0)
print("Number of files with invalid transcriptions: ", num_invalid_transcriptions)

Number of files that do not exist:  0
Number of files with invalid transcriptions:  0


In [4]:
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Initialize processor and model
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def prepare_dataset(row):
    try:
        # Define the sampling rate
        sampling_rate = 16000

        # Load audio file and resample it to the target sampling rate
        speech, rate = librosa.load(os.path.join('../data/raw/recordings', row["file_name"]), sr=sampling_rate)
        
        # Process audio file
        input_values = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding="longest").input_values

        # Encode transcription
        with processor.as_target_processor():
            labels = processor(row["phrase"], return_tensors="pt").input_ids

        return {
            "input_values": input_values, 
            "labels": labels
        }
    except Exception as e:
        print(f"Error processing file {row['file_name']}: {str(e)}")
        return None

# Test the function with the first row of the dataframe
row = data.iloc[0]
output = prepare_dataset(row)
print("Output: ", output)
print("Type: ", type(output))
print("Keys: ", output.keys() if output else "No keys, output is None")
print("Input values shape: ", output["input_values"].shape if output else "No shape, output is None")
print("Labels shape: ", output["labels"].shape if output else "No shape, output is None")

# Apply the function across the dataframe and create a new dataframe
# Convert the Series of dictionaries to a DataFrame
dataset = pd.DataFrame(list(data.apply(prepare_dataset, axis=1).dropna()))


Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
Y

Output:  {'input_values': tensor([[-0.0758, -0.1674, -0.1839,  ...,  0.7457,  0.6541,  0.8108]]), 'labels': tensor([[29, 14, 11, 20,  4, 15,  4, 24, 11, 19, 11, 19,  8, 11, 24,  4, 14, 11,
         24,  4, 15,  4, 12, 11, 11, 18,  4, 10, 21, 29, 20]])}
Type:  <class 'dict'>
Keys:  dict_keys(['input_values', 'labels'])
Input values shape:  torch.Size([1, 40960])
Labels shape:  torch.Size([1, 31])


In [25]:
from torch.nn.utils.rnn import pad_sequence
import torch

def collate_fn(batch):
    # Extract all input values and labels from the batch
    input_values = [torch.tensor(item['input_values']) for item in batch]
    labels = [torch.tensor(item['labels']) for item in batch]

    # Ensure each tensor is a flat sequence if it's not already
    input_values = [iv.flatten() for iv in input_values]
    labels = [l.flatten() for l in labels]

    # Pad input values and labels to the maximum length in the batch
    input_values_padded = pad_sequence(input_values, batch_first=True, padding_value=0.0)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)  # Use an ignore index or suitable padding value

    return {
        'input_values': input_values_padded,
        'labels': labels_padded
    }


In [6]:
import torch
import pandas as pd
from datasets import Dataset

# Define a function to flatten and concatenate tensors in each row
def flatten_tensors(row):
    # Check and convert list to tuple, then concatenate tensors
    input_values = torch.cat(tuple(row['input_values']), dim=0) if isinstance(row['input_values'], list) else row['input_values']
    labels = torch.cat(tuple(row['labels']), dim=0) if isinstance(row['labels'], list) else row['labels']
    return pd.Series([input_values, labels], index=['input_values', 'labels'])

# Apply the function to each row in the DataFrame
processed_data = dataset.apply(flatten_tensors, axis=1)

# Convert the processed DataFrame to a Hugging Face Dataset
hf_dataset = Dataset.from_dict({
    'input_values': processed_data['input_values'].tolist(),
    'labels': processed_data['labels'].tolist()
})

print(hf_dataset)


Dataset({
    features: ['input_values', 'labels'],
    num_rows: 6661
})


In [7]:
from datasets import Dataset, DatasetDict

# Define the split proportions
train_test_val_split = hf_dataset.train_test_split(test_size=0.1)  # 10% for testing

# Now split the remaining data into training and validation
train_val_split = train_test_val_split['train'].train_test_split(test_size=0.1111)  # About 10% of 90% for validation

# Create a DatasetDict to hold the splits conveniently
dataset_splits = DatasetDict({
    'train': train_val_split['train'],
    'validation': train_val_split['test'],
    'test': train_test_val_split['test']
})

# Now you have dataset_splits with train, validation, and test datasets
print("Training set size:", dataset_splits['train'].num_rows)
print("Validation set size:", dataset_splits['validation'].num_rows)
print("Testing set size:", dataset_splits['test'].num_rows)


Training set size: 5328
Validation set size: 666
Testing set size: 667


In [28]:
import numpy as np
import torch
from transformers import AutoModelForCTC, AutoConfig, Trainer, TrainingArguments
from datasets import load_metric

wer_metric = load_metric("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    pred.label_ids[pred.label_ids == -100] = model.config.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    # Assuming labels are already decoded in the dataset
    label_str = pred.label_ids  
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=1,
    num_train_epochs=3,
    warmup_steps=500,
    save_strategy="epoch",
    logging_dir="./logs",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits['train'],
    eval_dataset=dataset_splits['validation'],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=processor
)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [29]:
trainer.train()

  0%|          | 0/1998 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 352.00 MiB. GPU 0 has a total capacty of 23.99 GiB of which 0 bytes is free. Of the allocated memory 36.10 GiB is allocated by PyTorch, and 1.63 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF