In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import warnings 
import string
from mpl_toolkits.mplot3d import Axes3D  # Import the 3D plotting tool
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering, DBSCAN, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
import librosa
warnings.filterwarnings('ignore')

# Path to the directory and CSV file
raw_data_dir = '..\\data\\raw\\recordings'
csv_file_path = '..\\data\\raw\\overview-of-recordings.csv'

data = pd.read_csv('..\\data\\raw\overview-of-recordings.csv')

data = data[['file_name','phrase']]

In [26]:
data.head()

Unnamed: 0,file_name,phrase
0,1249120_43453425_58166571.wav,When I remember her I feel down
1,1249120_43719934_43347848.wav,When I carry heavy things I feel like breaking...
2,1249120_43719934_53187202.wav,there is too much pain when i move my arm
3,1249120_31349958_55816195.wav,My son had his lip pierced and it is swollen a...
4,1249120_43719934_82524191.wav,My muscles in my lower back are aching


In [27]:
# Check for audio file existence by combining the directory path with the file names
data['file_exists'] = data['file_name'].apply(lambda x: os.path.isfile(os.path.join(raw_data_dir, x)))

# Check for invalid transcriptions
data['valid_transcription'] = data['phrase'].apply(lambda x: isinstance(x, str) and x.strip() != "")

# Normalize transcriptions by converting them to lowercase and removing trailing and leading whitespaces
data['phrase'] = data['phrase'].str.lower().str.strip()

# Remove punctuation from transcriptions
data['phrase'] = data['phrase'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [28]:
data.head()

Unnamed: 0,file_name,phrase,file_exists,valid_transcription
0,1249120_43453425_58166571.wav,when i remember her i feel down,True,True
1,1249120_43719934_43347848.wav,when i carry heavy things i feel like breaking...,True,True
2,1249120_43719934_53187202.wav,there is too much pain when i move my arm,True,True
3,1249120_31349958_55816195.wav,my son had his lip pierced and it is swollen a...,True,True
4,1249120_43719934_82524191.wav,my muscles in my lower back are aching,True,True


In [29]:
# Check if there is any file name that does not exist
num_files_not_exist = data['file_exists'].value_counts().get(False, 0)
print("Number of files that do not exist: ", num_files_not_exist)

# Check for invalid transcriptions
num_invalid_transcriptions = data['valid_transcription'].value_counts().get(False, 0)
print("Number of files with invalid transcriptions: ", num_invalid_transcriptions)

Number of files that do not exist:  0
Number of files with invalid transcriptions:  0


In [30]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")

def prepare_dataset(row):
    # Define the sampling rate
    sampling_rate = 16000
    
    # Load audio file and resample it to the target sampling rate
    speech, rate = librosa.load(f'../data/raw/recordings/{row["file_name"]}', sr=sampling_rate)
    
    # Process audio file
    input_values = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding="longest").input_values  # Batch size 1 because only one file is processed at a time
    
    # Encode transcription
    with processor.as_target_processor():
        labels = processor(row["phrase"], return_tensors="pt").input_ids
    
    return {
        "input_values": input_values, 
        "labels": labels
    }


# Apply the function across the dataframe and create a new dataframe
dataset = data.apply(prepare_dataset, axis=1)

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
Y

In [31]:
import torch
from torch.cuda import is_available

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Send the model to the device
model.to(device)

print(f"Using device: {device}")


Using device: cuda


In [32]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    """
    The collate_fn function is used to merge a list of samples to form a mini-batch.
    """
    input_values = [item['input_values'] for item in batch]
    labels = [item['labels'] for item in batch]

    input_values = processor.pad(input_values, return_tensors="pt")
    labels = processor.pad(labels, return_tensors="pt")

    # Move batch to the same device as the model
    input_values = input_values.to(device)
    labels = labels.to(device)

    return {"input_values": input_values.input_values, "labels": labels.input_ids}


In [33]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="..\models",
    group_by_length=True,  # group by lengths of input lengths to minimize padding
    per_device_train_batch_size=8,
    gradient_accumulation_steps=16,  # Adjust based on the GPU memory
    evaluation_strategy="steps",
    num_train_epochs=3,
    fp16=True,  # Use mixed precision to speed up training (only works on GPUs with Tensor Cores)
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-4,
    warmup_steps=500,
    save_total_limit=2,
)

In [34]:
import torch

# Assuming train_data is your original DataFrame
# Convert train_data to list of dictionaries if not already done
dataset = dataset.to_dict(orient='records')

# Ensure all data types are correct
for item in dataset:
    item['input_values'] = torch.tensor(item['input_values'], dtype=torch.float32)  # Convert input_values to tensor
    item['labels'] = torch.tensor(item['labels'], dtype=torch.long)  # Convert labels to tensor

# Now convert to Hugging Face dataset
from datasets import Dataset
dataset = Dataset.from_dict(dataset)



from sklearn.model_selection import train_test_split

# Assuming 'dataset' is a list of dictionaries
train_size = 0.8 # 80% for training
valid_size = 0.1 # 10% for validation
test_size = 0.1  # 10% for testing

# First, split into train+valid and test
train_valid_data, test_data = train_test_split(dataset, test_size=test_size, random_state=42)

# Now split the train+valid into train and valid
train_data, valid_data = train_test_split(train_valid_data, test_size=valid_size / (1 - test_size), random_state=42)


TypeError: Series.to_dict() got an unexpected keyword argument 'orient'

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,  # Use the dataset directly
    eval_dataset=valid_data,   # Use the dataset directly
    data_collator=collate_fn,
    tokenizer=processor,
)

# Start training
trainer.train()


KeyError: 0