In [None]:
import librosa
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def extract_mfcc_features(audio_path, start_time, end_time, sr=None):
    """
    Extract 39 MFCC features (13 MFCCs + 13 Delta + 13 Delta-Delta) from a specific segment of an audio file.

    Parameters:
    - audio_path: Path to the audio file.
    - start_time: Start time of the word segment in seconds (must be >= 0).
    - end_time: End time of the word segment in seconds (must be > start_time).
    - sr: Target sample rate handed to librosa.load. If None, the file's native
      sample rate is kept and no resampling takes place.

    Returns:
    - mfcc_features: A numpy array with 39 rows (13 MFCCs, then 13 deltas, then
      13 delta-deltas stacked along axis 0) and one column per analysis frame.

    Raises:
    - ValueError: If the interval is negative, empty or reversed, or if it selects
      no samples from the loaded audio.
    """
    # Reject bad intervals up front: a negative start would otherwise be treated
    # as an index counted from the end of the signal and silently pick the wrong audio.
    if start_time < 0 or end_time <= start_time:
        raise ValueError(
            f"Invalid segment [{start_time}, {end_time}]: need 0 <= start_time < end_time"
        )

    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sr)

    # Extract the segment
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)
    word_segment = y[start_sample:end_sample]
    if word_segment.size == 0:
        raise ValueError(
            f"Segment [{start_time}, {end_time}] selects no samples from {audio_path}"
        )

    # Compute 13 MFCCs
    mfccs = librosa.feature.mfcc(y=word_segment, sr=sr, n_mfcc=13)

    # Compute Delta and Delta-Delta features
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

    # Concatenate to get 39 features
    mfcc_features = np.concatenate((mfccs, mfcc_delta, mfcc_delta2), axis=0)

    return mfcc_features

In [None]:
# Example usage
# Hard-coded absolute path to a single FLAC file; point it at your own copy of the corpus.
audio_path = '/home/ldap-users/Share/Data/librispeech/train-clean-100/19/198/19-198-0000.flac'
start_time = 1.0  # Start time of the word in seconds
end_time = 1.3    # End time of the word in seconds

# `sr` is left at its default (None). The result stacks MFCC, delta and
# delta-delta rows, and is reused by the plotting cell below.
mfcc_features = extract_mfcc_features(audio_path, start_time, end_time)
print("MFCC Features Shape:", mfcc_features.shape)

In [None]:
# Plotting
# specshow lives in the librosa.display submodule; import it explicitly so the
# cell does not depend on `import librosa` having loaded it as a side effect.
import librosa.display

plt.figure(figsize=(10, 4))

# Show the stacked feature matrix returned by extract_mfcc_features
# (MFCC rows first, then delta, then delta-delta).
# sr must match the rate the features were extracted at; 16000 is hard-coded here.
librosa.display.specshow(mfcc_features, x_axis='time', sr=16000, cmap='viridis')

# MFCC and delta coefficients are not decibel values, so the colour bar is left unit-less.
plt.colorbar(format='%+2.0f')
plt.title('MFCC')
plt.tight_layout()
plt.show()

In [None]:
import json

def extract_mfcc_features(audio_path, start_time, end_time, sr=None):
    """
    Extract 39 MFCC features (13 MFCCs + 13 Delta + 13 Delta-Delta) from a specific segment of an audio file.

    Parameters:
    - audio_path: Path to the audio file.
    - start_time: Start time of the word segment in seconds (must be >= 0).
    - end_time: End time of the word segment in seconds (must be > start_time).
    - sr: Target sample rate handed to librosa.load. If None, the file's native
      sample rate is kept and no resampling takes place.

    Returns:
    - mfcc_features: A numpy array with 39 rows (13 MFCCs, then 13 deltas, then
      13 delta-deltas stacked along axis 0) and one column per analysis frame.

    Raises:
    - ValueError: If the interval is negative, empty or reversed, or if it selects
      no samples from the loaded audio.
    """
    # Reject bad intervals up front: a negative start would otherwise be treated
    # as an index counted from the end of the signal and silently pick the wrong audio.
    if start_time < 0 or end_time <= start_time:
        raise ValueError(
            f"Invalid segment [{start_time}, {end_time}]: need 0 <= start_time < end_time"
        )

    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sr)

    # Extract the segment
    start_sample = int(start_time * sr)
    end_sample = int(end_time * sr)
    word_segment = y[start_sample:end_sample]
    if word_segment.size == 0:
        raise ValueError(
            f"Segment [{start_time}, {end_time}] selects no samples from {audio_path}"
        )

    # Compute 13 MFCCs
    mfccs = librosa.feature.mfcc(y=word_segment, sr=sr, n_mfcc=13)

    # Compute Delta and Delta-Delta features
    mfcc_delta = librosa.feature.delta(mfccs)
    mfcc_delta2 = librosa.feature.delta(mfccs, order=2)

    # Concatenate to get 39 features
    mfcc_features = np.concatenate((mfccs, mfcc_delta, mfcc_delta2), axis=0)

    return mfcc_features

# Tunables for this run: the shortest interval (in seconds) worth extracting,
# and how many extracted segments to stop after.
MIN_SEGMENT_SECONDS = 0.5
MAX_ITEMS = 10

# Load the word -> {audio_path: [interval, ...]} mapping
with open('Temp_data/new_updated_processed_data.json', 'r') as file:
    data = json.load(file)

items_to_remove = []
items_processed = 0

output_data = {}

# Process each word and its audio segments
for word, sub_json in data.items():
    for audio_path, intervals in sub_json.items():
        remove_sub_json = False
        # Only the first list-valued interval of each audio file is looked at:
        # it is either extracted or, when too short, marks the file for removal.
        for interval in intervals:
            if isinstance(interval, list):
                start_time, end_time = interval
                if end_time - start_time >= MIN_SEGMENT_SECONDS:
                    # Extract the MFCC features for this segment
                    mfcc_features = extract_mfcc_features(audio_path, start_time, end_time)
                    # Keyed by word only, so a later audio file of the same
                    # word overwrites the features stored for an earlier one.
                    output_data[word] = mfcc_features.tolist()
                    items_processed += 1
                    print(f"Processed: {items_processed}")
                else:
                    remove_sub_json = True
                break

        if remove_sub_json:
            items_to_remove.append((word, audio_path))

    # `>=` rather than `==`: one word can add several items, so the counter
    # may step over the limit without ever being exactly equal to it.
    if items_processed >= MAX_ITEMS:
        break

# Remove the marked items from data (this pruned mapping is not written back below)
for word, audio_path in items_to_remove:
    # Check if word is still in data and has the specific audio_path, then remove it
    if word in data and audio_path in data[word]:
        del data[word][audio_path]
        # If this was the last audio_path for the word, remove the word entry as well
        if not data[word]:
            del data[word]

# Save the extracted features to a new JSON file
with open('Temp_data/mfcc_processed_data.json', 'w') as file:
    json.dump(output_data, file, indent=2)


## This part is for processing the data

In [None]:
import sys
from pathlib import Path

parent_dir = "/home/ldap-users/s2210403"
sys.path.append(f'{parent_dir}/VG-HuBERT')

import torch
import soundfile as sf
import os
import pickle
from models import audio_encoder
from itertools import groupby
from operator import itemgetter

# Checkpoint directory, an example utterance, and the two settings that
# get_segmented reads from module scope.
model_path = "/home/ldap-users/s2210403/VG-HuBERT/vg-hubert_3"
wav_file = "/home/ldap-users/Share/Data/librispeech/train-clean-100/19/198/19-198-0037.flac"
tgt_layer = 9  # forwarded to the model call as `tgt_layer`
threshold = 0.7  # forwarded to cls_attn_seg

# setup model
# NOTE(review): pickle.load can execute arbitrary code; only use args.pkl from a trusted checkpoint.
with open(os.path.join(model_path, "args.pkl"), "rb") as f:
    model_args = pickle.load(f)
model = audio_encoder.AudioEncoder(model_args)
# NOTE(review): no map_location is given; presumably the checkpoint loads onto the device it was saved from — confirm.
bundle = torch.load(os.path.join(model_path, "best_bundle.pth"))
# Only the 'dual_encoder' entry of the bundle is loaded into the audio encoder.
model.carefully_load_state_dict(bundle['dual_encoder'], load_all=True)
model.eval()
# Moves the model to the GPU unconditionally, so this cell needs CUDA.
model = model.cuda()

def get_segmented(audio_file):
    """
    Run the audio encoder on one file and segment it from the CLS attention.

    Relies on the module-level `model`, `tgt_layer` and `threshold`.
    NOTE(review): `cls_attn_seg` is not defined or imported in the visible part
    of this notebook; it must already be in scope when this function runs.

    Parameters:
    - audio_file: Path to an audio file readable by soundfile; it must be sampled at 16 kHz.

    Returns:
    - Whatever cls_attn_seg returns. Other cells index it with the keys
      "attn_boundary_intervals" and "word_boundary_intervals".
    """
    # load waveform (do not layer normalize the waveform!)
    waveform, sr = sf.read(audio_file, dtype = 'float32')
    assert sr == 16000
    duration_sec = len(waveform) / sr
    batch = torch.from_numpy(waveform).unsqueeze(0).cuda()  # [T] -> [1, T]

    # model forward (inference only, no gradients)
    with torch.no_grad():
        model_out = model(
            batch,
            padding_mask=None,
            mask=False,
            need_attention_weights=True,
            tgt_layer=tgt_layer,
        )

    # Drop the first position (the CLS slot, per the slicing below) from the features.
    frame_feats = model_out['features'].squeeze(0)[1:]  # [1, T+1, D] -> [T, D]
    # Seconds of audio covered by one feature frame.
    seconds_per_frame = batch.shape[-1] / sr / frame_feats.shape[-2]

    # [1, num_heads, T+1, T+1] -> [num_heads, T+1, T+1] (target length first, then source)
    per_head_attn = model_out['attn_weights'].squeeze(0)
    # Attention from position 0 to every later position: [num_heads, T]
    cls_to_frames = per_head_attn[:, 0, 1:]

    # Result contains attn boundaries and word boundaries in intervals
    return cls_attn_seg(cls_to_frames, threshold, seconds_per_frame, duration_sec)



In [None]:
# Read the processed data JSON file
with open('updated_processed_data.json', 'r') as file:
    data = json.load(file)

success = 0

# Process each word and its sub-jsons
for word, sub_json in data.items():
    for audio_key, value_indices in sub_json.items():
        # An empty index list has nothing to look up, and max() below would
        # raise on it; skip before paying for a model forward pass.
        if not value_indices:
            print(f"Error: No word indices recorded for {audio_key}; skipping.")
            continue

        # Call the get_segmented function with the audio path
        result = get_segmented(audio_key)
        segmented_list = result["attn_boundary_intervals"]

        # Ensure the list has enough elements
        if not segmented_list or max(value_indices) >= len(segmented_list):
            print(f"Error: The list returned by get_segmented for {audio_key} does not have an index {value_indices}.Length is {len(segmented_list)}")
            continue

        # Replace the indices with the corresponding segments. Overwriting an
        # existing key while iterating is safe because the dict size is unchanged.
        success += 1
        sub_json[audio_key] = [segmented_list[index] for index in value_indices]
        print(f"Success: {success}")


In [None]:
# `out` only exists as a local inside get_segmented, so compute it here for the
# example utterance; otherwise this cell raises NameError on a fresh kernel.
out = get_segmented(wav_file)
print(len(out["attn_boundary_intervals"]), "   ", out["attn_boundary_intervals"])
print(len(out["word_boundary_intervals"]), "   ", out["word_boundary_intervals"])

In [None]:
import json

# Root directory under which the audio files are stored
base_dir = '/home/ldap-users/Share/Data/librispeech/train-clean-100'

# Load the mapping produced by the key-building step
with open('processed_data.json', 'r') as file:
    data = json.load(file)

# Rebuild every audio path as <base_dir>/<part 1>/<part 2>/<file name>
for word, sub_json in data.items():
    rebuilt = {}
    for audio_key, transcript_index in sub_json.items():
        # Keep only the file name: strip base_dir, then everything up to the last "/"
        file_name = audio_key.replace(base_dir, '').rpartition("/")[2]
        pieces = file_name.split('-')
        if len(pieces) != 3:
            print(f"Unexpected audio key format: {audio_key}")
            continue
        # The first two dash-separated fields name the two directory levels
        top_dir, sub_dir = pieces[0], pieces[1]
        rebuilt[f"{base_dir}/{top_dir}/{sub_dir}/{file_name}"] = transcript_index
    # Swap in the rebuilt mapping for this word
    data[word] = rebuilt

# Persist the result under a new file name
with open('updated_processed_data.json', 'w') as file:
    json.dump(data, file, indent=2)


In [None]:
import json
import os

# Directory used as the prefix of every audio path built in the processing cell below.
# NOTE(review): another cell in this notebook binds `parent_dir` to a different
# directory (used for sys.path); the value depends on which cell ran last.
parent_dir = '/home/ldap-users/Share/Data/librispeech/train-clean-100'
# Input JSON; the processing cell iterates it as word -> {audio key: transcript}.
json_file = '/home/ldap-users/s2210403/Multi-View-AWE/buckeye_words_found_in_libri100_with_sentences.json'

In [None]:
# Read the JSON file named by `json_file` into `data`, which the processing cell below iterates.
with open(json_file, 'r') as file:
    data = json.load(file)

In [None]:
# Process the JSON data: turn each "<a>-<b>-<c>" audio key into a full file path
# and each transcript into the list of positions where the word occurs.
new_data = {}
for word, sub_json in data.items():
    target = word.upper()  # hoisted: the comparison below is case-insensitive
    new_sub_json = {}
    for audio_key, transcript in sub_json.items():
        # Split the audio_key into its parts
        parts = audio_key.split('-')
        if len(parts) != 3:
            # Skip malformed keys instead of failing the whole run on the unpack.
            print(f"Unexpected audio key format: {audio_key}")
            continue
        first_part, second_part, _ = parts
        # Files sit directly in <parent_dir>/<first>/<second>/; the third field is
        # part of the file name only, not an extra directory level.
        new_key = f"{parent_dir}/{first_part}/{second_part}/{audio_key}.flac"
        # Find the index of the word in the transcript
        word_indices = [
            index
            for index, w in enumerate(transcript.split())
            if w.upper() == target
        ]
        # Update the sub-json with the new key and the word index
        new_sub_json[new_key] = word_indices
    # Update the new data dictionary
    new_data[word] = new_sub_json

In [None]:
# Write the processed data back to a new JSON file
# (`new_data` maps each word to {audio path: list of word indices}).
with open('processed_data.json', 'w') as file:
    json.dump(new_data, file, indent=2)

In [None]:
import json
import numpy as np

In [None]:
# Read back the per-word feature lists written by the extraction step
with open('Temp_data/mfcc_processed_data.json', 'r') as file:
    data = json.load(file)

# Convert every nested list to a numpy array, keyed by its word
mfcc_arrays = dict((label, np.array(rows)) for label, rows in data.items())

# Each word becomes one named array inside the archive.
# NOTE(review): keys are passed as keyword arguments, so a word equal to one of
# np.savez's own parameter names (e.g. "file") would presumably clash — confirm.
np.savez('Temp_data/mfcc_features.npz', **mfcc_arrays)

In [None]:
import numpy as np

# Load the .npz file; the context manager closes it even if printing fails part-way
with np.load('Temp_data/mfcc_features.npz') as npz_file:
    # Iterate over items and print
    for word, features in npz_file.items():
        print(f"Word: {word}")
        print("Features:", features)
        # If you only want to print the shape or a summary, you can replace the above line with:
        # print("Shape of features:", features.shape)


In [None]:
import numpy as np
import string

def word_to_char_embeddings(word):
    """
    One-hot encode the English letters of a word.

    The word is lower-cased and every character outside a-z (digits,
    punctuation, accented or non-Latin letters) is dropped, so each remaining
    character maps safely onto one of the 26 columns.

    Parameters:
    - word: The string to encode.

    Returns:
    - embedding: An int8 numpy array of shape (number of kept letters, 26) with
      a single 1 per row, in column 0 for 'a' through column 25 for 'z'.
    """
    # Lower-case first, then keep only a-z. Filtering with str.isalpha would let
    # letters such as 'é' through and index past the 26 columns.
    letters = [char for char in word.lower() if char in string.ascii_lowercase]

    # Initialize the embedding matrix with zeros
    embedding = np.zeros((len(letters), 26), dtype=np.int8)
    for i, char in enumerate(letters):
        # 0-based column index relative to 'a'
        embedding[i, ord(char) - ord('a')] = 1
    return embedding

# Initialize lists to hold the MFCC features and character embeddings
mfcc_features_list = []
char_embeddings_list = []

# Load the .npz file; the context manager closes it even if processing fails
with np.load('Temp_data/mfcc_features.npz') as npz_file:
    # Iterate over items in the .npz file (the array name is the word)
    for word, features in npz_file.items():
        mfcc_features_list.append(features)
        char_embeddings_list.append(word_to_char_embeddings(word))

# Zero-pad every feature matrix on the right (axis 1) up to the longest one
max_length = max(features.shape[1] for features in mfcc_features_list)
padded_mfcc_features_list = [
    np.pad(features, ((0, 0), (0, max_length - features.shape[1])), mode='constant', constant_values=0)
    for features in mfcc_features_list
]

# After padding all matrices share one shape, so stack them into a regular
# numeric array; an object array would force a pickled .npy for no benefit.
mfcc_features_array = np.stack(padded_mfcc_features_list)

# The character embeddings stay ragged (one row per letter). Fill a 1-D object
# array element by element so the layout is the same whether or not the words
# happen to have equal length.
char_embeddings_array = np.empty(len(char_embeddings_list), dtype=object)
for i, char_embedding in enumerate(char_embeddings_list):
    char_embeddings_array[i] = char_embedding

# Save the arrays to .npy files
np.save('input1.npy', mfcc_features_array)
np.save('input2.npy', char_embeddings_array)

In [56]:
import numpy as np

# Inspection cell: prints the shape of every array stored in Data/test.npz.
# The padding logic is commented out, so nothing is modified or saved here.

# Load the .npz file
# NOTE(review): the archive stays open after this cell because the close() call below is commented out.
npz_file = np.load('Data/test.npz')

# Determine the maximum width among all MFCC feature arrays
# (only needed by the commented-out padding code below)
max_width = max(features.shape[1] for features in npz_file.values())

# Initialize a dictionary to hold the padded MFCC features
# (stays empty while the padding code is disabled)
padded_mfcc_features = {}

# Iterate over items in the .npz file and pad each MFCC feature array
for word, features in npz_file.items():
    # Calculate the padding width for the current array
    # padding_width = max_width - features.shape[1]
    # # Pad the array on the right (along the second axis)
    # padded_features = np.pad(features, pad_width=((0, 0), (0, padding_width)), mode='constant', constant_values=0)
    # # Store the padded features in the dictionary
    # padded_mfcc_features[word] = padded_features
    print(features.shape)

# Optionally, save the padded MFCC feature arrays to a new .npz file
# np.savez('padded_mfcc_features.npz', **padded_mfcc_features)
# # Remember to close the loaded .npz file
# npz_file.close()

(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(39, 37)
(