In [1]:
import syllables
import re
from IPython.display import Audio
def play_audio(audio_path):
 
    audio = Audio(audio_path)
    display(audio)

In [2]:

import webrtcvad
from tqdm import tqdm
from pydub import AudioSegment
import matplotlib.pyplot as plt 
import os 
import numpy as np
import pandas as pd
def get_speech_length(wav_path):
    # Load the audio file
    audio = AudioSegment.from_wav(wav_path)
    # Convert the audio to 16-bit PCM format with a sample rate of 16000 Hz
    audio = audio.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    # Get the raw audio data
    raw_audio = audio.raw_data
    # Initialize the VAD object
    vad = webrtcvad.Vad()
    # Set the aggressiveness level of the VAD (1, 2, or 3, where 3 is the most aggressive)
    vad.set_mode(3)
    frame_duration = 30  # in milliseconds
    frame_length = int(audio.frame_rate * (frame_duration / 1000)) * audio.sample_width
    speech_length = 0
    for i in range(0, len(raw_audio), frame_length):
        frame = raw_audio[i:i + frame_length]
        if len(frame) == frame_length:
            if vad.is_speech(frame, audio.frame_rate):
                speech_length += frame_duration / 1000

    return speech_length

def count_tokens_in_transcript(transcript, tokenizer,use_tokenizer=False):
    
        # Open the text file and read the first line

    # Tokenize the transcript
    # tokens = tokenizer.tokenize(transcript)
    if use_tokenizer:
            
        tokens = tokenizer.encode_plus(
            transcript,
            add_special_tokens=False,
            return_tensors='np'
        )['input_ids'][0]
        # print (transcript)
        # print (tokens)
        # Count the number of tokens
        token_count = len(tokens)
        return token_count
    else:
        # print (transcript)
        transcript = transcript.replace(" ", "")
        return len(transcript )
def remove_non_word_text(transcript):
    # The regular expression pattern [^a-zA-Z\s] matches any character that is not
    # an alphabetic character (either uppercase or lowercase) or a whitespace character.
    processed_text = re.sub('[^a-zA-Z\s]', '', transcript)
    # Optionally, you can also collapse multiple consecutive spaces into a single space
    processed_text = re.sub('\s+',' ', processed_text).strip()
    return processed_text
def count_syllables_in_transcript(transcript):
    # Count the number of syllables
    syllable_count = 0 
    transcript = remove_non_word_text(transcript)
    for word in transcript.split():
        syllable_count += syllables.estimate(word)
    return syllable_count
      

def calculate_token_ratios(path_list, tokenizer, use_tokenizer=False):
    ratios = []
    log = {}
    for wav_path, text_caption in tqdm(path_list):
        speech_length = get_speech_length(wav_path)
        token_count = count_tokens_in_transcript(text_caption, tokenizer, use_tokenizer)
        if speech_length and token_count:
            ratio = speech_length / token_count
            ratios.append(ratio)
            log[os.path.basename(wav_path)] = ratio
    return ratios, log

def calculate_syllable_ratios(path_list):
    ratios = []
    log = {}
    for wav_path, text_caption in tqdm(path_list):
        speech_length = get_speech_length(wav_path)
        syllable_count = count_syllables_in_transcript(text_caption)
        if speech_length and syllable_count:
            ratio = speech_length / syllable_count
            ratios.append(ratio)
            log[os.path.basename(wav_path)] = ratio
    return ratios, log


def plot_ratios(ratios):
    if not ratios:
        print("No valid ratios were calculated.")
        return

    # Use quantiles to divide the ratios into 5 balanced categories
    quantiles = np.quantile(ratios, [0, 0.3, 0.6, 1])
    category_counts = np.histogram(ratios, bins=quantiles)[0]

    # Create category labels
    category_labels = []
    for i in range(len(quantiles) - 1):
        label = f"{quantiles[i]:.2f}-{quantiles[i + 1]:.2f}"
        category_labels.append(label)

    # Plot the distribution of ratios in 5 categories
    plt.bar(category_labels, category_counts)
    plt.xlabel('Speech Length to Token Ratio Categories')
    plt.ylabel('Frequency')
    plt.title('Distribution of Speech Length to Token Ratios in 5 Balanced Categories')
    plt.xticks(rotation=45)
    plt.show()

def sameSpk_differentSpeed(speed_log, slow_threshold=0.3, fast_threshold=0.17):
    """
    speed_log: {
        wav_filename: speed_ratio
    }
    ->
    {
        spk_name: {
            'slow': [wav_filename],
            'fast': [wav_filename]
        }
    }
    """
    spk_dict = {}
    for wav_filename, speed_ratio in speed_log.items():
        spk_name = wav_filename.split('_')[0]
        if spk_name not in spk_dict:
            spk_dict[spk_name] = {
                'slow': [],
                'fast': [],
                'medium': [],
            }
        if speed_ratio > slow_threshold:
            spk_dict[spk_name]['slow'].append(wav_filename)
        elif speed_ratio < fast_threshold:
            spk_dict[spk_name]['fast'].append(wav_filename)
        else:
            spk_dict[spk_name]['medium'].append(wav_filename)
    return spk_dict


#### Get (wav_path, text caption) pairs

In [3]:
path_list = []
### Genshin/wav stores wav format audio files in 16khz
# Genshin/metadata.csv stores the corresponding text captions in format of 'Name|caption'
wav_folder = '/share5/users/jiachuan/data/llasa_ft_data/Genshin/wav'
txt_csv = '/share5/users/jiachuan/data/llasa_ft_data/Genshin/metadata.csv'
column_names = ['Name', 'caption']
meta_df = pd.read_csv(txt_csv,header=None,sep='|',names=column_names)

for wav_f in tqdm(os.listdir(wav_folder)[:]):
    if not wav_f.endswith('.wav'):
        continue
    wav_path = os.path.join(wav_folder, wav_f)
    query_name = wav_f.replace('.wav', '')
    text_caption_result = meta_df.query(f"Name == '{query_name}'")
    if not text_caption_result.empty:
        text_caption = text_caption_result['caption'].values[0]
        if len(text_caption.split()) > 5: # we only keep the audio with more than 5 words
            path_list.append((wav_path, text_caption))

# log = calculate_and_plot_ratios(path_list, tokenizer,True)

100%|██████████| 125581/125581 [07:17<00:00, 286.93it/s]


#### Calculate Sec Per Syllable for each pair

In [4]:
# count the number of syllables for each audio sample
sps_list, log = calculate_syllable_ratios(path_list)

 38%|███▊      | 41201/107406 [08:57<14:22, 76.72it/s]   


KeyboardInterrupt: 

#### Only select speakers with more than 2 fast and slow pairs

In [None]:
spk_speed_dict = sameSpk_differentSpeed(log)

In [None]:
valid_spk = set()
for spk in spk_speed_dict:
    if len(spk_speed_dict[spk]['slow']) > 2 and len(spk_speed_dict[spk]['fast']) > 2:
        valid_spk .add(spk)
len(valid_spk)

#### Sort and store  the selected pairs based on different speeds

In [None]:
### store valid speakers and corresponding audio files with different speed


"""
The stored folder structure is as follows:
speaker_withDifferent_speed
    spk1
        slow.txt
        fast.txt
        medium.txt
    spk2
        ...
"""

speaker_withDifferent_speed_folder = '/share5/users/jiachuan/data/llasa_ft_data/Genshin/speaker_withDifferent_speed'

if not os.path.exists(speaker_withDifferent_speed_folder):
    os.makedirs(speaker_withDifferent_speed_folder)
for spk in valid_spk:
    spk_folder = os.path.join(speaker_withDifferent_speed_folder, spk)
    if not os.path.exists(spk_folder):
        os.makedirs(spk_folder)
    for speed in ['slow', 'fast', 'medium']:
        with open(os.path.join(spk_folder, f'{speed}.txt'), 'w') as f:
            for wav_f in spk_speed_dict[spk][speed]:
                f.write(wav_f + '\n')
         
