In [1]:
import whisper
import os
import shutil
from zipfile import ZipFile
# Load the Whisper "tiny.en" checkpoint once, when this cell runs, so that
# speech_to_text() can reuse the same model instance for every clip.
whisper_model = whisper.load_model("tiny.en")

def speech_to_text(audio_file):
    """Transcribe ``audio_file`` with the module-level ``whisper_model``.

    :param audio_file: Audio input handed straight to ``whisper_model.transcribe``.
    :return: The ``'text'`` entry of the transcription result, with leading and
             trailing whitespace removed.
    """
    return whisper_model.transcribe(audio_file)['text'].strip()


from pydub import AudioSegment
from pydub.silence import split_on_silence
from pydub import AudioSegment
from IPython.display import Audio
from IPython.core.display import display
import shutil
import os

def remove_silence(file_path, silence_threshold=0.1):
    """
    Strip silent stretches from a WAV file and save the result as a 44.1 kHz WAV.

    The processed audio is written to ``{base_path}/remove_silence/<input base name>``,
    where ``base_path`` is a module-level global that must be defined before this
    function is called. The folder is created when missing.

    :param file_path: Path to the input audio file (decoded as WAV).
    :param silence_threshold: Seconds of silence to keep around each non-silent chunk.
                              0.0 removes all silence; 1.0 keeps up to one second.
                              Negative values are treated as 0.0.
    :return: Path of the processed file that was written.
    """
    # Reading a global needs no ``global`` statement; base_path is only read here.
    store_path = f"{base_path}/remove_silence"
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(store_path, exist_ok=True)
    output_path = f"{store_path}/{os.path.basename(file_path)}"
    audio_format = "wav"

    # Reading the audio file
    sound = AudioSegment.from_file(file_path, format=audio_format)

    # pydub expects keep_silence in whole milliseconds, so convert from seconds
    # and round instead of passing a float through.
    keep_silence_ms = int(round(max(0.0, silence_threshold) * 1000))

    # Split the audio on silence
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=1,  # Smallest stretch of silence considered (ms)
                                    silence_thresh=-45,  # Level passed to pydub as the silence cut-off
                                    keep_silence=keep_silence_ms)

    # Combine chunks into one audio segment
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined = combined.set_frame_rate(44100)

    # Export the processed audio
    combined.export(output_path, format=audio_format)
    return output_path


def get_last_index(dataset_path):
    """Return the highest numeric index used by ``<index>.wav`` files in a folder.

    Only entries whose extension is exactly ``.wav`` and whose stem consists
    solely of decimal digits (e.g. ``000012.wav``) are considered; every other
    entry (``metadata.list``, ``.txt`` transcripts, arbitrarily named wavs) is
    ignored instead of raising.

    :param dataset_path: Existing directory to scan (not recursive).
    :return: The largest index found, or 0 when no numbered ``.wav`` exists.
             Note that 0 is therefore returned both for "nothing there" and
             for "only 000000.wav exists"; callers that need to tell the two
             apart must check for the file themselves.
    """
    indices = []
    for entry in os.listdir(dataset_path):
        stem, ext = os.path.splitext(entry)
        # isdecimal() guarantees int() succeeds and skips names like "notes.wav".
        if ext == '.wav' and stem.isdecimal():
            indices.append(int(stem))
    # Compare numerically rather than lexicographically so un-padded or
    # longer-than-six-digit names still yield the true maximum.
    return max(indices, default=0)

from pydub import AudioSegment

def convert_to_44100hz(input_wav):
    """Resample a WAV file to 44100 Hz and return the path of the new copy.

    The copy is written to ``{base_path}/remove_silence/44100hz_<input base name>``;
    ``base_path`` is a module-level global read at call time, and the target
    folder is created when it does not exist yet.

    :param input_wav: Path of the WAV file to resample.
    :return: Path of the resampled WAV file.
    """
    target_dir = f"{base_path}/remove_silence"
    if os.path.exists(target_dir) is False:
        os.makedirs(target_dir)
    source_name = os.path.basename(input_wav)
    output_wav = f"{target_dir}/44100hz_{source_name}"

    # Load -> resample -> write, as one pipeline.
    resampled = AudioSegment.from_wav(input_wav).set_frame_rate(44100)
    resampled.export(output_wav, format="wav")
    return output_wav
# Example usage (the result is written to f"{base_path}/remove_silence/44100hz_input.wav"):
# output_path = convert_to_44100hz("input.wav")


def make_zip(dataset_folder,zip_path):
    """Pack every file under ``dataset_folder`` into a new zip archive.

    Entries are stored with paths relative to ``dataset_folder`` and are added
    in sorted order so the archive layout is reproducible. An existing file at
    ``zip_path`` is replaced. If ``zip_path`` itself lies inside
    ``dataset_folder`` it is skipped, so the archive never tries to contain
    itself while it is being written.

    :param dataset_folder: Directory whose files (recursively) are archived.
    :param zip_path: Destination path of the zip file.
    """
    if os.path.exists(zip_path):
        os.remove(zip_path)
    archive_abs = os.path.normcase(os.path.abspath(zip_path))
    with ZipFile(zip_path, 'w') as zipf:
        for root, dirs, files in os.walk(dataset_folder):
            # Sorting in place also fixes the order in which os.walk descends.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                if os.path.normcase(os.path.abspath(file_path)) == archive_abs:
                    # The archive being written is visible to os.walk; skip it.
                    continue
                arcname = os.path.relpath(file_path, dataset_folder)
                zipf.write(file_path, arcname=arcname)
                
def make_dataset(audio_folder,voice_name,language_code_name,update_dataset,no_silence,silence_threshold):
    """Build a transcribed speech dataset from a folder of WAV clips and zip it.

    Reads the module-level global ``base_path`` (it must be defined before the
    call) and works inside ``{base_path}/dataset``:

    * ``<VOICE>_with_text/`` receives, for every accepted clip, a numbered
      ``NNNNNN.wav``, its transcript ``NNNNNN.txt`` and a line
      ``NNNNNN.wav|<lang>-<voice>|<LANG>|<text>`` in ``metadata.list``.
    * ``<VOICE>/`` is rebuilt from that folder without the ``.txt`` files and
      then archived as ``<VOICE>.zip``.

    A clip is accepted only when its transcript has at least three words.

    :param audio_folder: Folder containing the source ``.wav`` clips.
    :param voice_name: Speaker name; surrounding whitespace is ignored.
    :param language_code_name: Language code written to the metadata (e.g. ``"EN"``).
    :param update_dataset: True appends to an existing dataset, continuing the
                           numbering after the highest existing index; False
                           deletes and rebuilds it starting from 000000.
    :param no_silence: True runs ``remove_silence`` on each clip, False only
                       resamples it via ``convert_to_44100hz``.
    :param silence_threshold: Forwarded to ``remove_silence`` when ``no_silence`` is True.
    :return: Path of the created zip archive.
    """
    dataset_folder = f"{base_path}/dataset"
    os.makedirs(dataset_folder, exist_ok=True)

    # Strip once so the folder name and the metadata speaker id agree.
    speaker = voice_name.strip()
    original_dataset_folder = f"{dataset_folder}/{speaker.upper()}"
    dataset_path = f"{original_dataset_folder}_with_text"
    model_name = f"{language_code_name.lower()}-{speaker.lower()}"

    next_index = 0
    if update_dataset:
        os.makedirs(dataset_path, exist_ok=True)
        next_index = get_last_index(dataset_path)
        # get_last_index reports the highest index already in use (and 0 for an
        # empty folder). Step past it when that file really exists, otherwise
        # the first new clip would overwrite the last existing one.
        if os.path.exists(f"{dataset_path}/{next_index:06d}.wav"):
            next_index += 1
    else:
        if os.path.exists(dataset_path):
            shutil.rmtree(dataset_path)
        os.makedirs(dataset_path)

    metadata_text_file = f"{dataset_path}/metadata.list"
    mode = 'a' if update_dataset else 'w'
    # Explicit UTF-8: the platform default encoding can fail on non-ASCII transcripts.
    with open(metadata_text_file, mode, encoding="utf-8") as f:
        # Sorted so index assignment does not depend on directory listing order.
        for i in sorted(os.listdir(audio_folder)):
            if not i.endswith(".wav"):
                continue
            audio_path = f"{audio_folder}/{i}"
            text = speech_to_text(audio_path)
            # Skip clips whose transcript is shorter than three words.
            if len(text.split()) < 3:
                continue

            f_name = f"{next_index:06d}"
            next_index += 1

            with open(f"{dataset_path}/{f_name}.txt", 'w', encoding="utf-8") as text_file:
                text_file.write(text)

            if no_silence:
                file_path = remove_silence(audio_path, silence_threshold)
            else:
                file_path = convert_to_44100hz(audio_path)
            shutil.copy(file_path, f"{dataset_path}/{f_name}.wav")

            f.write(f"{f_name}.wav|{model_name}|{language_code_name}|{text}\n")

    # Rebuild the distributable folder: everything except the .txt transcripts.
    if os.path.exists(original_dataset_folder):
        shutil.rmtree(original_dataset_folder)
    os.makedirs(original_dataset_folder)
    for i in os.listdir(dataset_path):
        selected_file = f"{dataset_path}/{i}"
        if not selected_file.endswith(".txt"):
            shutil.copy(selected_file, original_dataset_folder)

    zip_path = f"{original_dataset_folder}.zip"
    make_zip(original_dataset_folder, zip_path)
    return zip_path
    
        

  checkpoint = torch.load(fp, map_location=device)
  from IPython.core.display import display


In [2]:
# Driver cell: configure one dataset build, run it, and report where the zip landed.
if __name__ == "__main__":
    # base_path is read as a module-level global by make_dataset, remove_silence
    # and convert_to_44100hz, so it has to be assigned before make_dataset runs.
    base_path="."
    # Folder holding the recorded .wav clips that will be transcribed.
    your_recorded_audio_folder = "./audio"
    voice_name = "Ronaldo"  # @param {type: "string"}
    language_code_name = "EN"  # @param ['EN', 'ES', 'FR', 'ZH','JA','KO']
    # False rebuilds the dataset from scratch; True appends to an existing one.
    update_dataset = False  # @param {type: "boolean"}
    # True routes every clip through remove_silence; False only resamples it.
    no_silence=True 
    # Passed to remove_silence as its silence_threshold argument (seconds).
    silence_threshold=0.1 
    zip_path=make_dataset(your_recorded_audio_folder,voice_name,language_code_name,update_dataset,no_silence,silence_threshold)
    # make_dataset returns a path built from base_path; resolve it for display.
    zip_full_path=os.path.abspath(zip_path)
    print(f"Dataset created at {zip_full_path}")

Dataset created at c:\Users\sanjib\Downloads\c\dataset\SANJI.zip
