In [2]:
from pydub import AudioSegment
import os

def split_audio(file_path, chunk_length_ms, output_dir="audio_chunks"):
    """
    Splits an audio file into chunks of the specified length and saves them in the output directory.
    
    Parameters:
        file_path (str): Path to the input audio file.
        chunk_length_ms (int): Length of each chunk in milliseconds.
        output_dir (str): Directory where the chunks will be saved.
    """
    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Load the audio file
    audio = AudioSegment.from_file(file_path)

    # Calculate the number of chunks
    total_length = len(audio)
    num_chunks = total_length // chunk_length_ms

    # Split and save chunks
    for i in range(num_chunks + 1):
        start_time = i * chunk_length_ms
        end_time = min(start_time + chunk_length_ms, total_length)
        chunk = audio[start_time:end_time]

        # Define the filename for the chunk
        chunk_filename = os.path.join(output_dir, f"chunk_{i + 1}.wav")
        chunk.export(chunk_filename, format="wav")
        print(f"Saved {chunk_filename}")

    print(f"Audio file has been split into {num_chunks + 1} chunks and saved in '{output_dir}'.")

# Example usage
audio_file = "output_audio.mp3"  # Replace with your audio file path
chunk_duration = 30000  # 30 seconds in milliseconds
split_audio(audio_file, chunk_duration)

Saved audio_chunks\chunk_1.wav
Saved audio_chunks\chunk_2.wav
Saved audio_chunks\chunk_3.wav
Saved audio_chunks\chunk_4.wav
Saved audio_chunks\chunk_5.wav
Saved audio_chunks\chunk_6.wav
Saved audio_chunks\chunk_7.wav
Saved audio_chunks\chunk_8.wav
Saved audio_chunks\chunk_9.wav
Saved audio_chunks\chunk_10.wav
Saved audio_chunks\chunk_11.wav
Audio file has been split into 11 chunks and saved in 'audio_chunks'.


In [3]:
import os
import csv
import speech_recognition as sr
from pydub import AudioSegment

def transcribe_audio_to_text(audio_path):
    """
    Transcribe audio file to text using Google's Web Speech API.
    Args:
        audio_path (str): Path to the audio file to be transcribed.
    Returns:
        str: Transcription of the audio.
    """
    recognizer = sr.Recognizer()

    # Convert audio to WAV if it's not already in WAV format
    if not audio_path.endswith(".wav"):
        audio = AudioSegment.from_file(audio_path)
        audio_path = audio_path.replace(os.path.splitext(audio_path)[1], ".wav")
        audio.export(audio_path, format="wav")

    # Load the audio file
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    # Use Google's Web Speech API to transcribe
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        print(f"Could not understand audio: {audio_path}")
        return ""
    except sr.RequestError:
        print("Could not request results from Google Speech Recognition service")
        return ""

def create_transcriptions_csv(audio_dir, output_csv="transcriptions.csv"):
    """
    Create a CSV file with transcriptions of all audio files in the given directory.
    Args:
        audio_dir (str): Directory containing the audio files to be transcribed.
        output_csv (str): Name of the output CSV file.
    """
    # Open CSV file for writing
    with open(output_csv, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["audio_file", "transcription"])

        # Iterate over all audio files in the directory
        for audio_file in os.listdir(audio_dir):
            if audio_file.endswith(('.wav', '.mp3', '.flac', '.ogg')):
                audio_path = os.path.join(audio_dir, audio_file)
                transcription = transcribe_audio_to_text(audio_path)

                # Write audio file name and transcription to the CSV file
                writer.writerow([audio_file, transcription])
                print(f"Processed {audio_file}: {transcription}")

    print(f"Transcriptions saved to {output_csv}")

# Example usage
audio_directory = "audio_chunks"  # Directory containing your audio files
create_transcriptions_csv(audio_directory)


Processed chunk_1.wav: we should know how to handle not only how to handle success how to handle the failures particularly you are in the management environment water it I want the young people to understand how to manage the failure because any task you do you have to come across problem problem should not become the captain of the
Processed chunk_10.wav: great human being sometime they will become better than you but better than the teacher so that opportunity to have I will have great you I will continuously acquire knowledge how I will do hard work I will Persevere and succeed
Could not understand audio: audio_chunks\chunk_11.wav
Processed chunk_11.wav: 
Processed chunk_2.wav: individual or a project chief the project chief should become the captain of the problems and defeat the problem and succeed learning use creativity creativity leads to thinking thinking provides knowledge makes you great
Processed chunk_3.wav: the those who directed Imagine The Impossible are the ones who br

In [1]:
import os
import espnet2
from espnet2.bin.tts_train import Text2Speech
from espnet2.tasks.tts import TTSTask
from espnet2.train import Trainer
from espnet_model_zoo.downloader import ModelDownloader

def prepare_training_data(audio_dir, metadata_file):
    """
    Prepare dataset for training.
    Args:
        audio_dir (str): Directory containing audio chunks.
        metadata_file (str): Path to the metadata file with transcriptions.
    Returns:
        dataset (list of tuples): List containing (audio_path, transcription).
    """
    dataset = []
    with open(metadata_file, 'r', encoding='utf-8') as f:
        for line in f:
            audio_file, transcription = line.strip().split(',')
            audio_path = os.path.join(audio_dir, audio_file)
            if os.path.exists(audio_path):
                dataset.append((audio_path, transcription))
    return dataset

def fine_tune_tts(pretrained_model, dataset, output_dir="fine_tuned_model", epochs=5, batch_size=8):
    """
    Fine-tune a pre-trained TTS model using custom audio and transcriptions.
    Args:
        pretrained_model (str): Path or identifier for the pre-trained model.
        dataset (list): List of (audio_path, transcription).
        output_dir (str): Directory to save the fine-tuned model.
        epochs (int): Number of training epochs.
        batch_size (int): Size of each batch for training.
    """
    downloader = ModelDownloader()
    model = downloader.download_and_unpack(pretrained_model)

    train_data = TTSTask.build_data_loader(dataset, batch_size=batch_size)

    # Initialize Trainer
    trainer = Trainer(model=model, output_dir=output_dir)

    # Start Fine-Tuning
    trainer.run(train_data, epochs=epochs)

# Example usage
audio_directory = "audio_chunks"  # Directory with your audio chunks
metadata_csv = "metadata.csv"     # Path to your metadata file
dataset = prepare_training_data(audio_directory, metadata_csv)

# Specify the pre-trained model name you want to fine-tune
pretrained_tts_model = "espnet/kan-bayashi_ljspeech_tacotron2"

# Fine-tune the TTS model
fine_tune_tts(pretrained_tts_model, dataset, epochs=10, batch_size=4)

ModuleNotFoundError: No module named 'espnet2'

In [4]:
pip install tensorflow==2.12.0


Note: you may need to restart the kernel to use updated packages.Collecting tensorflow==2.12.0
  Downloading tensorflow-2.12.0-cp311-cp311-win_amd64.whl.metadata (2.5 kB)
Collecting tensorflow-intel==2.12.0 (from tensorflow==2.12.0)
  Downloading tensorflow_intel-2.12.0-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=2.0 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.12.0->tensorflow==2.12.0)
  Downloadi

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
tables 3.8.0 requires cython>=0.29.21, which is not installed.


In [5]:
pip install TensorFlowTTS

Collecting TensorFlowTTS
  Using cached TensorFlowTTS-1.8-py3-none-any.whl.metadata (24 kB)
INFO: pip is looking at multiple versions of tensorflowtts to determine which version is compatible with other requirements. This could take a while.
  Using cached TensorFlowTTS-1.6.1-py3-none-any.whl.metadata (23 kB)
  Using cached TensorFlowTTS-1.6-py3-none-any.whl.metadata (23 kB)
  Using cached TensorFlowTTS-1.1-py3-none-any.whl.metadata (22 kB)
  Using cached TensorFlowTTS-0.11-py3-none-any.whl.metadata (22 kB)
Collecting tensorflow-gpu>=2.3.1 (from TensorFlowTTS)
  Using cached tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [44 lines of output]
  Traceback (most recent call last):
    File "C:\Users\NIHANTH\New folder\Lib\site-packages\setuptools\_vendor\packaging\requirements.py", line 35, in __init__
      parsed = parse_requirement(requirement_string)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "C:\Users\NIHANTH\New folder\Lib\site-packages\setuptools\_vendor\packaging\_parser.py", line 64, in parse_requirement
      return _parse_requirement(Tokenizer(source, rules=DEFAULT_RULES))
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "C:\Users\NIHANTH\New folder\Lib\site-packages\setuptools\_vendor\packaging\_parser.py", line 82, in _parse_requirement
      url, specifier, marker = _parse_requirement_details(tokenizer)
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    File "C:\Users\NIHANTH\New folder\Lib\site-packages\set

In [6]:
pip install git+https://github.com/TensorSpeech/TensorFlowTTS.git


Collecting git+https://github.com/TensorSpeech/TensorFlowTTS.gitNote: you may need to restart the kernel to use updated packages.

  Cloning https://github.com/TensorSpeech/TensorFlowTTS.git to c:\users\nihanth\appdata\local\temp\pip-req-build-__ub7nxm
  Resolved https://github.com/TensorSpeech/TensorFlowTTS.git to commit 136877136355c82d7ba474ceb7a8f133bd84767e
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
INFO: pip is looking at multiple versions of tensorflowtts to determine which version is compatible with other requirements. This could take a while.


  Running command git clone --filter=blob:none --quiet https://github.com/TensorSpeech/TensorFlowTTS.git 'C:\Users\NIHANTH\AppData\Local\Temp\pip-req-build-__ub7nxm'
ERROR: Could not find a version that satisfies the requirement tensorflow-gpu==2.7.0 (from tensorflowtts) (from versions: 2.12.0)
ERROR: No matching distribution found for tensorflow-gpu==2.7.0
