<a href="https://colab.research.google.com/github/Maru8735/Infosys-Live-Meeting-Summary/blob/main/audio_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy soundfile


In [None]:
import numpy as np
import soundfile as sf
from IPython.display import Audio, display

# ---- Tone parameters ----
duration = 3          # clip length in seconds
sample_rate = 16000   # samples per second (16 kHz)
frequency = 440       # tone pitch in Hz (A4); edit to taste

# ---- Build the waveform ----
# One time stamp per sample over [0, duration); the end point is excluded.
num_samples = int(sample_rate * duration)
t = np.linspace(0, duration, num_samples, endpoint=False)
phase = 2 * np.pi * frequency * t
audio_data = 0.5 * np.sin(phase)  # scaled to half amplitude

# ---- Write the WAV file ----
output_file = "generated_audio.wav"
sf.write(output_file, audio_data, sample_rate)

# ---- Report ----
print("Audio file created successfully!")
print("Saved as:", output_file)

# ---- Playback widget (starts playing automatically) ----
player = Audio(output_file, autoplay=True)
display(player)


In [None]:
!pip install soundfile mutagen


In [None]:
import numpy as np
import soundfile as sf
from IPython.display import Audio, display
from mutagen.wave import WAVE
from mutagen.id3 import TextFrame

# ====== SETTINGS ======
duration = 3        # seconds
sample_rate = 16000 # 16 kHz audio
frequency = 440     # Sine tone frequency in Hz

# ====== GENERATE SINE WAVE ======
# One time stamp per sample over [0, duration); the end point is excluded.
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
audio_data = 0.5 * np.sin(2 * np.pi * frequency * t)

# ====== SAVE AUDIO FILE ======
output_file = "generated_audio_with_info.wav"
sf.write(output_file, audio_data, sample_rate)

# ====== ADD METADATA ======
# mutagen's WAVE type keeps its metadata in an ID3v2 tag, so the values are
# stored with the standard ID3 frame classes. RIFF-INFO style keys such as
# "INAM"/"IART" assigned as bare TextFrame objects are not valid ID3 frames
# (see the mutagen ID3 documentation) and do not come back as title/artist
# when the file is re-opened.
from mutagen.id3 import COMM, TDRC, TIT2, TPE1

metadata = WAVE(output_file)
if metadata.tags is None:
    # A freshly written WAV has no ID3 chunk yet.
    metadata.add_tags()

metadata.tags.add(TIT2(encoding=3, text=["Sample Tone Audio"]))  # title
metadata.tags.add(TPE1(encoding=3, text=["Your Name"]))          # artist
metadata.tags.add(
    COMM(
        encoding=3,
        lang="eng",
        desc="",
        text=["This is a generated audio tone with embedded metadata."],
    )
)                                                                # comment
metadata.tags.add(TDRC(encoding=3, text=["2025-02-14"]))         # creation date
metadata.save()

# ====== PLAY AUDIO ======
print("Audio file created with metadata!\n")
display(Audio(output_file, autoplay=False))

# ====== READ AND PRINT METADATA ======
# Re-open the file so the printout shows what was actually written to disk,
# not just the in-memory objects created above.
print("=== Embedded Metadata in WAV File ===")
saved_tags = WAVE(output_file).tags
for key, value in (saved_tags or {}).items():
    print(f"{key}: {value.text[0] if hasattr(value, 'text') else value}")

In [None]:
!pip install gTTS soundfile # google text to speech


In [None]:
from gtts import gTTS
from IPython.display import Audio, display
import soundfile as sf
import numpy as np

# ---- Input sentence ----
text = "Hello! This is an automatically generated audio message created in Google Colab."

# ---- File names used below ----
mp3_path = "text_audio.mp3"
wav_path = "text_audio.wav"

# ---- Synthesize speech (gTTS writes an MP3) ----
tts = gTTS(text=text, lang='en')
tts.save(mp3_path)

# ---- Optional: re-encode the MP3 as WAV ----
# librosa decodes the MP3 and resamples to the requested 16 kHz rate;
# soundfile then writes the samples out as a WAV file.
import librosa
audio_data, sr = librosa.load(mp3_path, sr=16000)
sf.write(wav_path, audio_data, sr)

# ---- Report and show a player (no autoplay) ----
print("Text converted to speech and saved as 'text_audio.wav'")
player = Audio(wav_path, autoplay=False)
display(player)


In [None]:
!pip install sounddevice vosk faster-whisper soundfile
!apt-get install -y portaudio19-dev

In [None]:
from gtts import gTTS
from IPython.display import Audio

# Synthesize a short English sentence to use as input for the STT models.
test_mp3 = "test_audio.mp3"
tts = gTTS("Hello, this is a test audio for speech to text models.", lang="en")
tts.save(test_mp3)

# Last expression of the cell: renders an audio player for the new file.
Audio(test_mp3)


Convert the MP3 to a 16 kHz mono WAV file

In [None]:
!ffmpeg -i test_audio.mp3 -ar 16000 -ac 1 test.wav -y


Transcribe with Vosk

In [None]:
!wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip -q vosk-model-small-en-us-0.15.zip

from vosk import Model, KaldiRecognizer
import wave, json

wf = wave.open("test.wav", "rb")
rec = KaldiRecognizer(Model("vosk-model-small-en-us-0.15"), 16000)

result = ""
while True:
    data = wf.readframes(4000)
    if not data:
        break
    if rec.AcceptWaveform(data):
        result += json.loads(rec.Result())["text"] + " "

result += json.loads(rec.FinalResult())["text"]
print("VOSK:", result)


RUN IN WHISPER


In [None]:
!pip install -q faster-whisper

from faster_whisper import WhisperModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = WhisperModel("small", device=device)

segments, info = model.transcribe("test.wav")
text = " ".join([s.text for s in segments])

print("WHISPER:", text)


In [None]:
# ================================
# Single-Cell STT Pipeline (Colab)
# TTS → WAV → Vosk → Whisper
# ================================

# Install dependencies
!pip install -q gTTS vosk faster-whisper soundfile sentencepiece
!apt-get -qq install -y ffmpeg

from gtts import gTTS
from IPython.display import Audio
import subprocess, wave, json, os
import torch

# 1) Generate Speech Audio using TTS
text_input = "Hello, this is an automatic speech recognition test using Vosk and Whisper."
tts = gTTS(text_input, lang="en")
tts.save("tts.mp3")
print("Generated audio from text:", text_input)
Audio("tts.mp3")

# 2) Convert MP3 → WAV (16 kHz mono)
subprocess.run(["ffmpeg", "-y", "-i", "tts.mp3", "-ar", "16000", "-ac", "1", "audio.wav"],
               stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
print("Converted to audio.wav (16k mono)")

# 3) --- VOSK STT ---
if not os.path.exists("vosk-model"):
    !wget -q https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip -O model.zip
    !unzip -q model.zip
    !mv vosk-model-small-en-us-0.15 vosk-model
    !rm model.zip

from vosk import Model, KaldiRecognizer

wf = wave.open("audio.wav", "rb")
rec = KaldiRecognizer(Model("vosk-model"), 16000)

vosk_text = ""
while True:
    data = wf.readframes(4000)
    if not data: break
    if rec.AcceptWaveform(data):
        vosk_text += json.loads(rec.Result())["text"] + " "
vosk_text += json.loads(rec.FinalResult())["text"]

# 4) --- WHISPER STT ---
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
whisper = WhisperModel("small", device=device)

segments, _ = whisper.transcribe("audio.wav")
whisper_text = " ".join([s.text for s in segments]).strip()

# 5) Results
print("\n===== VOSK OUTPUT =====")
print(vosk_text)

print("\n===== WHISPER OUTPUT =====")
print(whisper_text)

print("\n===== ORIGINAL TEXT =====")
print(text_input)


DOWNLOAD AMI SAMPLE


In [None]:
!mkdir -p ami_sample
!wget -q https://groups.inf.ed.ac.uk/ami/AMICorpusSamples/ES2002a.Mix-Headset.wav -O ami_sample/ES2002a.wav

print("Downloaded AMI sample file:")
!ls -lh ami_sample


Load the AMI sample audio

In [None]:
from IPython.display import Audio

# Render an audio player for the AMI recording saved by the download cell.
ami_clip = "ami_sample/ES2002a.wav"
Audio(ami_clip)


Download a public speech dataset (LibriSpeech dev-clean subset)

In [None]:
!mkdir -p librispeech_sample
!wget -q https://www.openslr.org/resources/12/dev-clean.tar.gz -O dev-clean.tar.gz
!tar -xzf dev-clean.tar.gz --directory librispeech_sample --wildcards "*.flac"

print("Sample files:")
!find librispeech_sample -name "*.flac" | head


In [None]:
!mkdir -p librispeech_sample
!wget -q https://www.openslr.org/resources/12/dev-clean.tar.gz -O dev-clean.tar.gz
!tar -xzf dev-clean.tar.gz --directory librispeech_sample --wildcards "*.flac"

print("Sample files:")
!find librispeech_sample -name "*.flac" | head


Convert dataset audio (FLAC → WAV, 16k mono)

In [None]:
import subprocess, os

# Convert one LibriSpeech utterance to a 16 kHz mono WAV file.
source = "librispeech_sample/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac"
target = "sample.wav"

# check=True raises CalledProcessError when ffmpeg exits with an error (for
# example if the source file is missing), so the success message below is
# only printed after a real conversion.
subprocess.run(["ffmpeg", "-y", "-i", source, "-ar", "16000", "-ac", "1", target], check=True)

print("Converted to:", target)



Create a synthetic dataset automatically

In [None]:
!pip install gTTS soundfile
from gtts import gTTS
import os

os.makedirs("synthetic_dataset", exist_ok=True)

sentences = [
    "Hello, welcome to the speech recognition test.",
    "This is a synthetic dataset created using text to speech.",
    "Speech models must be evaluated for accuracy.",
    "Different speakers and accents should be tested.",
    "Background noise can affect transcription quality.",
    "We will benchmark whisper and vosk models.",
    "This sentence is intentionally longer to test robustness.",
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming industries.",
    "Thank you for participating in this project."
]

for i, text in enumerate(sentences):
    tts = gTTS(text=text, lang="en")
    path = f"synthetic_dataset/audio_{i}.mp3"
    tts.save(path)

print("Synthetic dataset created:")
!ls -1 synthetic_dataset



Convert every text file in a folder to audio

In [None]:
!pip install gTTS soundfile
from gtts import gTTS
import os

input_folder = "text_files"
output_folder = "audio_output"

os.makedirs(output_folder, exist_ok=True)

text_files = [f for f in os.listdir(input_folder) if f.endswith(".txt")]

for fname in text_files:
    file_path = os.path.join(input_folder, fname)

    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read().strip()

    if len(text) == 0:
        print(f"Skipping empty file: {fname}")
        continue

    tts = gTTS(text=text, lang="en")

    base_name = os.path.splitext(fname)[0]
    output_path = os.path.join(output_folder, base_name + ".mp3")

    tts.save(output_path)
    print("Created:", output_path)

print("\n All text files converted to audio!")


Convert synthetic audios to WAV


In [None]:
import subprocess
import os

files = os.listdir("synthetic_dataset")

for f in files:
    if f.endswith(".mp3"):
        mp3_path = f"synthetic_dataset/{f}"
        wav_path = mp3_path.replace(".mp3", ".wav")
        subprocess.run(["ffmpeg", "-y", "-i", mp3_path, "-ar", "16000", "-ac", "1", wav_path])

print("Converted WAV files:")
!ls synthetic_dataset/*.wav


Convert a long text file to an audio file

In [None]:
!pip install gTTS
from gtts import gTTS
text=open("text.txt").read()
tts=gTTS(text=text,lang="en")
tts.save("speech.mp3")

In [None]:
!pip install TTS

In [None]:
from TTS.api import TTS

# Load the English Glow-TTS model trained on LJSpeech and keep it on the CPU.
# NOTE(review): the original comment, the spoken sentence and the output file
# name all say "male", but LJSpeech is usually described as a single female
# speaker. Confirm this model gives the intended voice.
model_name = "tts_models/en/ljspeech/glow-tts"
tts = TTS(model_name).to("cpu")

# Synthesize the sentence straight to a WAV file.
text = "Hello, this is an example of a male voice."
tts.tts_to_file(text=text, file_path="male_voice.wav")

In [None]:
!pip install pydub