In [1]:
import os
import soundfile as sf
from pydub import AudioSegment
from google.cloud import speech, texttospeech
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from helpers import read_audio_file, transcribe_audio, process_with_llm, convert_text_to_speech
import torch    

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())

True


In [3]:
# Initializeed GOOGLE API Client
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "serviceAcc.json"
speech_client = speech.SpeechClient()
tts_client = texttospeech.TextToSpeechClient()

# Initialized LLM
device = "cuda"
llm_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(llm_name, device=device, clean_up_tokenization_spaces=True)
llm = GPT2LMHeadModel.from_pretrained(llm_name).to(device)



In [4]:
# File paths
input_audio_file = "HelloWorld.wav"
output_audio_file = "output_audio/" 

In [5]:
# Read and transcribe audio
transcript = transcribe_audio(input_audio_file, speech_client)
print(f"Transcribed text: {transcript}")

Transcribed text: hello world


In [6]:
# Process transcribed text with local LLM
transcript_tokens = tokenizer.encode(transcript, return_tensors='pt').to(device)
processed_text_tokens = process_with_llm(transcript_tokens, llm, device)
processed_text = tokenizer.decode(processed_text_tokens[0], skip_special_tokens=True)
print(f"Processed text: {processed_text}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processed text: hello world.com/live-video-stream.htm The video below is provided as a stand alone video.

This episode is brought to you by the great home of the Vaping Party! Get out there and party on and on!


In [7]:
output_audio_path = output_audio_file + "output.wav"
convert_text_to_speech(processed_text, output_audio_path, tts_client, device)

Audio content written to file output_audio/output.wav


In [8]:
# Convert processed text back to audio
i = 0
for text in processed_text.split(" "):
    i += 1
    output_audio_path = output_audio_file + str(i) +".wav"
    convert_text_to_speech(text, output_audio_path, tts_client, device)

Audio content written to file output_audio/1.wav
Audio content written to file output_audio/2.wav


Audio content written to file output_audio/3.wav
Audio content written to file output_audio/4.wav
Audio content written to file output_audio/5.wav
Audio content written to file output_audio/6.wav
Audio content written to file output_audio/7.wav
Audio content written to file output_audio/8.wav
Audio content written to file output_audio/9.wav
Audio content written to file output_audio/10.wav
Audio content written to file output_audio/11.wav
Audio content written to file output_audio/12.wav
Audio content written to file output_audio/13.wav
Audio content written to file output_audio/14.wav
Audio content written to file output_audio/15.wav
Audio content written to file output_audio/16.wav
Audio content written to file output_audio/17.wav
Audio content written to file output_audio/18.wav
Audio content written to file output_audio/19.wav
Audio content written to file output_audio/20.wav
Audio content written to file output_audio/21.wav
Audio content written to file output_audio/22.wav
Audio c