# Get LLM Model

In [1]:
from characterai import PyCAI

import os

# The access token is a credential, so it is read from the environment rather
# than being embedded in the notebook. Any token that was previously committed
# in this cell should be treated as leaked and regenerated.
token = os.environ.get('CHARACTERAI_TOKEN')
if not token:
    raise RuntimeError(
        "Set the CHARACTERAI_TOKEN environment variable before running this notebook"
    )

client = PyCAI(token)

# Identifier passed to get_chat to fetch the conversation used below.
char = 'YntB_ZeqRq2l_aVf2gWDCZl4oBttQzDvhj9cXafWcF8'

chat = client.chat.get_chat(char)

participants = chat['participants']

# `tgt` is the username of the non-human participant; only the first two
# participants are inspected.
if not participants[0]['is_human']:
    tgt = participants[0]['user']['username']
else:
    tgt = participants[1]['user']['username']

# STT (Speech-to-Text)

In [2]:
import speech_recognition as sr

# Shared audio objects, created once and handed to speech_to_text on every turn.
recognizer = sr.Recognizer()
microphone = sr.Microphone()

def speech_to_text(microphone, recognizer):
    """Record one utterance from the microphone and transcribe it.

    Args:
        microphone: Audio source usable as a context manager (an
            ``sr.Microphone`` in this notebook).
        recognizer: Object providing ``adjust_for_ambient_noise``, ``listen``
            and ``recognize_google`` (an ``sr.Recognizer`` in this notebook).

    Returns:
        The recognized text, or ``None`` when the audio could not be
        understood or the recognition request failed. In both failure cases a
        message is printed instead of raising.
    """
    # Calibrate and capture within a single session so the audio device is
    # opened only once per call.
    with microphone as source:
        recognizer.adjust_for_ambient_noise(source)
        print("Listening...")
        audio = recognizer.listen(source)

    # Recognition runs after the microphone has been released.
    try:
        return recognizer.recognize_google(audio)

    except sr.UnknownValueError:
        print("Speech Recognition could not understand audio")

    except sr.RequestError as e:
        print("Could not request results from Google Speech Recognition service; {0}".format(e))

    # Explicit so callers know they must handle the "nothing recognized" case.
    return None

# TTS (Text-to-Speech)

In [3]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
from playsound import playsound
import torch
import random
import string
import soundfile as sf

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Processor that turns input text into model inputs.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model, moved onto the selected device.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

# Vocoder that is later passed to model.generate_speech, on the same device.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# Dataset whose "xvector" entries serve as speaker embeddings.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)


In [5]:
# Speaker name -> index into the embeddings dataset.
speakers = dict(
    awb=0,     # Scottish male
    bdl=1138,  # US male
    clb=2271,  # US female
    jmk=3403,  # Canadian male
    ksp=4535,  # Indian male
    rms=5667,  # US male
    slt=6799,  # US female
)


In [6]:
def save_text_to_speech(text, speaker=None):
    """Synthesize `text` to speech and write it to ``../Cache/audio.mp3``.

    Args:
        text: The text to speak.
        speaker: Index into ``embeddings_dataset`` whose ``"xvector"`` is used
            as the speaker embedding (see the ``speakers`` mapping). When
            ``None``, a random 512-dimensional embedding is used instead,
            which gives a random voice.

    Returns:
        The path of the written audio file. The path is the same on every
        call, so each call overwrites the previous output.
    """
    # Imported locally because the notebook only imports os in a later cell.
    import os

    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Single fixed output path: the playback loop reads and then deletes this file.
    output_filename = "../Cache/audio.mp3"
    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(output_filename), exist_ok=True)
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename


In [7]:
from mutagen.mp3 import MP3
import math

def floored_audio_duration(audio_directory: str) -> int:
    """Return the length of an MP3 file in whole seconds, rounded down.

    Args:
        audio_directory: Path to the MP3 file to inspect. Despite the name,
            this is the file path itself, not its containing directory.

    Returns:
        ``math.floor`` of the length reported by the file's MP3 info.
    """
    # Read the file the caller asked for rather than a fixed path.
    audio = MP3(audio_directory)
    return math.floor(audio.info.length)

In [8]:
import os, time
# Voice chat loop: listen, send the transcript to the chat, speak the reply.
# Runs until the cell is interrupted.
while True:

    message = speech_to_text(microphone, recognizer)

    # speech_to_text yields None when nothing was recognized; skip the turn
    # instead of sending an empty message to the chat.
    if not message:
        continue

    print(f'You: {message}')

    data = client.chat.send_message(
        chat['external_id'], tgt, message
    )

    name = data['src_char']['participant']['name']
    text = data['replies'][0]['text']

    # TTS Function: use the path it returns rather than repeating the literal.
    audio_path = save_text_to_speech(text, speaker=speakers["slt"])

    duration = floored_audio_duration(audio_path)

    # Clips shorter than 3 s wait their full length; longer ones wait 3 s less.
    sleep_duration = duration if duration < 3 else duration - 3

    time.sleep(1)
    playsound(audio_path)
    print(f'{name}: {text}')
    time.sleep(sleep_duration)
    # Remove the clip so the next turn starts from a clean cache.
    os.remove(audio_path)

# To get your token, run JSON.parse(localStorage.getItem('char_token')).value in the browser's developer console.

Listening...
You: hello brother
Character Assistant: Greetings, you are a very fine individual! How are you doing?
Listening...


KeyboardInterrupt: 