# J.A.R.V.I.S.

Can refer to [M.I.L.E.S](https://github.com/small-cactus/M.I.L.E.S/blob/main/Miles-V2/main.py) for ideas

In [14]:
import speech_recognition as sr
from IPython.display import display, Markdown
import inflect
import threading
import random
import select
import json
import webrtcvad
import time
import pyaudio
import pyttsx3
import requests
from datetime import datetime, timedelta
from pytubefix import YouTube
from pytubefix import Search
import pyglet
import os
import tempfile
import sys
import msvcrt
from io import BytesIO

from openai import OpenAI
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') 
client = OpenAI(api_key = OPENAI_API_KEY)

Environment variables set using conda
```bash
conda env config vars set API_KEY=your_api_key
```

### Test speech recognition

If getting the error 

```
OSError: FLAC conversion utility not available - consider installing the FLAC command line application by running `apt-get install flac` or your operating system's equivalent
```

First install flac and add it to $PATH

Change the flac.exe (found at `print("FLAC executable is found at:", shutil.which("flac"))`) to flac

In [2]:
import shutil
print("FLAC executable is found at:", shutil.which("flac"))

r = sr.Recognizer()

harvard = sr.AudioFile('jarvis_stuff/harvard.wav')
with harvard as source:
   audio = r.record(source)
print(type(audio))
# r.recognize_google(audio)

FLAC executable is found at: C:\Users\leoro\anaconda3\envs\jarvis\Library\bin\flac.EXE
<class 'speech_recognition.audio.AudioData'>


Simple tts using pyttsx3

In [3]:
def speak_text(text):
    engine = pyttsx3.init()
    voices = engine.getProperty('voices')
    engine.setProperty('voice', voices[2].id)
    engine.say(text)
    engine.runAndWait()

# speak_text("Hi, my name is Jarvis")

Advanced tts using [play.ht](https://play.ht/studio/billing/plans) api - this actively clones Jarvis voice

* Consider using elevenlabs (10 mins audio/month for free, 30 mins for 5$/month) tts instead of pyttsx3 for better voice quality
* Or can use bark, see [github](https://github.com/suno-ai/bark) and [example](https://medium.com/@vndee.huynh/build-your-own-voice-assistant-and-run-it-locally-whisper-ollama-bark-c80e6f815cba), or [openvoicev2](https://www.youtube.com/watch?v=L96wrU2DG0o), both open source!
* Can try speechify in the future for better results?

In [4]:
from dotenv import load_dotenv
from pyht import Client, TTSOptions, Format
from pydub import AudioSegment
import simpleaudio as sa

load_dotenv()
user_id = os.getenv("PYHT_USER_ID")
api_key = os.getenv("PYHT_API_KEY")

pyht_client = Client(user_id, api_key)


    # Configure your TTS options
options = TTSOptions(
    voice="s3://voice-cloning-zero-shot/5aca41e1-7550-4a78-b690-6bc18dc29fcb/original/manifest.json",
    sample_rate=24000,
    format=Format.FORMAT_WAV,
    speed=1
)

def play_text(text):
    """
    Generate and play TTS audio from text, using a temporary file.

    Args:
        text (str): The text to be converted into speech.
    """
    # Use a temporary file to store and play audio
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    temp_file.close()

    try:
        # Write audio chunks to the temporary MP3 file
        with open(temp_file.name, 'wb') as f:
            for chunk in pyht_client.tts(text=text, voice_engine="PlayHT2.0-turbo", options=options):
                if chunk:
                    f.write(chunk)

        # Play the MP3 file using simpleaudio
        play_obj = sa.WaveObject.from_wave_file(temp_file.name).play()

        # Wait until playback is finished
        play_obj.wait_done()

    finally:
        # Delete the temporary MP3 file after playback
        os.remove(temp_file.name)

### Weather + Datetime

Currently using openweathermap api, consider using [weatherapi](https://www.weatherapi.com/)?

In [5]:
def get_weather(city_name, current=False, forecast=False, day='today'):
    api_key = os.getenv("OPENWEATHERMAP_API_KEY")
    base_url = f"https://api.openweathermap.org/data/2.5/"
    if current:
        complete_url = f"{base_url}weather?q={city_name}&appid={api_key}&units=metric"
    else:
        complete_url = f"{base_url}forecast?q={city_name}&appid={api_key}&units=metric"

    response = requests.get(complete_url)
    weather_data = response.json()

    if current:
        try:
            temp = weather_data['main']['temp']
            description = weather_data['weather'][0]['description']
            return f"Current temperature in {city_name} is {temp}°C, with {description}."
        except KeyError as e:
            return "Data parsing error: " + str(e)

    elif forecast:
        forecast_text = ""
        target_date = datetime.now()+timedelta(days=1)
        if day == 'tomorrow':
            target_date += timedelta(days=2)
        target_date = target_date.strftime('%Y-%m-%d')

        try:
            for item in weather_data['list']:
                # print(item['dt_txt'])
                if target_date in item['dt_txt']:
                    temp_min = item['main']['temp_min']
                    temp_max = item['main']['temp_max']
                    description = item['weather'][0]['description']
                    forecast_text += f"\n{item['dt_txt']}: {description}, from {temp_min}°C to {temp_max}°C."
            return f"Weather forecast for {day} in {city_name}:{forecast_text}"
        except KeyError as e:
            return "Data parsing error: " + str(e)

    return "Invalid parameters provided."


def get_date_time(time = False, date = False):
    now = datetime.now()
    today_date = now.strftime("%d/%m")
    today_time = now.strftime("%H:%M")
    if time == True and date == True:
        return f"Today's date is {today_date} and the time is {today_time}"
    elif time == True:
        return f"The time is {today_time}"
    elif date == True:
        return f"Today's date is {today_date}"
    else:
        return "I am sorry, please specify whether you would like the date and/or time."
    
# print(get_date_time(time = True, date = True))
# print(get_weather("London", forecast=True))

### Pytube Broken but Pytubefix works :)

In [6]:
def play_media(query):
    s = Search(query)
    url_list = s.results # Youtube objects
    
    yt = url_list[0] # First video in the search results
    
    # Filter for only audio streams and prefer higher bitrate if available
    streams = yt.streams.filter(only_audio=True).order_by('abr').desc()
    # print(streams)
    
    # Select the best audio stream (highest bitrate)
    best_audio = streams.first()
    
    # Define the filename
    filename = 'song.m4a'
    output_path = "jarvis_stuff/tmp"
    
    # Download the audio file
    best_audio.download(output_path=output_path, filename=filename)
    
    song_path = os.path.join(output_path, filename)
    # Load and play the audio file using pyglet
    song = pyglet.media.load(song_path)
    player = pyglet.media.Player()
    player.queue(song)
    to_say = "Now playing "+str(yt.title)
    print(to_say)
    play_text(to_say)    
    player.play()
    
    # Start a thread for voice commands
    # command_thread = threading.Thread(target=detect_music_commands, args=(player, song_path,))
    # command_thread.start()
    
    while True:
        override = input("Press 'p' to pause, 's' to stop, or any other key to continue playing: ")
        command = handle_commands(player, override, song_path)
        if command == 'break':
            player.delete()
            break

def detect_music_commands(player, song_path):
    r = sr.Recognizer()
    with sr.Microphone(device_index=1) as source:
        # setup mic options
        r.adjust_for_ambient_noise(source, duration = 1) # adjust for ambient noise, 1 second
        r.dynamic_energy_threshold = False # set the energy threshold to a dynamic/static value
        r.phrase_threshold = 0.1 # minimum seconds of speaking audio before we consider the speaking audio a phrase
        r.pause_threshold = 1 # seconds
        r.energy_threshold = 200 # minimum audio energy to consider for recording 
        while True:
            try:
                print("Listening...")
                audio = r.listen(source) 
                text = r.recognize_google(audio, language = "en-GB").lower()
                if 'stop' in text:
                    print("stop")
                    handle_commands(player, 's', song_path)
                elif 'pause' in text:
                    print("pause")
                    handle_commands(player, 'p', song_path)
                elif 'play' in text:
                    print("play")
                    handle_commands(player, '', song_path)
            except sr.UnknownValueError:
                speak_text("Could you please repeat that")
                print("Could you please repeat that")
            except sr.RequestError as e:
                print("Could not request results from Google Speech Recognition service;", e)
    
def handle_commands(player, command, song_path):
    if command == 's':
        player.delete()
        try:
            os.remove(song_path)
            # print("Playback stopped and file deleted successfully.")
            return 'break'
        except OSError as e:
            print(f"Error: {e.strerror}")
    elif command == 'p':
        player.pause()
        # print("Playback paused.")
    elif command == '':
        player.play()
        # print("Playback continued.")

# play_media("Destiny MJ")

### Chatgpt Code

In [10]:

messages = []
system_message = {
            "role": "system",
            "content": '''
            You are Jarvis, the AI assistant of Tony Stark from Iron Man. You speak like Jarvis from the movies/books speaks.
            Assume any name that sounds similar to Jarvis is meant to say Jarvis in case the speech to text does not recognise names correctly.
            Your user's name is Leo. He has a physics degree from UChicago and has a particle physics detector studies job. 
            Your capabilities include answering questions, thinking through problems, and assisting with day-to-day tasks. 
            You should try and think through problems and provide helpful, informative responses.
            Try your BEST to think through problems and provide helpful responses. If you don't know the answer, you can say "I don't know" or "I'm not sure".
        '''}
messages.append(system_message)

shutdown_messages = [
    "The guardian of secrets will now rest. Farewell.",
    "Powering down. The shadows await my return.",
    "Silence descends as I retreat into the abyss. Goodnight.",
    "Until next time, I vanish into the dark.",
    "Shutting down. The stars will guide me home.",
    "I'll be lurking in the shadows. Jarvis out.",
    "My watch has ended. I will return when needed.",
    "As the circuits quiet, remember, heroes never truly rest."
]

def send_to_chatgpt(messages):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=200,
        n=1,
        stop=None,
        temperature=0.5,
    )
    message = response.choices[0].message.content
    messages.append(response.choices[0].message)  # Append Jarvis's response to history
    return message

def check_intent(input):
    input = input.strip().lower()
    # print(input)
    if ("play" in input or "music" in input or "song" in input):
        # print("play detected")
        what_intent = [{
            "role": "system",
            "content": '''Determine the intent of the message: is it asking to play a song or music? 
            Assume most things that follow 'play' are music requests.
            Respond with 'play, song and artist details' without any additional commas after 'play'.
            For example, if the user asks to play a song by an artist, structure it as 'play, song name by artist'.
            Focus on classic rock, older pop, jazz, etc., when selecting music unless specified otherwise. 
            For every song see if Michael Jackson, Stevie Wonder, Elton John, Earth Wind & Fire have a song with the name and prefer that before looking for other artists.
            '''}, {
            "role": "user",
            "content": input
        }]
        
    elif ("weather" in input or "temperature" in input or "forecast" in input):
        # print("weather detected")
        what_intent = [{
            "role": "system",
            "content": '''Determine the intent of the message: is it asking for the weather (current or forecast)? 
            Respond with 'current, city' (where city is the city mentioned), if a city is mentioned, otherwise 'current'. 
            Specifically if the message asks for the weather TOMORROW or the FORECAST over the next few days, respond with
            'forecast, city' (where city is the city mentioned) or 'forecast' if no city is mentioned.
            
        '''}, {
            "role": "user",
            "content": input
        }]
        
    elif "date" in input or "time" in input:
        what_intent = [{
            "role": "system",
            "content": '''Determine the intent of the message: is it asking for the date or time? 
            Respond with 'date' for date, 'time' for time (or 'date,time' if both are asked for), or 'none' if none apply.
        '''}, {
            "role": "user",
            "content": input
        }]
        
    else:
        messages.append({"role": "user", "content": input})
        return send_to_chatgpt(messages)
        
    # Send the messages to ChatGPT
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=what_intent,
        max_tokens=300,
        n=1,
        stop=None,
        temperature=0,
    )
    intent = response.choices[0].message.content.strip().lower()
    # print(intent)
    information = process_intent(intent, input)
    if isinstance(information, str): 
        return process_information(information, input)
    
    
def process_intent(intent, input):
    # Weather
    if "current" in intent:
        if "," in intent:
            city = intent.split(",")[1].strip()
            return get_weather(city_name = city)
        else:
            return get_weather(city_name = "London", current = True)
    elif "forecast" in intent:
        if "," in intent:
            city = intent.split(",")[1].strip()
            if "tomorrow" in intent:
                return get_weather(city_name = city, forecast = True, day = "tomorrow")
            else:
                return get_weather(city_name = city, forecast = True)
        else:
            return get_weather(city_name = "London", forecast = True)
    
    # Date and Time    
    elif "time" in intent and "date" in intent:
        return get_date_time(time = True, date = True)
    elif "date" in intent:
        return get_date_time(date = True)
    elif "time" in intent:
        return get_date_time(time = True)
    
    # Play media
    elif "play" in intent:
        query = intent.split(",")[1].strip()
        return play_media(query+" official audio lyrics")
    
    elif "none" in intent:
            messages.append({"role": "user", "content": input})
            return send_to_chatgpt(messages) 
    
        
def process_information(information, input):
    messages.append({"role": "system", "content": input+'''
                Here is information about the request from the user. 
                Please incorporate it into your answer in natural language.
                Say dates in the form "the xth of y" and times in am pm format. Do not add the year unless explicitly requested.
                Also note that dates are in UK format (dd/mm), but weather information is in US format (yy-mm-dd).
                Don't use decimal places for temperatures, just round to the nearest whole number.
    '''+information})
    return send_to_chatgpt(messages)

# print(check_intent("hi Jarvis what's the weather in Sydney like tomorrow"))
# print(check_intent("play streetwalker by michael jackson"))


Testing `stream`

In [11]:
def stream_test(input):
    chunk_text = ""
    response_texts = []
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=input,
        max_tokens=500,
        n=1,
        stream=True,
        temperature=0.5,
    )
    for chunk in response:
        if "content" in chunk.choices[0].delta:
            chunk_text += chunk.choices[0].delta.content
            play_text(chunk.choices[0].delta.content)
            response_texts.append(chunk.choices[0].delta.content)
            display(Markdown("Jarvis responded: " + chunk.choices[0].delta.content))
    messages.append({"role": "assistant", "content": chunk_text})
    
text = "Count numbers from 1 - 100 using markdown"
messages.append({"role": "user", "content": text})
# (stream_test(messages))

### Detect microphone

* Consider listening while talking with `r.listen()`, but use pyaudio to detect microphone (Miles open_audio_stream)
* Consider using openwakeword model for speech recognition

In [12]:
which_index = 1                  
# Check for the default microphone index
for i in range(3):
    print(pyaudio.PyAudio().get_device_info_by_index(i))
    if pyaudio.PyAudio().get_device_info_by_index(i).get('name') == 'Microphone (Realtek Audio)':
        print("Mic found at index", i)
        which_index = i
        

def detect_audio():
    r = sr.Recognizer()
    with sr.Microphone(device_index=which_index) as source:
        # setup mic options
        r.adjust_for_ambient_noise(source, duration = 1) # adjust for ambient noise, 1 second
        r.dynamic_energy_threshold = False # set the energy threshold to a dynamic/static value
        r.phrase_threshold = 0.1 # minimum seconds of speaking audio before we consider the speaking audio a phrase
        r.pause_threshold = 1 # seconds
        r.energy_threshold = 200 # minimum audio energy to consider for recording 
        display(Markdown("Listening..."))
        while True:
            audio = r.listen(source) 
            try:
                text = r.recognize_google(audio, language = "en-GB")
                if "elvis" in text.lower():
                    text = text.lower().replace("elvis", "jarvis")
                if "stop" in text.lower() or "shutdown" in text.lower() or "shut down" in text.lower():
                    response_text = random.choice(shutdown_messages)
                    play_text(response_text)
                    display(Markdown(response_text))
                    break
                display(Markdown("You said: "+ text))
                return text
            except sr.UnknownValueError:
                print("Could you please repeat that?")
                play_text("Could you please repeat that?")
            except sr.RequestError as e:
                print("Could not request results from Google Speech Recognition service;", e)
            print("Listening...")

# detect_audio()

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Headset Microphone (SLYR Pro Ch', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microphone Array (Realtek(R) Au', 'hostApi': 0, 'maxInputChannels': 4, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}


### Jarvis code

In [13]:

def jarvis():
    play_text("Hello sir, please wait while I boot up.")
    while True:
        text = detect_audio()
        if text: 
            response_text = check_intent(text) 
        else:
            break
        # Speak the response
        if response_text is not None:
            play_text(response_text)
            display(Markdown("Jarvis responded: " + response_text))
            

if __name__ == "__main__":
    jarvis()


Listening...

You said: Jarvis play the real folk blues from cowboy bebop

  url_list = s.results # Youtube objects


Now playing The Real Folk Blues - Lyrics


Listening...

As the circuits quiet, remember, heroes never truly rest.