# Voice Assistant

In [None]:
import speech_recognition as sr
import pyttsx3
# Text-to-speech engine; the assistant loop below uses it to speak each answer.
engine = pyttsx3.init()

from langchain_ollama import ChatOllama
# Chat model client for the 'mistral' model.
# NOTE(review): confirm that ChatOllama actually honours a `quantization`
# keyword; it may be accepted but ignored. TODO verify.
llm = ChatOllama(model='mistral', quantization='1bit')

# Voice assistant loop: listen on the microphone, transcribe the speech,
# ask the LLM for a concise answer, then print it and read it aloud.
# The loop ends when the user says "bye-bye" or interrupts the kernel.
recognizer = sr.Recognizer()
with sr.Microphone() as source:
    print("Adjusting for ambient noise... Please wait.")
    # Calibrate against background noise once, before the first utterance.
    recognizer.adjust_for_ambient_noise(source)
    while True:
        try:
            print("Listening... Speak now!")
            # Listen and capture the audio
            audio = recognizer.listen(source)
            print("Processing audio...")

            # Debug aid: uncomment to dump the captured audio to disk.
            # with open('command.wav', 'wb') as f:
            #     f.write(audio.get_wav_data())

            # Recognize speech using Google's Web Speech API
            text = recognizer.recognize_google(audio)
            print("You said:", text)

            # Compare case-insensitively and ignore surrounding whitespace so
            # that "Bye-bye" ends the session just like "bye-bye".
            if text.strip().lower() == 'bye-bye':
                break

            response = llm.invoke('Answer the following question concisely: ' + text).content.strip()
            print("response: " + response)
            engine.say(response)
            engine.runAndWait()

        except sr.UnknownValueError:
            # Speech was captured but could not be transcribed; try again.
            print("Sorry, I could not understand the audio.")
        except sr.RequestError as e:
            # The recognition service could not be reached; try again.
            print(f"Could not request results from the service; {e}")
        except KeyboardInterrupt:
            print("Stopped by user.")
            # Without this break the loop would swallow the interrupt and
            # immediately start listening again.
            break

Adjusting for ambient noise... Please wait.
Listening... Speak now!
Processing audio...
You said: bye-bye


In [14]:
from langchain_ollama import ChatOllama
# Build a ChatOllama client with the same arguments as the first cell and send
# it a single test question.
llm = ChatOllama(model='mistral', quantization='1bit')
# Last expression of the cell: the reply text with surrounding whitespace removed.
llm.invoke('what is the capital of Dutch?').content.strip()

'The capital of the Netherlands, also known as Holland (although it is actually two provinces of the country), is Amsterdam. However, it\'s important to note that "Dutch" refers to the language spoken in the Netherlands and some parts of Germany, Belgium, and Suriname. Therefore, there is no capital for Dutch as a language.'

In [24]:
# Few-shot prompt: several "site name: URL" examples followed by a slot
# ({webName}) for the site whose URL the model should supply.
prompt = """Your mission is to give the original online URL of websites. For example:
Youtube: https://www.youtube.com/
Messenger: https://www.messenger.com/
ChatGPT: https://chatgpt.com/
Google Translate: https://translate.google.com/
Gmail: https://mail.google.com/
Zalo: https://chat.zalo.me/
The original URL of {webName} is: 
"""
# `llm` is not defined in this cell; it must already exist from an earlier cell.
# NOTE(review): the URL comes from the model and is not validated here.
llm.invoke(prompt.format(webName = 'nettruyen')).content.strip()

'The original URL for NetTruyen (Net Read) is: https://nettruyen.vn/'

In [None]:
from selenium import webdriver
import time

# Start the browser and open YouTube Shorts
driver = webdriver.Chrome()
ut_short_path = "https://www.youtube.com/shorts"
driver.get(ut_short_path)
driver.maximize_window()
time.sleep(3)  # Wait for the page to load (fixed delay, not an explicit readiness check)

# Close the browser
# The call below is commented out, so the browser window stays open after this
# cell finishes; run driver.quit() manually when done.
# driver.quit()

In [35]:
# Take a screenshot of the entire screen and save it under a timestamped name.
from PIL import ImageGrab
import time

# Capture the entire screen
screenshot = ImageGrab.grab()

print(screenshot)

# Build the file name once. Calling time.time_ns() separately for saving and
# for the message would give two different timestamps, so the reported name
# would not match the file written to disk.
screenshot_path = f"screenshot_{time.time_ns()}.png"

# Save the screenshot
screenshot.save(screenshot_path)

# Report the name of the file that was actually written
print(f"Screenshot saved as '{screenshot_path}'")

<PIL.Image.Image image mode=RGB size=1920x1080 at 0x29E770F2690>
Screenshot saved as 'screenshot_1733068883541383700.png'


In [None]:
import numpy as np
# Load the note table from disk; .item() unwraps the stored object so that
# `note_frequencies` can be indexed by note name (e.g. 'C4').
# NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
# objects, so only load files from a trusted source.
note_frequencies = np.load('note_frequencies.npy', allow_pickle=True).item()
print(note_frequencies)

{'C0': 16.35, 'C#0': 17.32, 'Db0': 17.32, 'D0': 18.35, 'D#0': 19.45, 'Eb0': 19.45, 'E0': 20.6, 'F0': 21.83, 'F#0': 23.12, 'Gb0': 23.12, 'G0': 24.5, 'G#0': 25.96, 'Ab0': 25.96, 'A0': 27.5, 'A#0': 29.14, 'Bb0': 29.14, 'B0': 30.87, 'C1': 32.7, 'C#1': 34.65, 'Db1': 34.65, 'D1': 36.71, 'D#1': 38.89, 'Eb1': 38.89, 'E1': 41.2, 'F1': 43.65, 'F#1': 46.25, 'Gb1': 46.25, 'G1': 49.0, 'G#1': 51.91, 'Ab1': 51.91, 'A1': 55.0, 'A#1': 58.27, 'Bb1': 58.27, 'B1': 61.74, 'C2': 65.41, 'C#2': 69.3, 'Db2': 69.3, 'D2': 73.42, 'D#2': 77.78, 'Eb2': 77.78, 'E2': 82.41, 'F2': 87.31, 'F#2': 92.5, 'Gb2': 92.5, 'G2': 98.0, 'G#2': 103.83, 'Ab2': 103.83, 'A2': 110.0, 'A#2': 116.54, 'Bb2': 116.54, 'B2': 123.47, 'C3': 130.81, 'C#3': 138.59, 'Db3': 138.59, 'D3': 146.83, 'D#3': 155.56, 'Eb3': 155.56, 'E3': 164.81, 'F3': 174.61, 'F#3': 185.0, 'Gb3': 185.0, 'G3': 196.0, 'G#3': 207.65, 'Ab3': 207.65, 'A3': 220.0, 'A#3': 233.08, 'Bb3': 233.08, 'B3': 246.94, 'C4': 261.63, 'C#4': 277.18, 'Db4': 277.18, 'D4': 293.66, 'D#4': 311.

In [None]:
import sounddevice as sd
def create_note(note_name, duration, rate=None, frequencies=None):
    """Synthesize one note as a sine wave with a linearly decaying envelope.

    Args:
        note_name: Key into the frequency table, e.g. 'C4'.
        duration: Length of the note in seconds.
        rate: Samples per second. Defaults to the notebook-level `sample_rate`.
        frequencies: Mapping of note name to frequency. Defaults to the
            notebook-level `note_frequencies`.

    Returns:
        A 1-D float32 NumPy array with int(rate * duration) samples.

    Raises:
        KeyError: If `note_name` is not in the frequency table.
    """
    # Fall back to the notebook globals only when no explicit value is given,
    # so existing two-argument calls behave exactly as before.
    if rate is None:
        rate = sample_rate
    if frequencies is None:
        frequencies = note_frequencies

    num_sample = int(rate * duration)
    frequency = frequencies[note_name]
    t = np.linspace(0, duration, num_sample, endpoint=False)
    # Envelope fades linearly from full volume towards silence over the note.
    amplitude = np.linspace(1, 0, num_sample, endpoint=False)
    # Samples are float32 (not 16-bit PCM).
    return (amplitude * np.sin(2 * np.pi * frequency * t)).astype(np.float32)

# Samples per second; create_note reads this module-level value when called.
sample_rate = 44100
# Play a 5-second C4 note as a quick check.
sd.play(create_note('C4',5), sample_rate)

In [None]:
def create_song(notes, gap=0.1):
    """Render a sequence of notes into one audio buffer.

    Each note is followed by `gap` seconds of silence, including the last one.

    Args:
        notes: Iterable of items where item[0] is the note name and item[1]
            is the duration in seconds, e.g. [('G4', 0.5), ('A4', 0.75)].
        gap: Seconds of silence inserted after every note.

    Returns:
        A 1-D float32 NumPy array; empty when `notes` is empty.
    """
    tones = [create_note(note[0], note[1]) for note in notes]
    if not tones:
        return np.zeros(0, dtype=np.float32)

    # Build the silence once and concatenate everything in a single call,
    # rather than re-copying the growing song for every note.
    silence = np.zeros(int(sample_rate * gap), dtype=np.float32)
    pieces = []
    for tone in tones:
        pieces.extend((tone, silence))
    return np.concatenate(pieces).astype(np.float32)

# Melody as (note name, duration in seconds) pairs, in playback order.
flattened_notes = [
    ('G4', 0.5), ('G4', 0.5), ('A4', 0.75), ('G4', 0.5), ('C5', 0.5), ('B4', 0.75),
    ('G4', 0.5), ('G4', 0.5), ('A4', 0.75), ('G4', 0.5), ('D5', 0.5), ('C5', 0.75),
    ('G4', 0.5), ('G4', 0.5), ('E5', 0.75), ('C5', 0.5), ('B4', 0.5), ('A4', 0.75),
    ('F5', 0.5), ('F5', 0.5), ('E5', 0.75), ('C5', 0.5), ('D5', 0.75), ('C5', 1)
]
# Render the whole melody and hand it to the audio device.
sd.play(create_song(flattened_notes), sample_rate)

In [3]:
import re
# Sample reply text: free-form prose followed by a bracketed list of
# ('note', duration) tuples, used to exercise the extraction pattern below.
response = """Playing song Happy New Year: I'm sorry for the inconvenience, but there seems to be no standard solfa notation for the "Happy New Year" song across different cultures and versions of the song. However, here is an example in solfa notation for a popular version of "Auld Lang Syne":

[('F4', 0.5), ('G4', 0.5), ('C5', 1.5), ('D5', 1.5), ('E5', 1.5), ('F5', 1.5), ('B4', 0.5), ('A4', 0.5), ('F4', 1), ('G4', 0.5), ('C5', 0.5), ('D5', 0.5), ('E5', 0.5), ('F5', 0.5), ('F5', 0.5), ('E5', 0.5), ('D5', 0.5), ('C5', 1)]"""

# Matches "[", one ('text', number) tuple, any number of further
# comma-separated tuples, then "]". The only group is non-capturing, so
# findall returns the whole matched substring.
pattern = r"\[\('.*?',\s*[\d.]+\)(?:,\s*\('.*?',\s*[\d.]+\))*\]"
# First match only; indexing with [0] raises IndexError when nothing matches.
re.findall(pattern, response)[0]

"[('F4', 0.5), ('G4', 0.5), ('C5', 1.5), ('D5', 1.5), ('E5', 1.5), ('F5', 1.5), ('B4', 0.5), ('A4', 0.5), ('F4', 1), ('G4', 0.5), ('C5', 0.5), ('D5', 0.5), ('E5', 0.5), ('F5', 0.5), ('F5', 0.5), ('E5', 0.5), ('D5', 0.5), ('C5', 1)]"

In [None]:
import cv2
from PIL import ImageGrab
import numpy as np

# Record the screen to an AVI file until 'q' is pressed in the preview window
# or the kernel is interrupted.

# Set up screen dimensions and output file
screen_size = ImageGrab.grab().size # Get screen resolution (width, height)
output_file = "screen_recording.avi"  # Output file name
fps = 20.0  # Frames per second declared in the output file

# Define codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*"XVID")  # Codec for AVI format
out = cv2.VideoWriter(output_file, fourcc, fps, screen_size)

print("Press 'q' key to stop recording...")

try:
    while True:
        # Capture the screen
        screenshot = ImageGrab.grab()

        # PIL delivers RGB; OpenCV expects BGR channel order.
        frame = np.array(screenshot)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        # Write the frame to the video file
        # NOTE(review): the loop is not paced to `fps`, so playback speed may
        # differ from real time.
        out.write(frame)

        # Show the frame (useful for monitoring)
        cv2.imshow("Recording", frame)

        # Stop recording when 'q' is pressed
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

except KeyboardInterrupt:
    print("Recording stopped.")
finally:
    # Release resources even when an unexpected error interrupts the loop, so
    # the video file is finalized and the preview window is closed.
    out.release()
    cv2.destroyAllWindows()

Press 'Ctrl+C' or 'q' key to stop recording...
Recording stopped.
