In [None]:
# Installing the required libraries
# pip install SpeechRecognition   (PyPI name of the package imported as `speech_recognition`)
# pip install pyttsx3
# pip install soundfile
# pip install opencv-python-headless
# pip install torch
# pip install transformers
# pip install huggingface_hub

Collecting opencv-python-headless
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python-headless
Successfully installed opencv-python-headless-4.10.0.84


In [None]:
# System-level audio dependencies, installed through IPython's `!` shell escape.
# NOTE(review): espeak is presumably the speech backend pyttsx3 drives on Linux,
# and portaudio19-dev the headers PyAudio compiles against - confirm.
!apt install espeak
!apt-get install portaudio19-dev
# PyAudio is installed after the apt packages above.
!pip install pyaudio

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
espeak is already the newest version (1.48.15+dfsg-3).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
portaudio19-dev is already the newest version (19.6.0-1.1).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.


In [None]:
# importing the libraries

import os
from getpass import getpass

import cv2
import numpy as np
import pyttsx3
import speech_recognition as sr
import torch

from huggingface_hub import login
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM

In [None]:
# Authenticate with the Hugging Face Hub.
# The access token is a secret and must never be stored in the notebook: read
# it from the HF_TOKEN environment variable, falling back to a hidden prompt.
# NOTE: the token that was previously hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
hugging_face_token_id = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(hugging_face_token_id)


# loading the models  (Whisper for speech) (Llama 2 for text)

# Whisper: the processor turns audio into model inputs and decodes output ids.
whisper_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

# Llama 2 chat: tokenizer + causal language model used to generate the replies.
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# Shared text-to-speech engine and speech recognizer used by the functions below.
tts_engine = pyttsx3.init()
recognizer = sr.Recognizer()

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def recognize_speech_microphone():  # recognize from microphone
    """Record one utterance from the microphone and transcribe it with Whisper.

    Uses the module-level ``recognizer``, ``whisper_processor`` and
    ``whisper_model``.

    Returns:
        The transcribed text, or an empty string when no speech starts within
        the 5-second listen timeout.
    """
    with sr.Microphone() as source:
        recognizer.adjust_for_ambient_noise(source)
        print("Listening...")
        try:
            audio = recognizer.listen(source, timeout=5)
        except sr.WaitTimeoutError:
            # listen() raises when nobody starts speaking before the timeout;
            # report "nothing heard" instead of crashing the caller's loop.
            print("No speech detected.")
            return ""

    # Whisper's feature extractor needs a float waveform sampled at 16 kHz, not
    # WAV container bytes: resample to 16 kHz / 16-bit PCM, then scale the
    # little-endian int16 samples into [-1.0, 1.0).
    pcm = audio.get_raw_data(convert_rate=16000, convert_width=2)
    waveform = np.frombuffer(pcm, dtype="<i2").astype(np.float32) / 32768.0

    inputs = whisper_processor(waveform, return_tensors="pt", sampling_rate=16000)
    # The Whisper processor returns "input_features"; there is no "input_ids" key.
    outputs = whisper_model.generate(inputs["input_features"])
    text = whisper_processor.decode(outputs[0], skip_special_tokens=True).strip()
    print(f"Recognized Text: {text}")
    return text

In [None]:
def recognize_speech_webcam(): # recognize from webcam
    """Show the webcam feed until 'q' is pressed, then record and transcribe speech.

    Frames from camera 0 are displayed in a window named "Webcam". Pressing
    'q' starts a microphone recording that is transcribed with the
    module-level Whisper processor and model.

    Returns:
        The transcribed text; an empty string when no speech starts within the
        5-second listen timeout; ``None`` when a frame cannot be read from the
        camera.
    """
    cap = cv2.VideoCapture(0)
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Failed to capture video.")
                return None

            cv2.imshow("Webcam", frame)

            # Keep showing frames until the user presses 'q'.
            if cv2.waitKey(1) & 0xFF != ord('q'):
                continue

            with sr.Microphone() as source:
                print("Listening...")
                try:
                    audio = recognizer.listen(source, timeout=5)
                except sr.WaitTimeoutError:
                    # listen() raises when nobody starts speaking in time.
                    print("No speech detected.")
                    return ""

            # Whisper's feature extractor needs a float waveform at 16 kHz, not
            # WAV container bytes: resample to 16 kHz / 16-bit PCM, then scale
            # the little-endian int16 samples into [-1.0, 1.0).
            pcm = audio.get_raw_data(convert_rate=16000, convert_width=2)
            waveform = np.frombuffer(pcm, dtype="<i2").astype(np.float32) / 32768.0

            inputs = whisper_processor(waveform, return_tensors="pt", sampling_rate=16000)
            # The Whisper processor returns "input_features", not "input_ids".
            outputs = whisper_model.generate(inputs["input_features"])
            text = whisper_processor.decode(outputs[0], skip_special_tokens=True).strip()
            print(f"Recognized Text: {text}")
            return text
    finally:
        # Always free the camera and close the preview window, including when
        # recording or transcription raises.
        cap.release()
        cv2.destroyAllWindows()

In [None]:
def recognize_speech_text_input(): # recognize from text
    """Prompt on the console and return the line the user typed, unchanged."""
    return input("Enter your text: ")

In [None]:
def generate_response(input_text, max_new_tokens=None):
    """Generate a reply to ``input_text`` with the module-level Llama model.

    Args:
        input_text: The prompt to answer.
        max_new_tokens: Optional cap on the number of generated tokens. When
            ``None`` the model's own generation settings decide the length.

    Returns:
        The generated reply only, without the echoed prompt.
    """
    # Keep the tokenized inputs on the same device as the model.
    inputs = llama_tokenizer(input_text, return_tensors="pt").to(llama_model.device)

    generate_kwargs = {}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # Pass the attention mask together with the ids so generate() does not
    # have to guess which positions are padding.
    outputs = llama_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generate_kwargs,
    )

    # A causal LM returns prompt + continuation; drop the prompt tokens so the
    # caller does not print/speak the user's own question back.
    prompt_length = inputs["input_ids"].shape[-1]
    response_text = llama_tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    print(f"Generated Response: {response_text}")
    return response_text

In [None]:
def speak_text(text):
    """Queue ``text`` on the shared TTS engine and block until it has been processed."""
    engine = tts_engine
    engine.say(text)       # enqueue the utterance
    engine.runAndWait()    # process the queue before returning

In [None]:
def main():   # main loop for full code
    """Ask for an input method, then run the listen -> respond -> speak loop.

    The loop runs until the user interrupts it (Ctrl+C / kernel interrupt),
    input reaches end-of-file, or the selected input function returns ``None``
    (the webcam reader does so when it cannot read a frame).
    """
    print("Select input method:")
    print("1: Microphone")
    print("2: Webcam")
    print("3: Text Input")

    # Strip so stray whitespace around the number is not rejected.
    choice = input("Enter the number of your choice: ").strip()

    input_methods = {
        "1": recognize_speech_microphone,
        "2": recognize_speech_webcam,
        "3": recognize_speech_text_input,
    }
    recognize_speech = input_methods.get(choice)
    if recognize_speech is None:
        print("Invalid choice!")
        return

    print("Starting the speech-to-speech application...")

    try:
        while True:
            spoken_text = recognize_speech()
            if spoken_text is None:
                # The input device failed; retrying immediately would spin forever.
                break
            if not spoken_text:
                # Nothing was recognized or typed; ask again.
                continue

            response_text = generate_response(spoken_text)
            speak_text(response_text)
    except (KeyboardInterrupt, EOFError):
        # Interrupting is the normal way to leave the loop; exit without a traceback.
        pass

    print("Stopping the speech-to-speech application.")

# Entry point: start the interactive loop when this code is executed directly
# (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()

Select input method:
1: Microphone
2: Webcam
3: Text Input
Enter the number of your choice: 3
Starting the speech-to-speech application...
Enter your text: where is tajmahal?
Generated Response: where is tajmahal?

The Taj Mahal is located in Agra, India. It is situated on the southern bank of the Yamuna River, and is considered one of the most beautiful examples of Mughal architecture in India. The Taj Mahal was built in the 17th century by Mughal Emperor Shah Jahan as a mausoleum for his wife, Mumtaz Mahal, who died during childbirth in 1631. The monument is made of white marble and features intricate inlay work of precious stones, including jasper, jade, and turquoise. It is considered one of the Seven Wonders of the World and is a UNESCO World Heritage Site.
Enter your text: IT means
Generated Response: IT means Information Technology. IT is a broad field that combines computer science, computer engineering, and other areas to design, develop, and manage computer systems and techno

KeyboardInterrupt: Interrupted by user