Resources used:
- https://huggingface.co/microsoft/speecht5_tts?text=hello
- https://drlee.io/a-step-by-step-guide-to-train-a-yolo-object-detector-using-google-colab-and-your-laptop-camera-in-ca935a506927

In [None]:
# Upgrade PyTorch, torchvision and Cython, then install the COCO Python API
# straight from its GitHub repository (PythonAPI subdirectory).
# NOTE(review): no versions are pinned, so a fresh run may resolve to different
# releases than the ones this notebook was developed against — consider pinning.
!pip install -U torch torchvision cython
!pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'

Collecting torch
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl (779.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.1/779.1 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import torchvision
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from PIL import Image
import cv2
from google.colab.patches import cv2_imshow

In [None]:
# Fetch the YOLOv5 sources and install the packages listed in its requirements.txt.
# NOTE(review): `%cd yolov5` changes the kernel's working directory for every
# later cell (relative paths such as 'photo.jpg' then resolve inside yolov5/),
# and re-running this cell will clone and cd relative to that new location.
!git clone https://github.com/ultralytics/yolov5
%cd yolov5
!pip install -r requirements.txt

In [None]:
import torch
from yolov5.models.yolo import Model
# Prefer the GPU when CUDA is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Load the pretrained YOLOv5-small model through torch.hub, place it on the
# selected device and switch it to evaluation mode. The trailing expression
# also makes the notebook display the model summary.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
model = model.to(device)
model.eval()

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
from base64 import b64decode

def take_photo(filename='photo.jpg', quality=0.8):
  """Capture a single webcam frame in the browser and store it as a JPEG.

  Injects a small JavaScript helper into the notebook output that shows a live
  video preview with a "Capture" button, waits for the click, and hands the
  frame back to Python as a base64 data URL.

  Args:
    filename: Path the decoded JPEG bytes are written to.
    quality: JPEG quality passed to ``canvas.toDataURL`` in the browser.

  Returns:
    The ``filename`` that was written.
  """
  capture_script = '''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      document.body.appendChild(div);
      div.appendChild(video);
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for Capture to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();
      return canvas.toDataURL('image/jpeg', quality);
    }
  '''
  display(Javascript(capture_script))

  # The browser returns "data:image/jpeg;base64,<payload>"; keep the part
  # after the first comma and decode it back to raw JPEG bytes.
  data_url = eval_js(f'takePhoto({quality})')
  encoded_jpeg = data_url.split(',')[1]
  with open(filename, 'wb') as photo_file:
    photo_file.write(b64decode(encoded_jpeg))
  return filename

In [None]:
# Install the Hugging Face packages used by the text-to-speech cell
# (transformers, sentencepiece, datasets with its audio extras) plus
# tensorflow-probability.
# NOTE(review): versions are unpinned; the commented-out lines below look like
# earlier attempts at resolving an environment conflict — confirm whether they
# are still needed.
# !pip install --upgrade pip
!pip install --upgrade transformers sentencepiece datasets[audio]
!pip install tensorflow-probability
# !pip install tensorflow==2.14

In [None]:
from transformers import pipeline
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
import torch
from PIL import Image
import torchvision.transforms as transforms
from yolov5.utils.general import non_max_suppression
import matplotlib.pyplot as plt
import cv2

# Function for text-to-speech synthesis
# Cache for the heavyweight TTS objects so repeated calls reuse them.
_TTS_CACHE = {}


def _load_tts_resources():
    """Build the SpeechT5 pipeline and speaker embedding once and return them."""
    if not _TTS_CACHE:
        synthesiser = pipeline("text-to-speech", "microsoft/speecht5_tts")
        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
        # Entry 7306 of the x-vector dataset selects the speaker voice;
        # unsqueeze adds the leading batch dimension.
        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
        # Populate the cache only after both loads succeeded so a failed
        # download is retried on the next call.
        _TTS_CACHE["synthesiser"] = synthesiser
        _TTS_CACHE["speaker_embedding"] = speaker_embedding
    return _TTS_CACHE["synthesiser"], _TTS_CACHE["speaker_embedding"]


def text_to_speech(text):
    """Synthesise speech for ``text`` with the SpeechT5 text-to-speech pipeline.

    The pipeline and the speaker embedding are loaded on the first call and
    reused afterwards, instead of being re-created for every sentence.

    Args:
        text: The sentence to speak.

    Returns:
        The pipeline result; callers in this notebook read its ``"audio"``
        and ``"sampling_rate"`` entries.
    """
    synthesiser, speaker_embedding = _load_tts_resources()
    return synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})

# Function to preprocess an image
def preprocess_image(image_path, stride=32):
    """Load an image file as a batched tensor suitable for the detector.

    Args:
        image_path: Path of the image file to load.
        stride: The height and width are padded up to a multiple of this
            value. The default matches the largest downsampling factor of
            the YOLOv5 models — NOTE(review): confirm if another model is used.

    Returns:
        A float tensor of shape ``(1, 3, H', W')`` on the module-level
        ``device``, where ``H'``/``W'`` are the image size rounded up to a
        multiple of ``stride``.
    """
    with Image.open(image_path) as source_image:
        # Force three channels: RGBA or grayscale files would otherwise yield
        # a tensor the 3-channel detector cannot consume.
        rgb_image = source_image.convert("RGB")

    image_tensor = transforms.ToTensor()(rgb_image).unsqueeze(0)

    # Pad only on the bottom/right so box coordinates predicted on the padded
    # tensor are still valid pixel coordinates in the original image.
    height, width = image_tensor.shape[-2:]
    pad_bottom = -height % stride
    pad_right = -width % stride
    if pad_bottom or pad_right:
        image_tensor = torch.nn.functional.pad(
            image_tensor, (0, pad_right, 0, pad_bottom), value=114 / 255
        )

    return image_tensor.to(device)

# Capture a frame with take_photo(), which is defined in an earlier cell.
image_path = take_photo()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Perform object detection on the captured image.
image_tensor = preprocess_image(image_path)
# Inference only: disable autograd so no gradient graph is built or kept alive.
with torch.no_grad():
    raw_predictions = model(image_tensor)
# Keep the detections of the single image in the batch; each row is read
# below as (x1, y1, x2, y2, confidence, class_id).
outputs = non_max_suppression(raw_predictions)[0]

# Class id that is skipped when announcing detections.
# Change this to the label ID of the object that should not be announced.
PERSON_CLASS_ID = 0

# Announce every detected object that is not a person.
detected_objects = []
for box in outputs:
    label = int(box[5])
    score = float(box[4])
    if label != PERSON_CLASS_ID:
        detected_objects.append(f"Detected {model.names[label]} with confidence {score:.2f}")

if detected_objects:
    text = ', '.join(detected_objects)
    print("Detected objects:", text)
    speech_output = text_to_speech(text)
    print("Speech output:", speech_output)
    sf.write("speech.wav", speech_output["audio"], samplerate=speech_output["sampling_rate"])


# Visualize object detection results
def draw_boxes(image_path, outputs, threshold=0.3, save_path=None):
    """Draw the detections on the image, display it, and optionally save it.

    Args:
        image_path: Path of the image the detections were computed on.
        outputs: Iterable of detection rows indexed as
            ``(x1, y1, x2, y2, confidence, class_id)``.
        threshold: Only detections with a confidence above this are drawn.
        save_path: When given, the rendered figure is written to this path
            before it is shown.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    # OpenCV loads BGR; matplotlib expects RGB.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    for box in outputs:
        score = box[4].item()
        label = int(box[5].item())
        if score > threshold:
            x1, y1, x2, y2 = (int(box[i].item()) for i in range(4))
            cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
            text = f"{model.names[label]:s}: {score:.2f}"
            # Keep the caption inside the image for boxes touching the top edge.
            text_y = max(y1 - 5, 12)
            cv2.putText(image, text, (x1, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.axis('off')  # Hide axes
    if save_path is not None:
        # Save before plt.show(): once the figure has been shown, a later
        # plt.savefig() would write an empty canvas.
        fig.savefig(save_path, bbox_inches="tight")
    plt.show()

# Draw bounding boxes on the image, display it and save the annotated result
draw_boxes(image_path, outputs, save_path="detected_image.jpg")

In [None]:
# Initialize speech_output so the playback step can tell whether anything was synthesised
speech_output = None

if detected_objects:
    text = ', '.join(detected_objects)
    print("Detected objects:", text)
    # Generate text-to-speech output.
    # NOTE(review): the distance is a fixed string, not a measured value.
    speech_output = text_to_speech(text + " at distance 10 cm")
    print("Speech output:", speech_output)
    sf.write("speech.wav", speech_output["audio"], samplerate=speech_output["sampling_rate"])

# Check if speech_output is not None before playing audio
if speech_output is not None:
    # Play audio if available
    try:
        from IPython.display import Audio
        audio = Audio(data=speech_output["audio"], rate=speech_output["sampling_rate"])
        display(audio)
    except ImportError:
        print("IPython.display.Audio is not available. Please make sure IPython is properly installed.")

    # Save the audio to a file. Use soundfile so a real WAV container (header
    # plus samples) is written; dumping the raw sample array to disk does not
    # produce a playable .wav file.
    try:
        sf.write("speech_output.wav", speech_output["audio"], samplerate=speech_output["sampling_rate"])
        print("Audio file saved as 'speech_output.wav'.")
    except Exception as e:
        print(f"Error saving audio file: {e}")



In [None]:
# Record the currently installed packages and their versions in a text file
# (written to the current working directory).
!pip list > installed_packages.txt

In [None]:
# Offer the package list written by the previous cell as a browser download.
# Depends on installed_packages.txt existing in the current working directory.
from google.colab import files
files.download('installed_packages.txt')
