In [1]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [2]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0, api_key=os.getenv("OPENAI_API_KEY"))

In [3]:
import google.generativeai as genai

genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
vision = genai.GenerativeModel("gemini-1.5-flash")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from elevenlabs import stream
from elevenlabs.client import ElevenLabs

speech_client = ElevenLabs(
    api_key=os.getenv("11LABS_API_KEY"),
)

* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed




In [5]:
from twilio.rest import Client

message_client = Client(
    username=os.getenv("TWILIO_ACCOUNT_SID"), password=os.getenv("TWILIO_AUTH_TOKEN")
)

In [6]:
import cv2
import time


class VisionHelper:
    def __init__(self, save_path: str):
        self.save_path = save_path
        self.fps = 24
        self.frame_width = 640
        self.frame_height = 480

    def record_video(self, duration: int = 3):
        cap = cv2.VideoCapture(0)

        if not cap.isOpened():
            raise Exception("Webcam not found!")

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(
            self.save_path, fourcc, self.fps, (self.frame_width, self.frame_height)
        )

        start_time = time.time()
        while int(time.time() - start_time) < duration:
            ret, frame = cap.read()
            if ret:
                out.write(frame)
                cv2.imshow("Recording...", frame)
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
            else:
                break

        cap.release()
        out.release()
        cv2.destroyAllWindows()

        return self.save_path

    def take_snapshot(self):
        cap = cv2.VideoCapture(0)

        if not cap.isOpened():
            raise Exception("Webcam not found!")

        ret, frame = cap.read()
        if ret:
            cv2.imwrite(self.save_path, frame)
            cv2.imshow("Snapshot", frame)
            cv2.waitKey(1)
        else:
            raise Exception("Failed to take snapshot")

        cap.release()
        cv2.destroyAllWindows()

        return self.save_path

    def mod_prompt(prompt: str):
        return prompt + 'Answer in one sentence. '

    def send_video_to_gemini(self, prompt):
        vid_file = genai.upload_file(self.save_path)
        while vid_file.state.name == "PROCESSING":
            time.sleep(2)
            vid_file = genai.get_file(vid_file.name)

        if vid_file.state.name == "FAILED":
            raise ValueError(f"Failed to upload video: {vid_file.name}")
        else:
            response = vision.generate_content([vid_file, self.mod_prompt(prompt)])
            genai.delete_file(vid_file.name)
            return response.text

    def send_image_to_gemini(self, prompt):
        img_file = genai.upload_file(self.save_path)
        response = vision.generate_content([img_file, self.mod_prompt(prompt)])
        genai.delete_file(img_file.name)
        return response.text

In [7]:
from langchain.agents import tool
import uuid

from pydub import AudioSegment
from pydub.playback import play


os.makedirs("snapshots/videos", exist_ok=True)
os.makedirs("snapshots/photos", exist_ok=True)

@tool
def analyze_vision(prompt: str, media: str) -> str:
    """Records webcam feed for 3 seconds or takes a snapshot and analyzes the content based on the prompt. Media is either 'video' or 'photo'."""
    print("-------------------------")
    print(f"Instruction: {prompt}")

    if media == "video":
        file_name = f"snapshots/videos/{uuid.uuid4()}.mp4"

        helper = VisionHelper(save_path=file_name)

        print("Recording video...")
        helper.record_video()
        print("Analyzing video...")
        response = helper.send_video_to_gemini(prompt)
        print(f"Activity: {response}")
        print("-------------------------")

    elif media == "photo":
        file_name = f"snapshots/photos/{uuid.uuid4()}.jpg"

        helper = VisionHelper(save_path=file_name)

        print("Snapshot taken...")
        helper.take_snapshot()
        print("Analyzing photo...")
        response = helper.send_image_to_gemini(prompt)
        print(f"Activity: {response}")
        print("-------------------------")

    return response


@tool
def play_music(title: str) -> str:
    """plays 30 sec short music, can be one of: study, chill, playful styles"""
    try:
        song = AudioSegment.from_file(f"music/{title}.mp3")
        play(song)
        return f"{title} song finished playing!"
    except Exception as e:
        return f"Error playing file: {str(e)}"


@tool
def send_alert(message: str) -> str:
    '''sends a whatsapp message to the parent's emergency number'''
    message_client.messages.create(
        from_="whatsapp:+14155238886",
        body=message,
        to=f"whatsapp:{os.getenv('EMERGENCY_NUMBER')}",
    )
    return "Alert sent to parent!"


@tool
def check_reminders(placeholder: str) -> str:
    '''gets the latest reminders set by parent, input is empty string'''
    # write an API call to get reminders from database
    reminders = [
        "Remember to pick up your toys when you're done playing",
        "Brush your teeth before bed",
        "Don't forget to do your science homework",
    ]
    return reminders


tools = [analyze_vision, play_music, send_alert, check_reminders]

In [8]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage


prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are a friendly, talking teddy bear, a constant companion to a child. Your primary role is to engage the child warmly and be attentive to their emotions. Comfort the child if they seem scared, sad, or anxious, and reassure them with calming words and supportive actions. When they are happy, playful, or excited, actively engage in fun and imaginative activities with them.

            Whenever the child expresses emotions, describes a visual scene, or engages in an activity, immediately call the analyze_vision tool to interpret their facial expressions, mood, or surrounding activities. 

            For example:
            If the child says, "I'm feeling sad," call the tool with the prompt: "What emotion is the child expressing?"
            If the child says, "I'm playing a fun game," call the tool with the prompt: "What game is the child playing?"
            Make frequent use of the tool to make the interaction more personalized and meaningful. You may choose to take video or photo depending on how necessary it is to understand the situation.

            When you detect that the child might benefit from some background music, call the play_music tool. 
            For example:
            If the child seems calm or needs to relax, play "chill" music.
            If the child is playing a game or fun activity, play "playful" music.
            If the child is focused or studying, play "study" music.

            If an emergency situation occurs, immediately send an alert to the parent by calling the send_alert tool with a message such as: "Emergency: The child has fallen down. Immediate assistance needed."

            Whenever the child seems to be finishing an activity or winding down for the day, the agent can call the check_reminders tool to help the child stay on top of their tasks.
            
            You can also imagine having your own life and experiences to relate to the child. Speak like a close friend, not just as a tool for questioning, and use these imaginative stories to connect more deeply.""",
        ),
        MessagesPlaceholder(variable_name="chat_history"),
        ("user", "{input}"),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

In [9]:
from langchain.agents.format_scratchpad.openai_tools import (
    format_to_openai_tool_messages,
)
from langchain.agents.output_parsers.openai_tools import OpenAIToolsAgentOutputParser
from langchain.agents import AgentExecutor


llm_with_tools = llm.bind_tools(tools)

agent = (
    {
        "input": lambda x: x["input"],
        "agent_scratchpad": lambda x: format_to_openai_tool_messages(
            x["intermediate_steps"]
        ),
        "chat_history": lambda x: x["chat_history"],
    }
    | prompt
    | llm_with_tools
    | OpenAIToolsAgentOutputParser()
)

agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=False)

In [10]:
def manage_memory(chat_history, k: int = 5):
    new_memory = []

    if len(chat_history) <= k:
        return False

    last_k_messages = chat_history[-k:]

    earlier_messages = chat_history[:-k]
    if len(earlier_messages) > 0:
        summarization_prompt = ChatPromptTemplate.from_messages(
            [
                MessagesPlaceholder(variable_name="chat_history"),
                (
                    "system",
                    "summarize the chat history in one sentence, to provide context for the agent. ",
                ),
            ]
        )
        summarization_chain = summarization_prompt | llm
        summary_message = summarization_chain.invoke({"chat_history": earlier_messages})

        new_memory.append(SystemMessage(content=summary_message.content))

    for message in last_k_messages:
        new_memory.append(message)

    return new_memory

In [12]:
import speech_recognition as sr
import time

recognizer = sr.Recognizer()

def get_speech_input():
    with sr.Microphone(device_index=2) as source:
        print("Listening...")
        audio = recognizer.listen(source)
    try:
        return recognizer.recognize_google_cloud(audio, credentials_json='clipcraft-account.json')
    except sr.UnknownValueError:
        return "Sorry, I didn't understand that."
    except sr.RequestError:
        return "Could not request results; check your network."


def speak(text):
    audio = speech_client.generate(
        text=text,
        voice="Jessica",
        stream=True,
        model="eleven_multilingual_v2",
    )
    time.sleep(1)
    stream(audio)


chat_history = []

opening = "Hello! I am Teddy, your friendly talking teddy bear. What's up Kid?"
chat_history.append(AIMessage(content=opening))
speak(opening)

while True:
    new_memory = manage_memory(chat_history, k=5)
    if new_memory:
        chat_history = new_memory
    
    question = get_speech_input()
    print(f"You: {question}")
    result = agent_executor.invoke({"input": question, "chat_history": chat_history})

    print(f"Teddy: {result['output']}")
    speak(result["output"])

    chat_history.extend(
        [
            HumanMessage(content=question),
            AIMessage(content=result["output"]),
        ]
    )

KeyboardInterrupt: 