将视觉语言模型gpt4v封装成API服务，通过agent对complex question进行解答

Plan-and-execute agents accomplish an objective by first planning what to do and then executing the subtasks. This idea is largely inspired by [BabyAGI](https://github.com/yoheinakajima/babyagi) and later by the ["Plan-and-Solve" paper](https://arxiv.org/abs/2305.04091). The planning is almost always done by an LLM. The execution is usually done by a separate agent (equipped with tools).

In [None]:
import cv2
import base64
from openai import OpenAI
from langchain.chains import LLMMathChain
from langchain.memory import ConversationTokenBufferMemory, ConversationSummaryBufferMemory
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain.tools import BaseTool, StructuredTool, Tool, tool
from langchain_experimental.plan_and_execute import (
    PlanAndExecute,
    load_agent_executor,
    load_chat_planner,
)
from langchain_openai import ChatOpenAI
import os
from PIL import Image
from langchain.agents import initialize_agent, AgentType
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
import sys

# Set the HTTP and HTTPS proxy environment variables for this process.
# NOTE: the proxy address is hard-coded; adjust it for your own network.
os.environ.update(
    HTTP_PROXY="http://10.16.64.223:7890",
    HTTPS_PROXY="http://10.16.64.223:7890",
)

In [None]:
# Module-level OpenAI client used by gpt4v_video for chat-completion requests.
# NOTE(review): OPENAI_API_KEY is only assigned in a later cell of this
# notebook, so that cell must be executed first or this line raises NameError.
client = OpenAI(api_key=OPENAI_API_KEY)
def gpt4v_video(video_path, q, *, num_samples=16):
    """Ask the GPT-4 vision model a question about a video.

    Every frame of the video is read and JPEG-encoded, a regularly spaced
    subset is kept, and that subset is sent together with ``q`` in a single
    chat-completion request.

    Args:
        video_path: Path of the video, handed to ``cv2.VideoCapture``.
        q: Question text placed in front of the frames in the user message.
        num_samples: Target number of frames to send. The sampling step is
            ``total_frames // num_samples`` (never less than 1), so the
            number of frames actually sent can exceed this value.

    Returns:
        The text of the first completion choice, or ``""`` when no frame
        could be read from the video or the API request raised.
    """
    video = cv2.VideoCapture(video_path)
    jpeg_buffers = []
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            encoded_ok, buffer = cv2.imencode(".jpg", frame)
            if encoded_ok:
                jpeg_buffers.append(buffer)
    finally:
        # Release the capture even if reading or encoding raised.
        video.release()

    if not jpeg_buffers:
        # Unopenable or empty video: report it the same way as an API failure
        # instead of crashing on a zero slice step below.
        print(f"No frames could be read from video: {video_path}")
        return ""

    # A step of 0 (fewer frames than num_samples) would make the slice raise
    # ValueError, so clamp the step to at least 1.
    step = max(1, len(jpeg_buffers) // max(1, num_samples))

    # Base64-encode only the frames that are actually sent.
    selected_frames = [
        base64.b64encode(jpeg).decode("utf-8") for jpeg in jpeg_buffers[::step]
    ]

    prompt_messages = [
        {
            "role": "user",
            "content": [
                q,
                *[{"image": encoded} for encoded in selected_frames],
            ],
        },
    ]
    params = {
        "model": "gpt-4-vision-preview",
        "messages": prompt_messages,
        "max_tokens": 4090,
    }

    try:
        result = client.chat.completions.create(**params)
        return result.choices[0].message.content
    except Exception as e:
        # Best effort: callers receive an empty answer when the request fails.
        print(f"An exception occurred: {e}")
        return ""

In [None]:
class GPTObjectDetectingTool(BaseTool):
    """Tool that passes a video path and an object-detection question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Object Detecting Tool"
    description: str = (
        "Utilize this tool for object detection in images or videos. Input the image or video path and ask questions related to object detection. "
        "The tool will provide information about the detected objects in the image or video. "
        "Keep in mind that GPT may have limitations in object detection, and the results should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")

In [23]:
class GPTObjectTrackingTool(BaseTool):
    """Tool that passes a video path and an object-tracking question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Object Tracking Tool"
    description: str = (
        "Utilize this tool for object tracking in videos. Input the video path and ask questions related to object tracking. "
        "The tool will provide information about the tracked objects in the video. "
        "Keep in mind that GPT may have limitations in object tracking, and the results should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")


In [None]:
class GPTActionRecognitionTool(BaseTool):
    """Tool that passes a video path and an action-recognition question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Action Recognition Tool"
    description: str = (
        "Utilize this tool for video action recognition. Input the video path and ask questions related to action recognition within the video. "
        "The tool will provide information about the recognized actions or activities in the video. "
        "Keep in mind that GPT may have limitations in video action recognition, and the results should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")

In [None]:
class GPTVideoFaceRecognitionTool(BaseTool):
    """Tool that passes a video path and a face-recognition question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Face Recognition Tool"
    description: str = (
        "Use this tool to perform face recognition on a video. Provide the path to the video along with any questions related to face recognition. "
        "The tool will return information about recognized faces in the video. "
        "Keep in mind that GPT might make mistakes, so it's advisable to verify critical information. "
        "If the response is 'I cannot assist with this request,' consider retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")


In [None]:
class GPTVideoClassificationTool(BaseTool):
    """Tool that passes a video path and a video-classification question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Classification Tool"
    description: str = (
        "Utilize this tool for video classification tasks. Input the video path and ask questions related to video classification. "
        "The tool will provide information about the classified categories or labels for the video content. "
        "Keep in mind that GPT may have limitations in video classification, and the results should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")


In [24]:
class GPTVideoSegmentationTool(BaseTool):
    """Tool that passes a video path and a video-segmentation question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Segmentation Tool"
    description: str = (
        "This tool is designed for video segmentation. Provide the video path and ask questions related to video segmentation. "
        "The tool will return information about segmented regions or objects in the video. "
        "Please note that GPT may have limitations in video segmentation, and results should be validated for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")


In [None]:
class GPTVideoCaptioningTool(BaseTool):
    """Tool that passes a video path and a captioning question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Captioning Tool"
    description: str = (
        "Utilize this tool for video captioning tasks. Input the video path and ask questions related to generating captions for the video content. "
        "The tool will provide descriptive captions or textual summaries for the video. "
        "Keep in mind that GPT may have limitations in video captioning, and the results should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")

In [None]:
class GPTVideoSummarizationTool(BaseTool):
    """Tool that passes a video path and a summarization question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Summarization Tool"
    description: str = (
        "Use this tool to generate a summary of a video. Input the video path and ask questions related to video content summarization. "
        "The tool will provide a concise summary based on the input video. "
        "Keep in mind that GPT may have limitations, and the generated summary should be verified for accuracy. "
        "If the response is 'I cannot assist with this request,' consider refining your question or retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")

In [None]:
class GPTVideoPoseEstimationTool(BaseTool):
    """Tool that passes a video path and a pose-estimation question to ``gpt4v_video``."""

    # Annotated explicitly: Pydantic v2-based LangChain releases reject
    # un-annotated overrides of the inherited ``name``/``description`` fields.
    name: str = "GPT Video Pose Estimation Tool"
    # Each fragment ends with a space so that the concatenated sentences do not
    # run together in the text shown to the LLM.
    description: str = (
        "Use this tool when given the path to a video that you would like to estimate the people's poses in the video. "
        "You should input the video path and a question about Video Pose Estimation. It will return the corresponding answer. "
        "It will return people's poses in the video. "
        "Please note that GPT can make mistakes. Consider checking important information. "
        "If it says 'I cannot assist with this request', you may consider retrying."
    )

    def _run(self, video_path: str, question: str) -> str:
        """Return the answer of ``gpt4v_video`` for ``question`` about the video at ``video_path``."""
        return gpt4v_video(video_path, question)

    async def _arun(self, *args, **kwargs):
        """Reject asynchronous use; always raises ``NotImplementedError``.

        Accepts arbitrary arguments so the intended error is raised regardless
        of which tool arguments are forwarded.
        """
        raise NotImplementedError("This tool does not support async")

In [25]:
# Placeholder credential: substitute a real OpenAI API key before running.
OPENAI_API_KEY = "<input your api key>"

search = DuckDuckGoSearchAPIWrapper()
llm = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY, temperature=0)
llm_math_chain = LLMMathChain.from_llm(verbose=True, llm=llm)

# Tool instances collected for the agent, one per selected tool class.
tools = [
    GPTVideoPoseEstimationTool(),
    GPTVideoFaceRecognitionTool(),
    GPTVideoSummarizationTool(),
    GPTObjectTrackingTool(),
    GPTVideoSegmentationTool(),
]

In [26]:
# Chat model shared by the planner, the step executor and the agent.
model = ChatOpenAI(model="gpt-4", api_key=OPENAI_API_KEY, temperature=0)

# Planner and executor are built here, but the PlanAndExecute agent that would
# combine them is not enabled; a structured-chat agent is created instead.
planner = load_chat_planner(model)
executor = load_agent_executor(model, tools, verbose=True)

# Window memory configured with k=5, stored under the "chat_history" key.
conversational_memory = ConversationBufferWindowMemory(
    k=5,
    return_messages=True,
    memory_key="chat_history",
)

agent = initialize_agent(
    tools=tools,
    llm=model,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    memory=conversational_memory,
    max_iterations=5,
    early_stopping_method="generate",
    verbose=True,
)

In [27]:
import json

# Load the benchmark questions. The encoding is pinned to UTF-8 so the JSON
# file is decoded the same way regardless of the platform's locale default.
with open('./mvbech_file/complex.json', 'r', encoding='utf-8') as file:
    complex_questions = json.load(file)

# Notebook display of one sample entry.
complex_questions[2]

{'id': 2,
 'video_path': '/home/nkd/Documents/ssd_nvme0n1/jiajiyuan/ai_agent/video/star/Charades_v1_480/WBS4I.mp4',
 'question': 'Question: What happened before the person watched at the book?\nOptions:\n(A) Put down the sandwich.\n(B) Washed the table.\n(C) Opened the closet/cabinet.\n(D) Put down the pillow.',
 'gt': '(C) Opened the closet/cabinet.'}

In [29]:
import random

# Pick one benchmark entry at random and hand its question and video path
# to the agent.
question = complex_questions[
    random.randint(0, len(complex_questions) - 1)
]
video_path, user_question = question.get("video_path"), question.get("question")
print(question)
response = agent.run(f'{user_question}, this is the video path: {video_path}')

# To go through the benchmark in order instead, iterate over
# complex_questions and issue the same agent.run call for each entry.

{'id': 137, 'video_path': '/home/nkd/Documents/ssd_nvme0n1/jiajiyuan/ai_agent/video/star/Charades_v1_480/P2HZG.mp4', 'question': 'Question: What happened before the person opened the closet/cabinet?\nOptions:\n(A) Closed the laptop.\n(B) Put down the cup/glass/bottle.\n(C) Put down the food.\n(D) Threw the bag.', 'gt': '(A) Closed the laptop.'}


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: The user wants to know what action was performed before the person opened the closet/cabinet in the video. I will use the GPT Video Summarization tool to get a summary of the video, which should help answer the question.

Action:
```
{
  "action": "GPT Video Summarization",
  "action_input": {
    "video_path": "/home/nkd/Documents/ssd_nvme0n1/jiajiyuan/ai_agent/video/star/Charades_v1_480/P2HZG.mp4",
    "question": "What happened before the person opened the closet/cabinet?"
  }
}
```[0m
Observation: [38;5;200m[1;3mIt seems like the person was initially sitting at a clutte