fix(agent): improve event cleanup and enhance view_image result handling in agent

tonyhu-012 · tonyhu-012 · commit 6e919c8cbcd9 · 2025-09-16T15:21:17.000+08:00
diff --git a/entrypoints/sidepanel/App.vue b/entrypoints/sidepanel/App.vue
@@ -12,7 +12,7 @@
 
 <script setup lang="tsx">
 import mime from 'mime'
-import { useTemplateRef, watch } from 'vue'
+import { onBeforeUnmount, useTemplateRef, watch } from 'vue'
 import { browser } from 'wxt/browser'
 
 import { useZIndex } from '@/composables/useZIndex'
@@ -37,7 +37,7 @@ const mainRef = useTemplateRef('mainRef')
 const { index: onboardingPanelZIndex } = useZIndex('settings')
 const userConfig = await getUserConfig()
 
-registerSidepanelRpcEvent('gmailAction', async (e) => {
+const cleanupGmailActionEvent = registerSidepanelRpcEvent('gmailAction', async (e) => {
   const { action, data } = e
   logger.debug('Gmail action triggered:', action, data)
   if (action === 'summary') {
@@ -73,7 +73,7 @@ registerSidepanelRpcEvent('gmailAction', async (e) => {
   }
 })
 
-registerSidepanelRpcEvent('contextMenuClicked', async (e) => {
+const cleanupContextMenuEvent = registerSidepanelRpcEvent('contextMenuClicked', async (e) => {
   const menuItemId = e.menuItemId as ContextMenuId
   const windowId = e.tabInfo.windowId
   if (windowId !== (await browser.windows.getCurrent()).id) return
@@ -118,6 +118,11 @@ registerSidepanelRpcEvent('contextMenuClicked', async (e) => {
     }, { immediate: true })
   }
 })
+
+onBeforeUnmount(() => {
+  cleanupContextMenuEvent()
+  cleanupGmailActionEvent()
+})
 </script>
 
 <style lang="scss">
diff --git a/entrypoints/sidepanel/utils/agent/index.ts b/entrypoints/sidepanel/utils/agent/index.ts
@@ -115,7 +115,7 @@ export class Agent<T extends PromptBasedToolName> {
   }
 
   injectImagesToLastMessage(messages: CoreMessage[], images: Base64ImageData[]) {
-    const lastMessage = messages[messages.length - 1]
+    const lastMessage = structuredClone(messages[messages.length - 1])
     if (lastMessage && lastMessage.role === 'user') {
       if (typeof lastMessage.content === 'string') {
         lastMessage.content = [
@@ -126,8 +126,9 @@ export class Agent<T extends PromptBasedToolName> {
       else {
         lastMessage.content.push(...images.map((img) => ({ type: 'image' as const, image: img.data, mimeType: img.type })))
       }
+      return [...messages.slice(0, -1), lastMessage]
     }
-    return messages
+    return [...messages]
   }
 
   overrideSystemPrompt(messages: CoreMessage[], systemPrompt?: string) {
@@ -254,14 +255,18 @@ export class Agent<T extends PromptBasedToolName> {
   }
 
   // iteration starts from 1
-  buildExtendedUserMessage(iteration: number, originalUserMessage: string, toolResults?: string) {
+  buildExtendedUserMessage(iteration: number, originalUserMessage: string, toolResults?: (AgentToolExecuteResultToolResult & { toolName: T })[]) {
     if (iteration === 1) return `${originalUserMessage}\n\n${AGENT_INITIAL_GUIDANCE.build()}`
-    const textBuilder = new TextBuilder(`${originalUserMessage}`)
-    if (toolResults) {
-      textBuilder.insertContent(toolResults)
+    if (toolResults?.length) {
+      const toolResultPart = this.toolResultsToPrompt(toolResults)
+      const hasViewImageTool = toolResults.some((t) => t.toolName === 'view_image')
+      // for compatibility with the gemma3 model, which loops infinitely if the view_image tool result message also contains the original user message
+      const textBuilder = new TextBuilder(!hasViewImageTool ? originalUserMessage : '')
+      textBuilder.insertContent(toolResultPart)
       textBuilder.insertContent(AGENT_TOOL_CALL_RESULT_GUIDANCE)
+      return textBuilder.build().trim()
     }
-    return textBuilder.build()
+    return new TextBuilder(originalUserMessage).build().trim()
   }
 
   async run(rawBaseMessages: CoreMessage[]) {
@@ -340,14 +345,12 @@ export class Agent<T extends PromptBasedToolName> {
             taskMessageModifier = this.makeTaskMessageGroupProxy(abortController.signal)
           }
           this.log.debug('Executing tool calls', currentLoopToolCalls)
-          const toolResults = await this.executeToolCalls(currentLoopToolCalls, taskScopeToolCalls, loopImages, taskMessageModifier, eventBus)
-          this.log.debug('Tool calls executed', currentLoopToolCalls, toolResults)
-          if (toolResults.length === 0) {
-            const errorResult = TagBuilder.fromStructured('error', { message: `Tool not found, available tools are: ${Object.keys(this.tools).join(', ')}` })
-            loopMessages.push({ role: 'user', content: renderPrompt`${errorResult}` })
-          }
-          else if (toolResults.some((result) => result.type === 'hand-off')) {
-            const handoffResult = toolResults.find((result) => result.type === 'hand-off')
+          const toolExecuteResults = await this.executeToolCalls(currentLoopToolCalls, taskScopeToolCalls, loopImages, taskMessageModifier, eventBus)
+          this.log.debug('Tool calls executed', currentLoopToolCalls, toolExecuteResults)
+          const toolResults = toolExecuteResults.filter((r) => r.type === 'tool-result')
+          const handOffResults = toolExecuteResults.filter((r) => r.type === 'hand-off')
+          if (handOffResults.length > 0) {
+            const handoffResult = handOffResults[0]
             if (handoffResult) {
               // This feature is in beta, not used yet
               this.log.debug('Hand-off detected', handoffResult)
@@ -359,9 +362,12 @@ export class Agent<T extends PromptBasedToolName> {
               if (lastMsg?.content) loopMessages.push(lastMsg)
             }
           }
+          else if (toolResults.length) {
+            loopMessages.push({ role: 'user', content: this.buildExtendedUserMessage(iteration + 1, originalUserMessageText, toolResults) })
+          }
           else {
-            const toolResultPart = this.toolResultsToPrompt(toolResults.filter((t) => t.type === 'tool-result'))
-            loopMessages.push({ role: 'user', content: this.buildExtendedUserMessage(iteration + 1, originalUserMessageText, toolResultPart) })
+            const errorResult = TagBuilder.fromStructured('error', { message: `Tool not found, available tools are: ${Object.keys(this.tools).join(', ')}` })
+            loopMessages.push({ role: 'user', content: renderPrompt`${errorResult}` })
           }
         }
       }
diff --git a/entrypoints/sidepanel/utils/chat/tool-calls/index.ts b/entrypoints/sidepanel/utils/chat/tool-calls/index.ts
@@ -408,8 +408,8 @@ export const executePageClick: AgentToolCallExecute<'click'> = async ({ params,
     const result = await browserSession.buildAccessibleMarkdown({ highlightInteractiveElements, contentFilterThreshold, abortSignal })
     if (lastTabResult && result) {
       const diffs = markdownSectionDiff(lastTabResult.content, result.content)
-      log.debug(`Found diffs between old and new tab content: ${diffs}`, { lastTabResult, result })
       const shouldUseDiff = diffs.trim() && diffs.length < (result.content.length / 2) // not to use diff result if there are too many changes
+      log.debug(`Found diffs between old and new tab content: ${diffs}`, { lastTabResult, result, shouldUseDiff })
       if (shouldUseDiff) {
         taskMsg.summary = t('chat.tool_calls.page_click.redirected', { destination: normalizeInnerText(currentTabInfo?.title) || currentTabInfo?.url || '' })
         return [{