feat(pdf): Refactor PDF handling to extract text content and improve attachment structure

tonyhu-012 · tonyhu-012 · commit 3f2a64359da4 · 2025-07-16T17:21:09.000+08:00
diff --git a/entrypoints/content/components/AttachmentSelector.vue b/entrypoints/content/components/AttachmentSelector.vue
@@ -192,22 +192,23 @@ import Tag from '@/components/Tag.vue'
 import Button from '@/components/ui/Button.vue'
 import Divider from '@/components/ui/Divider.vue'
 import Text from '@/components/ui/Text.vue'
+import { useLogger } from '@/composables/useLogger'
 import { useTimeoutValue } from '@/composables/useTimeoutValue'
 import { AttachmentItem, ContextAttachment } from '@/types/chat'
 import { TabInfo } from '@/types/tab'
-import { fileToBase64 } from '@/utils/base64'
+import { hashFile } from '@/utils/hash'
 import { useI18n } from '@/utils/i18n'
 import { generateRandomId } from '@/utils/id'
 import { convertImageFileToJpegBase64 } from '@/utils/image'
-import { isModelSupportPDFToImages } from '@/utils/llm/models'
-import { checkReadablePdf, getDocumentProxy, getPdfPageCount } from '@/utils/pdf'
+import { extractPdfText, getDocumentProxy, getPdfPageCount } from '@/utils/pdf'
 import { c2bRpc, registerContentScriptRpcEvent } from '@/utils/rpc'
 import { ByteSize } from '@/utils/sizes'
 import { getUserConfig } from '@/utils/user-config'
 
 import { getValidTabs } from '../utils/tabs'
 import ExternalImage from './ExternalImage.vue'
 
+const logger = useLogger()
 const { t } = useI18n()
 
 const cleanUpTabUpdatedListener = registerContentScriptRpcEvent('tabUpdated', async () => {
@@ -297,13 +298,6 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
     matchMimeType: (mimeType) => mimeType === 'application/pdf',
     validateFile: async ({ count }, file: File) => {
       const docProxy = await getDocumentProxy(file)
-      if (!currentModel.value || !isModelSupportPDFToImages(currentModel.value)) {
-        const readable = await checkReadablePdf(docProxy)
-        if (!readable) {
-          showErrorMessage(t('chat.input.attachment_selector.pdf_text_extract_error'))
-          return false
-        }
-      }
       if (count >= MAX_PDF_COUNT) {
         showErrorMessage(t('chat.input.attachment_selector.too_many_pdfs', { max: MAX_PDF_COUNT }))
         return false
@@ -320,16 +314,21 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
       return true
     },
     convertFileToAttachment: async (file: File): Promise<ContextAttachment> => {
-      const base64Data = await fileToBase64(file)
-      return {
+      const pdfText = await extractPdfText(file)
+      const info: ContextAttachment = {
         type: 'pdf',
         value: {
-          data: base64Data,
+          type: 'text',
+          pageCount: pdfText.pdfProxy.numPages,
+          textContent: pdfText.textContent.text,
           id: generateRandomId(),
-          size: file.size,
+          fileSize: file.size,
+          fileHash: await hashFile(file),
           name: file.name,
         },
       }
+      logger.debug('extracted pdf content', info)
+      return info
     },
   },
 ]
diff --git a/entrypoints/content/utils/chat/chat.ts b/entrypoints/content/utils/chat/chat.ts
@@ -2,16 +2,13 @@ import { CoreMessage } from 'ai'
 import EventEmitter from 'events'
 import { type Ref, ref } from 'vue'
 
-import { ContextAttachment } from '@/types/chat'
-import { Base64PDFData, PDFContentForModel } from '@/types/pdf'
+import { ContextAttachment, PDFAttachment } from '@/types/chat'
+import { PDFContentForModel } from '@/types/pdf'
 import { nonNullable } from '@/utils/array'
-import { arrayBufferToBase64 } from '@/utils/base64'
 import { parseDocument } from '@/utils/document-parser'
 import { AbortError, AppError } from '@/utils/error'
 import { useGlobalI18n } from '@/utils/i18n'
-import { isModelSupportPDFToImages } from '@/utils/llm/models'
 import logger from '@/utils/logger'
-import { extractPdfText, renderPdfPagesAsImages } from '@/utils/pdf'
 import { chatWithPageContent, generateSearchKeywords, nextStep, Page, summarizeWithPageContent } from '@/utils/prompts'
 import { UserPrompt } from '@/utils/prompts/helpers'
 import { SearchingMessage } from '@/utils/search'
@@ -319,9 +316,11 @@ export class Chat {
   async checkNextStep(contextMsgs: { role: 'user' | 'assistant', content: string }[]) {
     log.debug('checkNextStep', contextMsgs)
     const relevantTabIds = this.contextTabs.map((tab) => tab.tabId)
+    const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(this.contextPDFs[0]) : undefined
+    if (this.contextPDFs.length > 1) log.warn('Multiple PDFs are attached, only the first one will be used for the chat context.')
     const pages = await getDocumentContentOfTabs(relevantTabIds)
     const abortController = this.createAbortController()
-    const prompt = await nextStep(contextMsgs, pages.filter(nonNullable))
+    const prompt = await nextStep(contextMsgs, pages.filter(nonNullable), relevantPDF)
     const next = await generateObjectInBackground({
       schema: 'nextStep',
       prompt: prompt.user.extractText(),
@@ -452,35 +451,18 @@ export class Chat {
     return await this.sendMessage(prompt.user, prompt.system, { autoDeleteEmptyResponseMsg: false })
   }
 
-  private async extractPDFContent(model: string | undefined, pdfData: Base64PDFData): Promise<PDFContentForModel> {
-    // use vision model for better performance if available
-    if (model && isModelSupportPDFToImages(model)) {
-      const pdfContent = (await renderPdfPagesAsImages(pdfData.data))
-      return {
-        type: 'images',
-        images: await Promise.all(pdfContent.images.map((img) => arrayBufferToBase64(img.image).then((base64) => {
-          return {
-            data: base64,
-            type: 'image/png',
-          }
-        }))),
-        pageCount: pdfContent.pdfProxy.numPages,
-      } as const
-    }
-    else {
-      const pdfContent = await extractPdfText(pdfData.data)
-      return {
-        type: 'text',
-        textContent: pdfContent.textContent.text,
-        pageCount: pdfContent.pdfProxy.numPages,
-      } as const
+  private async extractPDFContent(pdfData: PDFAttachment['value']): Promise<PDFContentForModel> { // repackages fields already stored on the attachment; no PDF parsing or rendering happens here
+    return {
+      type: 'text', // always the text variant; this version has no image branch
+      textContent: pdfData.textContent, // copied through unchanged from the attachment
+      pageCount: pdfData.pageCount,
+      fileName: pdfData.name, // the attachment's `name` is exposed to prompts as `fileName`
     }
   }
 
   async ask(question: string) {
     using _s = this.statusScope('pending')
     const userConfig = await getUserConfig()
-    const currentModel = userConfig.llm.model.get()
     const abortController = new AbortController()
     this.abortControllers.push(abortController)
 
@@ -511,7 +493,7 @@ export class Chat {
     }
     const relevantTabIds = this.contextTabs.map((tab) => tab.tabId)
     const relevantImages = this.contextImages
-    const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(currentModel, this.contextPDFs[0]) : undefined
+    const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(this.contextPDFs[0]) : undefined
     if (this.contextPDFs.length > 1) log.warn('Multiple PDFs are attached, only the first one will be used for the chat context.')
     const pages = await getDocumentContentOfTabs(relevantTabIds)
     const prompt = await chatWithPageContent(question, pages.filter(nonNullable), onlineResults, relevantImages, relevantPDF)
diff --git a/types/chat.ts b/types/chat.ts
@@ -1,6 +1,5 @@
 import { PromiseOr } from './common'
 import { Base64ImageData } from './image'
-import { Base64PDFData } from './pdf'
 import { TabInfo } from './tab'
 
 export type ImageAttachment = {
@@ -14,10 +13,14 @@ export type ImageAttachment = {
 
 export type PDFAttachment = {
   type: 'pdf'
-  value: Base64PDFData & {
+  value: { // plain object; no longer intersected with Base64PDFData
     id: string
     name: string
-    size?: number
+    textContent: string // AttachmentSelector fills this from extractPdfText(file).textContent.text
+    pageCount: number // AttachmentSelector fills this from the PDF proxy's numPages
+    fileSize: number // AttachmentSelector fills this from File.size; replaces the old optional `size`
+    fileHash: string // AttachmentSelector fills this from hashFile(file), a SHA-256 hex digest
+    type: 'text' // only the text representation is modelled here
   }
 }
 
diff --git a/utils/hash.ts b/utils/hash.ts
@@ -0,0 +1,18 @@
+export async function sha256(message: string | ArrayBuffer): Promise<string> { // resolves to the SHA-256 digest of `message` as a lowercase hex string
+  let data: BufferSource
+  if (typeof message === 'string') {
+    data = new TextEncoder().encode(message) // strings are hashed as their UTF-8 bytes
+  }
+  else {
+    data = message // binary input is passed to the digest as-is
+  }
+  const buf = await crypto.subtle.digest('SHA-256', data) // Web Crypto digest; resolves to an ArrayBuffer
+  return [...new Uint8Array(buf)]
+    .map((b) => b.toString(16).padStart(2, '0')) // two hex digits per byte, zero-padded
+    .join('')
+}
+
+export async function hashFile(file: File | Blob) { // resolves to the SHA-256 hex digest of the file's bytes
+  const arrayBuffer = await file.arrayBuffer() // reads the whole file/blob into memory before hashing
+  return sha256(arrayBuffer)
+}
diff --git a/utils/llm/models.ts b/utils/llm/models.ts
@@ -101,7 +101,8 @@ export function parseErrorMessageFromChunk(error: unknown): string | null {
   return null
 }
 
-export function isModelSupportPDFToImages(model: string): boolean {
+export function isModelSupportPDFToImages(_model: string): boolean {
   // Currently only gemma3 models have the ability to understand PDF converted to images
-  return model.startsWith('gemma3:')
+  // but it's too slow to process a large number of images, so we disable this feature temporarily by returning false here
+  return false
 }
diff --git a/utils/prompts/index.ts b/utils/prompts/index.ts
@@ -82,7 +82,7 @@ Question: ${question}`.trim()
   return { user: new UserPrompt(user), system }
 })
 
-export const nextStep = definePrompt(async (messages: { role: 'user' | 'assistant' | string, content: string }[], pages: Page[]) => {
+export const nextStep = definePrompt(async (messages: { role: 'user' | 'assistant' | string, content: string }[], pages: Page[], pdfInfo?: PDFContentForModel) => {
   const system = renderPrompt`You are a helpful assistant. Based on the conversation below and the current web page content, suggest the next step to take. You can suggest one of the following options:
 
 1. search_online: ONLY if user requests the latest information or news that you don't already know. If you choose this option, you must also provide a list of search keywords.
@@ -112,13 +112,24 @@ ${new JSONBuilder({ action: 'chat' })}
     tabContextBuilder.insert(new TagBuilder('tab', { id: i + 1 }).insertContent(head, body))
   }
 
+  const pdfContextBuilder: TagBuilder = new TagBuilder('pdf_document')
+  if (pdfInfo?.type === 'text') {
+    const { fileName = '', pageCount } = pdfInfo
+    pdfContextBuilder.insert(new TagBuilder('title').insertContent(fileName))
+    pdfContextBuilder.insert(new TagBuilder('pages').insertContent(pageCount.toString()))
+  }
+  else {
+    // ignore pdf images for now
+  }
+
   const conversationContextBuilder = new TagBuilder('conversation')
   for (const message of messages) {
     conversationContextBuilder.insertContent(`${message.role}: ${message.content}`)
   }
 
   const user = renderPrompt`
 ${tabContextBuilder}
+${pdfContextBuilder}
 
 ${conversationContextBuilder}
 `.trim()

Original file line number	Diff line number	Diff line change
`@@ -101,7 +101,8 @@ export function parseErrorMessageFromChunk(error: unknown): string \| null {`
`101`	`101`	`return null`
`102`	`102`	`}`
`103`	`103`
`104`		`-export function isModelSupportPDFToImages(model: string): boolean {`
	`104`	`+export function isModelSupportPDFToImages(_model: string): boolean {`
`105`	`105`	`// Currently only gemma3 models have the ability to understand PDF converted to images`
`106`		`- return model.startsWith('gemma3:')`
	`106`	`+ // but it's too slow to process a large number of images, so we disable this feature temporarily by returning false here`
	`107`	`+ return false`
`107`	`108`	`}`