feat(pdf): Enhance PDF handling with new text extraction and attachment features

tonyhu-012 · tonyhu-012 · commit 0c8706ee4e52 · 2025-07-17T17:05:04.000+08:00
diff --git a/entrypoints/content/components/AttachmentSelector.vue b/entrypoints/content/components/AttachmentSelector.vue
@@ -196,6 +196,8 @@ import { useLogger } from '@/composables/useLogger'
 import { useTimeoutValue } from '@/composables/useTimeoutValue'
 import { AttachmentItem, ContextAttachment } from '@/types/chat'
 import { TabInfo } from '@/types/tab'
+import { nonNullable } from '@/utils/array'
+import { PdfTextFile } from '@/utils/file'
 import { hashFile } from '@/utils/hash'
 import { useI18n } from '@/utils/i18n'
 import { generateRandomId } from '@/utils/id'
@@ -237,13 +239,21 @@ const showErrorMessage = (message: string) => {
   })
 }
 
+const getTabIdOfAttachment = (attachment: ContextAttachment) => {
+  if (attachment.type === 'tab') return attachment.value.tabId
+  if (attachment.type === 'pdf' && attachment.value.source.type === 'tab') return attachment.value.source.tabId
+  return undefined
+}
+
 const selectorListContainer = ref<HTMLDivElement>()
 const tabsContainerRef = ref<HTMLDivElement>()
 
 const attachments = useVModel(props, 'attachments', emit)
 const allTabs = ref<TabInfo[]>([])
 
-const selectedTabs = computed(() => attachments.value.filter((attachment) => attachment.type === 'tab').map((attachment) => attachment.value))
+const selectedTabs = computed(() => attachments.value.map((attachment) => {
+  return getTabIdOfAttachment(attachment)
+}).filter(nonNullable))
 
 const selectFile = async () => {
   open()
@@ -252,7 +262,7 @@ const selectFile = async () => {
 const MAX_IMAGE_SIZE = ByteSize.fromMB(5).toBytes() // 5 MB
 const MAX_IMAGE_COUNT = 5 // Maximum number of images allowed
 const MAX_PDF_COUNT = 1 // Maximum number of PDFs allowed
-const MAX_PDF_SIZE = ByteSize.fromMB(15).toBytes() // 15 MB
+const MAX_PDF_SIZE = Infinity // No limit on PDF size
 const MAX_PDF_PAGE_COUNT = 50 // Maximum number of pages allowed in a PDF
 const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
   {
@@ -295,9 +305,8 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
   {
     selectorMimeTypes: ['application/pdf'] as const, // Supported MIME types for the file selector
     type: 'pdf',
-    matchMimeType: (mimeType) => mimeType === 'application/pdf',
+    matchMimeType: (mimeType) => mimeType === 'application/pdf' || mimeType === 'application/x-pdf-text',
     validateFile: async ({ count }, file: File) => {
-      const docProxy = await getDocumentProxy(file)
       if (count >= MAX_PDF_COUNT) {
         showErrorMessage(t('chat.input.attachment_selector.too_many_pdfs', { max: MAX_PDF_COUNT }))
         return false
@@ -306,25 +315,43 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
         showErrorMessage(t('chat.input.attachment_selector.pdf_oversize', { size: ByteSize.fromBytes(MAX_PDF_SIZE).format(0) }))
         return false
       }
-      const pageCount = await getPdfPageCount(docProxy)
+      let pageCount: number
+      if (file instanceof PdfTextFile) {
+        pageCount = file.pageCount
+      }
+      else {
+        const docProxy = await getDocumentProxy(file)
+        pageCount = await getPdfPageCount(docProxy)
+      }
       if (pageCount > MAX_PDF_PAGE_COUNT) {
-        showErrorMessage(t('chat.input.attachment_selector.pdf_page_count_exceeded', { max: MAX_PDF_PAGE_COUNT }))
-        return false
+        // show error but allow this file
+        showErrorMessage(t('chat.input.attachment_selector.only_load_partial_pages', { max: MAX_PDF_PAGE_COUNT }))
       }
       return true
     },
     convertFileToAttachment: async (file: File): Promise<ContextAttachment> => {
-      const pdfText = await extractPdfText(file)
+      let textContent: string
+      let pageCount: number
+      if (file instanceof PdfTextFile) {
+        textContent = (await file.textContent()).join('\n').replace(/\s+/g, ' ')
+        pageCount = file.pageCount
+      }
+      else {
+        const pdfText = await extractPdfText(file, { pageRange: [1, MAX_PDF_PAGE_COUNT] })
+        textContent = pdfText.mergedText
+        pageCount = pdfText.pdfProxy.numPages
+      }
       const info: ContextAttachment = {
         type: 'pdf',
         value: {
           type: 'text',
-          pageCount: pdfText.pdfProxy.numPages,
-          textContent: pdfText.textContent.text,
+          pageCount,
+          textContent,
           id: generateRandomId(),
-          fileSize: file.size,
           fileHash: await hashFile(file),
+          fileSize: file.size,
           name: file.name,
+          source: file instanceof PdfTextFile ? { type: 'tab', tabId: file.source as number } : { type: 'local-file' },
         },
       }
       logger.debug('extracted pdf content', info)
@@ -360,10 +387,6 @@ defineExpose({
   appendAttachmentsFromFiles,
 })
 
-const addTabsToAttachments = (tabs: (ContextAttachment & { type: 'tab' })[]) => {
-  attachments.value = [...tabs.toReversed(), ...attachments.value]
-}
-
 const userConfig = await getUserConfig()
 const currentModel = userConfig.llm.model.toRef()
 const endpointType = userConfig.llm.endpointType.toRef()
@@ -390,11 +413,11 @@ const checkCurrentModelSupportVision = async () => {
 }
 
 const unselectedTabs = computed(() => {
-  return allTabs.value.filter((tab) => !selectedTabs.value.some((selectedTab) => selectedTab.tabId === tab.tabId))
+  return allTabs.value.filter((tab) => !selectedTabs.value.some((selectedTab) => selectedTab === tab.tabId))
 })
 
 const isTabSelected = (tab: TabInfo) => {
-  return selectedTabs.value.some((selectedTab) => selectedTab.tabId === tab.tabId)
+  return selectedTabs.value.some((selectedTab) => selectedTab === tab.tabId)
 }
 
 const updateAllTabs = async () => {
@@ -411,16 +434,18 @@ const isAllTabSelected = computed(() => {
   return unselectedTabs.value.length === 0
 })
 
-const selectAllTabs = () => {
+const selectAllTabs = async () => {
   if (isAllTabSelected.value) {
-    attachments.value = attachments.value.filter((attachment) => attachment.type !== 'tab')
+    attachments.value = attachments.value.filter((attachment) => {
+      if (attachment.type === 'tab') return false
+      if (attachment.type === 'pdf' && attachment.value.source.type === 'tab') return false
+      return true
+    })
   }
   else {
-    const newTabs = unselectedTabs.value.map((tab) => ({
-      type: 'tab' as const,
-      value: tab,
-    }))
-    addTabsToAttachments(newTabs)
+    for (const tab of unselectedTabs.value) {
+      await appendTab(tab)
+    }
   }
 }
 
@@ -432,10 +457,13 @@ const showSelector = async () => {
   isShowSelector.value = true
 }
 
-const toggleSelectTab = (tab: TabInfo) => {
-  const index = attachments.value.findIndex((selectedTab) => selectedTab.type === 'tab' && selectedTab.value.tabId === tab.tabId)
-  if (index !== -1) {
-    attachments.value.splice(index, 1) // Remove the tab if it is already selected
+const appendTab = async (tab: TabInfo) => {
+  const pageContentType = await c2bRpc.getPageContentType(tab.tabId)
+  if (pageContentType === 'application/pdf') {
+    const pdfContent = await c2bRpc.getPagePDFContent(tab.tabId)
+    if (pdfContent) {
+      appendAttachmentsFromFiles([new PdfTextFile(pdfContent.fileName, pdfContent.texts, pdfContent.pageCount, tab.tabId)])
+    }
   }
   else {
     attachments.value.push({
@@ -445,6 +473,18 @@ const toggleSelectTab = (tab: TabInfo) => {
   }
 }
 
+const toggleSelectTab = async (tab: TabInfo) => {
+  const index = attachments.value.findIndex((attachment) => {
+    return getTabIdOfAttachment(attachment) === tab.tabId
+  })
+  if (index !== -1) {
+    attachments.value.splice(index, 1) // Remove the tab if it is already selected
+  }
+  else {
+    await appendTab(tab)
+  }
+}
+
 const hideSelector = () => {
   isShowSelector.value = false
 }
@@ -453,6 +493,17 @@ const removeAttachment = (attachment: ContextAttachment) => {
   attachments.value = attachments.value.filter((a) => a !== attachment)
 }
 
+const updateCurrentTabIfPDF = async () => {
+  const currentTab = await c2bRpc.getTabInfo()
+  if (attachments.value.length === 1 && attachments.value[0].type === 'tab') {
+    const pagePDFContent = await c2bRpc.getPagePDFContent(currentTab.tabId)
+    if (pagePDFContent) {
+      await addAttachmentsFromFiles([new PdfTextFile(pagePDFContent.fileName, pagePDFContent.texts, pagePDFContent.pageCount, currentTab.tabId)])
+      attachments.value.pop() // pop the original tab
+    }
+  }
+}
+
 useEventListener(window, 'click', (e: MouseEvent) => {
   const target = (e.composed ? e.composedPath()[0] : e.target) as HTMLElement
   if (!selectorListContainer.value?.contains(target)) {
@@ -474,6 +525,7 @@ useEventListener(tabsContainerRef, 'wheel', (e: WheelEvent) => {
 
 onMounted(() => {
   updateAllTabs()
+  updateCurrentTabIfPDF()
 })
 
 onBeforeUnmount(() => {
diff --git a/entrypoints/content/components/Main.vue b/entrypoints/content/components/Main.vue
@@ -134,7 +134,6 @@ const pinSidebar = userConfig.ui.pinSidebar.toRef()
 const onNewChat = async () => {
   chat.stop()
   chat.historyManager.clear()
-  await chat.resetContextTabs()
 }
 
 const showCloseButton = computed(() => {
diff --git a/entrypoints/content/index.tsx b/entrypoints/content/index.tsx
@@ -11,7 +11,7 @@ import RootProvider from './components/RootProvider.vue'
 import { createShadowRootOverlay } from './ui'
 
 export default defineContentScript({
-  matches: ['*://*/*'],
+  matches: ['<all_urls>'],
   cssInjectionMode: 'manual',
   runAt: 'document_start',
   async main(ctx) {
diff --git a/entrypoints/content/utils/tabs.ts b/entrypoints/content/utils/tabs.ts
@@ -26,7 +26,7 @@ export async function getDocumentContentOfTabs(tabIds: number[]) {
     log.error(`Failed to get content for tab ${tabId}, it might not be a valid HTML page or the tab is closed.`)
     return undefined
   })))
-  return contents
+  return contents.filter((tabContent) => tabContent?.type === 'html')
 }
 
 export async function getCurrentTabInfo(): Promise<TabInfo> {
diff --git a/locales/de.json b/locales/de.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "PDF-Dateigröße muss kleiner als {size} sein",
         "pdf_text_extract_error": "Textextraktion fehlgeschlagen - diese PDF könnte gescannt oder bildbasiert sein. Wir empfehlen dringend, zu Gemma3 zu wechseln, um bessere Ergebnisse zu erzielen.",
         "too_many_pdfs": "Maximal {max} PDF-Datei erlaubt",
-        "pdf_page_count_exceeded": "PDF muss weniger als {max} Seiten haben"
+        "pdf_page_count_exceeded": "PDF muss weniger als {max} Seiten haben",
+        "only_load_partial_pages": "Aufgrund der Dateigröße werden nur die ersten {max} Seiten geladen"
       }
     },
     "prompt": {
diff --git a/locales/en.json b/locales/en.json
@@ -106,7 +106,8 @@
         "pdf_page_count_exceeded": "PDF must be less than {max} pages",
         "pdf_oversize": "PDF file size must be less than {size}",
         "too_many_pdfs": "Maximum {max} PDF file allowed",
-        "pdf_text_extract_error": "Text extraction failed - this PDF may be scanned or image-based. We strongly recommend switching to Gemma3 for better results."
+        "pdf_text_extract_error": "Text extraction failed - this PDF may be scanned or image-based. We strongly recommend switching to Gemma3 for better results.",
+        "only_load_partial_pages": "Only the first {max} pages are loaded due to file size"
       }
     },
     "prompt": {
diff --git a/locales/es.json b/locales/es.json
@@ -15,7 +15,8 @@
         "pdf_oversize": "El tamaño del archivo PDF debe ser menor que {size}",
         "pdf_text_extract_error": "La extracción de texto falló - este PDF puede estar escaneado o basado en imágenes. Recomendamos encarecidamente cambiar a Gemma3 para obtener mejores resultados.",
         "too_many_pdfs": "Máximo {max} archivo PDF permitido",
-        "pdf_page_count_exceeded": "El PDF debe tener menos de {max} páginas"
+        "pdf_page_count_exceeded": "El PDF debe tener menos de {max} páginas",
+        "only_load_partial_pages": "Solo se cargan las primeras {max} páginas debido al tamaño del archivo"
       }
     },
     "messages": {
diff --git a/locales/fr.json b/locales/fr.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "La taille du fichier PDF doit être inférieure à {size}",
         "pdf_text_extract_error": "L'extraction de texte a échoué - ce PDF peut être numérisé ou basé sur des images. Nous recommandons fortement de passer à Gemma3 pour de meilleurs résultats.",
         "too_many_pdfs": "Maximum {max} fichier PDF autorisé",
-        "pdf_page_count_exceeded": "Le PDF doit contenir moins de {max} pages"
+        "pdf_page_count_exceeded": "Le PDF doit contenir moins de {max} pages",
+        "only_load_partial_pages": "Seules les {max} premières pages sont chargées en raison de la taille du fichier"
       }
     },
     "prompt": {
diff --git a/locales/id.json b/locales/id.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "Ukuran file PDF harus kurang dari {size}",
         "pdf_text_extract_error": "Ekstraksi teks gagal - PDF ini mungkin hasil scan atau berbasis gambar. Kami sangat merekomendasikan untuk beralih ke Gemma3 untuk hasil yang lebih baik.",
         "too_many_pdfs": "Maksimal {max} file PDF diizinkan",
-        "pdf_page_count_exceeded": "PDF harus kurang dari {max} halaman"
+        "pdf_page_count_exceeded": "PDF harus kurang dari {max} halaman",
+        "only_load_partial_pages": "Hanya {max} halaman pertama yang dimuat karena ukuran file"
       }
     },
     "prompt": {
diff --git a/locales/ja.json b/locales/ja.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "PDFファイルサイズは{size}未満である必要があります",
         "pdf_text_extract_error": "テキスト抽出に失敗しました - このPDFはスキャンされたものか画像ベースの可能性があります。より良い結果を得るためにGemma3への切り替えを強くお勧めします。",
         "too_many_pdfs": "最大{max}個のPDFファイルまで許可されています",
-        "pdf_page_count_exceeded": "PDFは{max}ページ以下である必要があります"
+        "pdf_page_count_exceeded": "PDFは{max}ページ以下である必要があります",
+        "only_load_partial_pages": "ファイルサイズのため、最初の{max}ページのみが読み込まれました"
       }
     },
     "prompt": {
diff --git a/locales/ko.json b/locales/ko.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "PDF 파일 크기는 {size}보다 작아야 합니다",
         "pdf_text_extract_error": "텍스트 추출 실패 - 이 PDF는 스캔되었거나 이미지 기반일 수 있습니다. 더 나은 결과를 위해 Gemma3로 전환하는 것을 강력히 권장합니다.",
         "too_many_pdfs": "최대 {max}개의 PDF 파일이 허용됩니다",
-        "pdf_page_count_exceeded": "PDF는 {max}페이지 미만이어야 합니다"
+        "pdf_page_count_exceeded": "PDF는 {max}페이지 미만이어야 합니다",
+        "only_load_partial_pages": "파일 크기로 인해 처음 {max}페이지만 로드됩니다"
       }
     },
     "prompt": {
diff --git a/locales/pt.json b/locales/pt.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "O tamanho do arquivo PDF deve ser menor que {size}",
         "pdf_text_extract_error": "A extração de texto falhou - este PDF pode estar digitalizado ou ser baseado em imagem. Recomendamos vivamente mudar para o Gemma3 para melhores resultados.",
         "too_many_pdfs": "Máximo de {max} arquivo PDF permitido",
-        "pdf_page_count_exceeded": "O PDF deve ter menos de {max} páginas"
+        "pdf_page_count_exceeded": "O PDF deve ter menos de {max} páginas",
+        "only_load_partial_pages": "Apenas as primeiras {max} páginas são carregadas devido ao tamanho do arquivo"
       }
     },
     "prompt": {
diff --git a/locales/ru.json b/locales/ru.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "Размер PDF-файла должен быть меньше {size}",
         "pdf_text_extract_error": "Извлечение текста не удалось - этот PDF может быть отсканированным или основанным на изображениях. Мы настоятельно рекомендуем переключиться на Gemma3 для получения лучших результатов.",
         "too_many_pdfs": "Максимально разрешено {max} PDF файлов",
-        "pdf_page_count_exceeded": "PDF должен содержать менее {max} страниц"
+        "pdf_page_count_exceeded": "PDF должен содержать менее {max} страниц",
+        "only_load_partial_pages": "Загружены только первые {max} страниц из-за размера файла"
       }
     },
     "prompt": {
diff --git a/locales/th.json b/locales/th.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "ขนาดไฟล์ PDF ต้องน้อยกว่า {size}",
         "pdf_text_extract_error": "การดึงข้อความล้มเหลว - PDF นี้อาจเป็นไฟล์สแกนหรือเป็นรูปภาพ เราขอแนะนำอย่างยิ่งให้เปลี่ยนไปใช้ Gemma3 เพื่อผลลัพธ์ที่ดีกว่า",
         "too_many_pdfs": "อนุญาตไฟล์ PDF สูงสุด {max} ไฟล์",
-        "pdf_page_count_exceeded": "PDF ต้องมีจำนวนหน้าไม่เกิน {max} หน้า"
+        "pdf_page_count_exceeded": "PDF ต้องมีจำนวนหน้าไม่เกิน {max} หน้า",
+        "only_load_partial_pages": "โหลดเฉพาะ {max} หน้าแรกเท่านั้นเนื่องจากขนาดไฟล์"
       }
     },
     "prompt": {
diff --git a/locales/vi.json b/locales/vi.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "Kích thước file PDF phải nhỏ hơn {size}",
         "pdf_text_extract_error": "Trích xuất văn bản thất bại - tệp PDF này có thể được quét hoặc dựa trên hình ảnh. Chúng tôi đặc biệt khuyến nghị chuyển sang Gemma3 để có kết quả tốt hơn.",
         "too_many_pdfs": "Tối đa {max} tệp PDF được phép",
-        "pdf_page_count_exceeded": "PDF phải có ít hơn {max} trang"
+        "pdf_page_count_exceeded": "PDF phải có ít hơn {max} trang",
+        "only_load_partial_pages": "Chỉ {max} trang đầu tiên được tải do kích thước tệp"
       }
     },
     "prompt": {
diff --git a/locales/zh-CN.json b/locales/zh-CN.json
@@ -106,7 +106,8 @@
         "pdf_oversize": "PDF文件大小必须小于{size}",
         "pdf_text_extract_error": "文本提取失败 - 此PDF可能是扫描版或基于图像的。我们强烈建议切换到Gemma3以获得更好的结果。",
         "too_many_pdfs": "最多允许 {max} 个 PDF 文件",
-        "pdf_page_count_exceeded": "PDF 页数不能超过 {max} 页"
+        "pdf_page_count_exceeded": "PDF 页数不能超过 {max} 页",
+        "only_load_partial_pages": "由于文件大小限制，仅加载前 {max} 页"
       }
     },
     "prompt": {
diff --git a/locales/zh-TW.json b/locales/zh-TW.json
@@ -24,7 +24,8 @@
         "pdf_oversize": "PDF 檔案大小必須小於 {size}",
         "pdf_text_extract_error": "文字提取失敗 - 此 PDF 可能是掃描檔或基於圖像的檔案。我們強烈建議切換到 Gemma3 以獲得更好的結果。",
         "too_many_pdfs": "最多允許 {max} 個 PDF 檔案",
-        "pdf_page_count_exceeded": "PDF 必須少於 {max} 頁"
+        "pdf_page_count_exceeded": "PDF 必須少於 {max} 頁",
+        "only_load_partial_pages": "由於檔案大小限制，僅載入前 {max} 頁"
       }
     },
     "prompt": {
diff --git a/types/chat.ts b/types/chat.ts
@@ -11,6 +11,13 @@ export type ImageAttachment = {
   }
 }
 
+type PDFAttachmentSource = {
+  type: 'local-file'
+} | {
+  type: 'tab'
+  tabId: number
+}
+
 export type PDFAttachment = {
   type: 'pdf'
   value: {
@@ -21,6 +28,7 @@ export type PDFAttachment = {
     fileSize: number
     fileHash: string
     type: 'text'
+    source: PDFAttachmentSource
   }
 }
 
diff --git a/utils/constants.ts b/utils/constants.ts
diff --git a/utils/file.ts b/utils/file.ts
diff --git a/utils/pdf/index.ts b/utils/pdf/index.ts
diff --git a/utils/rpc/background-fns.ts b/utils/rpc/background-fns.ts
diff --git a/utils/rpc/content-fns.ts b/utils/rpc/content-fns.ts

Original file line number	Diff line number	Diff line change
`@@ -134,7 +134,6 @@ const pinSidebar = userConfig.ui.pinSidebar.toRef()`
`134`	`134`	`const onNewChat = async () => {`
`135`	`135`	`chat.stop()`
`136`	`136`	`chat.historyManager.clear()`
`137`		`- await chat.resetContextTabs()`
`138`	`137`	`}`
`139`	`138`
`140`	`139`	`const showCloseButton = computed(() => {`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ export async function getDocumentContentOfTabs(tabIds: number[]) {`
`26`	`26`	``log.error(`Failed to get content for tab ${tabId}, it might not be a valid HTML page or the tab is closed.`)``
`27`	`27`	`return undefined`
`28`	`28`	`})))`
`29`		`- return contents`
	`29`	`+ return contents.filter((tabContent) => tabContent?.type === 'html')`
`30`	`30`	`}`
`31`	`31`
`32`	`32`	`export async function getCurrentTabInfo(): Promise<TabInfo> {`