Skip to content

Commit 0c8706e

Browse files
committed
feat(pdf): Enhance PDF handling with new text extraction and attachment features
1 parent aa8d009 commit 0c8706e

File tree

23 files changed

+223
-49
lines changed

23 files changed

+223
-49
lines changed

entrypoints/content/components/AttachmentSelector.vue

Lines changed: 80 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,8 @@ import { useLogger } from '@/composables/useLogger'
196196
import { useTimeoutValue } from '@/composables/useTimeoutValue'
197197
import { AttachmentItem, ContextAttachment } from '@/types/chat'
198198
import { TabInfo } from '@/types/tab'
199+
import { nonNullable } from '@/utils/array'
200+
import { PdfTextFile } from '@/utils/file'
199201
import { hashFile } from '@/utils/hash'
200202
import { useI18n } from '@/utils/i18n'
201203
import { generateRandomId } from '@/utils/id'
@@ -237,13 +239,21 @@ const showErrorMessage = (message: string) => {
237239
})
238240
}
239241
// Resolve the browser tab id that an attachment is tied to, or `undefined` when it has none.
//  - a 'tab' attachment yields `value.tabId`
//  - a 'pdf' attachment yields `value.source.tabId`, but only when `value.source.type` is 'tab'
//  - every other attachment falls through to `undefined`
242+
const getTabIdOfAttachment = (attachment: ContextAttachment) => {
243+
if (attachment.type === 'tab') return attachment.value.tabId
244+
if (attachment.type === 'pdf' && attachment.value.source.type === 'tab') return attachment.value.source.tabId
245+
return undefined
246+
}
247+
240248
const selectorListContainer = ref<HTMLDivElement>()
241249
const tabsContainerRef = ref<HTMLDivElement>()
242250
243251
const attachments = useVModel(props, 'attachments', emit)
244252
const allTabs = ref<TabInfo[]>([])
245253
246-
const selectedTabs = computed(() => attachments.value.filter((attachment) => attachment.type === 'tab').map((attachment) => attachment.value))
254+
const selectedTabs = computed(() => attachments.value.map((attachment) => {
255+
return getTabIdOfAttachment(attachment)
256+
}).filter(nonNullable))
247257
248258
const selectFile = async () => {
249259
open()
@@ -252,7 +262,7 @@ const selectFile = async () => {
252262
const MAX_IMAGE_SIZE = ByteSize.fromMB(5).toBytes() // 5 MB
253263
const MAX_IMAGE_COUNT = 5 // Maximum number of images allowed
254264
const MAX_PDF_COUNT = 1 // Maximum number of PDFs allowed
255-
const MAX_PDF_SIZE = ByteSize.fromMB(15).toBytes() // 15 MB
265+
const MAX_PDF_SIZE = Infinity // No limit on PDF size
256266
const MAX_PDF_PAGE_COUNT = 50 // Maximum number of pages allowed in a PDF
257267
const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
258268
{
@@ -295,9 +305,8 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
295305
{
296306
selectorMimeTypes: ['application/pdf'] as const, // Supported MIME types for the file selector
297307
type: 'pdf',
298-
matchMimeType: (mimeType) => mimeType === 'application/pdf',
308+
matchMimeType: (mimeType) => mimeType === 'application/pdf' || mimeType === 'application/x-pdf-text',
299309
validateFile: async ({ count }, file: File) => {
300-
const docProxy = await getDocumentProxy(file)
301310
if (count >= MAX_PDF_COUNT) {
302311
showErrorMessage(t('chat.input.attachment_selector.too_many_pdfs', { max: MAX_PDF_COUNT }))
303312
return false
@@ -306,25 +315,43 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
306315
showErrorMessage(t('chat.input.attachment_selector.pdf_oversize', { size: ByteSize.fromBytes(MAX_PDF_SIZE).format(0) }))
307316
return false
308317
}
309-
const pageCount = await getPdfPageCount(docProxy)
318+
let pageCount: number
319+
if (file instanceof PdfTextFile) {
320+
pageCount = file.pageCount
321+
}
322+
else {
323+
const docProxy = await getDocumentProxy(file)
324+
pageCount = await getPdfPageCount(docProxy)
325+
}
310326
if (pageCount > MAX_PDF_PAGE_COUNT) {
311-
showErrorMessage(t('chat.input.attachment_selector.pdf_page_count_exceeded', { max: MAX_PDF_PAGE_COUNT }))
312-
return false
327+
// show error but allow this file
328+
showErrorMessage(t('chat.input.attachment_selector.only_load_partial_pages', { max: MAX_PDF_PAGE_COUNT }))
313329
}
314330
return true
315331
},
316332
convertFileToAttachment: async (file: File): Promise<ContextAttachment> => {
317-
const pdfText = await extractPdfText(file)
333+
let textContent: string
334+
let pageCount: number
335+
if (file instanceof PdfTextFile) {
336+
textContent = (await file.textContent()).join('\n').replace(/\s+/g, ' ')
337+
pageCount = file.pageCount
338+
}
339+
else {
340+
const pdfText = await extractPdfText(file, { pageRange: [1, MAX_PDF_PAGE_COUNT] })
341+
textContent = pdfText.mergedText
342+
pageCount = pdfText.pdfProxy.numPages
343+
}
318344
const info: ContextAttachment = {
319345
type: 'pdf',
320346
value: {
321347
type: 'text',
322-
pageCount: pdfText.pdfProxy.numPages,
323-
textContent: pdfText.textContent.text,
348+
pageCount,
349+
textContent,
324350
id: generateRandomId(),
325-
fileSize: file.size,
326351
fileHash: await hashFile(file),
352+
fileSize: file.size,
327353
name: file.name,
354+
source: file instanceof PdfTextFile ? { type: 'tab', tabId: file.source as number } : { type: 'local-file' },
328355
},
329356
}
330357
logger.debug('extracted pdf content', info)
@@ -360,10 +387,6 @@ defineExpose({
360387
appendAttachmentsFromFiles,
361388
})
362389
363-
const addTabsToAttachments = (tabs: (ContextAttachment & { type: 'tab' })[]) => {
364-
attachments.value = [...tabs.toReversed(), ...attachments.value]
365-
}
366-
367390
const userConfig = await getUserConfig()
368391
const currentModel = userConfig.llm.model.toRef()
369392
const endpointType = userConfig.llm.endpointType.toRef()
@@ -390,11 +413,11 @@ const checkCurrentModelSupportVision = async () => {
390413
}
391414
392415
const unselectedTabs = computed(() => {
393-
return allTabs.value.filter((tab) => !selectedTabs.value.some((selectedTab) => selectedTab.tabId === tab.tabId))
416+
return allTabs.value.filter((tab) => !selectedTabs.value.some((selectedTab) => selectedTab === tab.tabId))
394417
})
395418
396419
const isTabSelected = (tab: TabInfo) => {
397-
return selectedTabs.value.some((selectedTab) => selectedTab.tabId === tab.tabId)
420+
return selectedTabs.value.some((selectedTab) => selectedTab === tab.tabId)
398421
}
399422
400423
const updateAllTabs = async () => {
@@ -411,16 +434,18 @@ const isAllTabSelected = computed(() => {
411434
return unselectedTabs.value.length === 0
412435
})
413436
414-
const selectAllTabs = () => {
437+
const selectAllTabs = async () => {
415438
if (isAllTabSelected.value) {
416-
attachments.value = attachments.value.filter((attachment) => attachment.type !== 'tab')
439+
attachments.value = attachments.value.filter((attachment) => {
440+
if (attachment.type === 'tab') return false
441+
if (attachment.type === 'pdf' && attachment.value.source.type === 'tab') return false
442+
return true
443+
})
417444
}
418445
else {
419-
const newTabs = unselectedTabs.value.map((tab) => ({
420-
type: 'tab' as const,
421-
value: tab,
422-
}))
423-
addTabsToAttachments(newTabs)
446+
for (const tab of unselectedTabs.value) {
447+
await appendTab(tab)
448+
}
424449
}
425450
}
426451
@@ -432,10 +457,13 @@ const showSelector = async () => {
432457
isShowSelector.value = true
433458
}
434459
435-
const toggleSelectTab = (tab: TabInfo) => {
436-
const index = attachments.value.findIndex((selectedTab) => selectedTab.type === 'tab' && selectedTab.value.tabId === tab.tabId)
437-
if (index !== -1) {
438-
attachments.value.splice(index, 1) // Remove the tab if it is already selected
460+
const appendTab = async (tab: TabInfo) => {
461+
const pageContentType = await c2bRpc.getPageContentType(tab.tabId)
462+
if (pageContentType === 'application/pdf') {
463+
const pdfContent = await c2bRpc.getPagePDFContent(tab.tabId)
464+
if (pdfContent) {
465+
appendAttachmentsFromFiles([new PdfTextFile(pdfContent.fileName, pdfContent.texts, pdfContent.pageCount, tab.tabId)])
466+
}
439467
}
440468
else {
441469
attachments.value.push({
@@ -445,6 +473,18 @@ const toggleSelectTab = (tab: TabInfo) => {
445473
}
446474
}
447475
// Toggle a tab in the attachment list.
// If some attachment already resolves (via getTabIdOfAttachment) to `tab.tabId`, the first such
// attachment is removed in place; otherwise the tab is added by awaiting `appendTab(tab)`.
476+
const toggleSelectTab = async (tab: TabInfo) => {
// Locate the first attachment associated with this tab id (-1 when none is).
477+
const index = attachments.value.findIndex((attachment) => {
478+
return getTabIdOfAttachment(attachment) === tab.tabId
479+
})
480+
if (index !== -1) {
481+
attachments.value.splice(index, 1) // Remove the tab if it is already selected
482+
}
483+
else {
// Not selected yet: delegate the actual insertion to appendTab.
484+
await appendTab(tab)
485+
}
486+
}
487+
448488
const hideSelector = () => {
449489
isShowSelector.value = false
450490
}
@@ -453,6 +493,17 @@ const removeAttachment = (attachment: ContextAttachment) => {
453493
attachments.value = attachments.value.filter((a) => a !== attachment)
454494
}
455495
// When the attachment list holds exactly one entry and that entry is a 'tab' attachment,
// ask the background for the current tab's PDF content and, if any is returned, add it as a
// PdfTextFile attachment and then drop one attachment with pop().
// NOTE(review): the guard does not check that the single 'tab' attachment refers to the
// current tab — presumably it is the auto-attached current tab; confirm against the caller.
496+
const updateCurrentTabIfPDF = async () => {
497+
const currentTab = await c2bRpc.getTabInfo()
498+
if (attachments.value.length === 1 && attachments.value[0].type === 'tab') {
// A falsy result is treated as "nothing to replace" and the tab attachment is left untouched.
499+
const pagePDFContent = await c2bRpc.getPagePDFContent(currentTab.tabId)
500+
if (pagePDFContent) {
// NOTE(review): `addAttachmentsFromFiles` is not defined in the visible part of this file
// (other call sites shown use `appendAttachmentsFromFiles`) — confirm it exists and where it inserts.
501+
await addAttachmentsFromFiles([new PdfTextFile(pagePDFContent.fileName, pagePDFContent.texts, pagePDFContent.pageCount, currentTab.tabId)])
// NOTE(review): pop() removes the LAST element; this is the original tab only if the call above
// placed the new PDF attachment before it (and nothing else was added meanwhile) — TODO confirm.
502+
attachments.value.pop() // pop the original tab
503+
}
504+
}
505+
}
506+
456507
useEventListener(window, 'click', (e: MouseEvent) => {
457508
const target = (e.composed ? e.composedPath()[0] : e.target) as HTMLElement
458509
if (!selectorListContainer.value?.contains(target)) {
@@ -474,6 +525,7 @@ useEventListener(tabsContainerRef, 'wheel', (e: WheelEvent) => {
474525
475526
onMounted(() => {
476527
updateAllTabs()
528+
updateCurrentTabIfPDF()
477529
})
478530
479531
onBeforeUnmount(() => {

entrypoints/content/components/Main.vue

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,6 @@ const pinSidebar = userConfig.ui.pinSidebar.toRef()
134134
const onNewChat = async () => {
135135
chat.stop()
136136
chat.historyManager.clear()
137-
await chat.resetContextTabs()
138137
}
139138
140139
const showCloseButton = computed(() => {

entrypoints/content/index.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import RootProvider from './components/RootProvider.vue'
1111
import { createShadowRootOverlay } from './ui'
1212

1313
export default defineContentScript({
14-
matches: ['*://*/*'],
14+
matches: ['<all_urls>'],
1515
cssInjectionMode: 'manual',
1616
runAt: 'document_start',
1717
async main(ctx) {

entrypoints/content/utils/tabs.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export async function getDocumentContentOfTabs(tabIds: number[]) {
2626
log.error(`Failed to get content for tab ${tabId}, it might not be a valid HTML page or the tab is closed.`)
2727
return undefined
2828
})))
29-
return contents
29+
return contents.filter((tabContent) => tabContent?.type === 'html')
3030
}
3131

3232
export async function getCurrentTabInfo(): Promise<TabInfo> {

locales/de.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"pdf_oversize": "PDF-Dateigröße muss kleiner als {size} sein",
2525
"pdf_text_extract_error": "Textextraktion fehlgeschlagen - diese PDF könnte gescannt oder bildbasiert sein. Wir empfehlen dringend, zu Gemma3 zu wechseln, um bessere Ergebnisse zu erzielen.",
2626
"too_many_pdfs": "Maximal {max} PDF-Datei erlaubt",
27-
"pdf_page_count_exceeded": "PDF muss weniger als {max} Seiten haben"
27+
"pdf_page_count_exceeded": "PDF muss weniger als {max} Seiten haben",
28+
"only_load_partial_pages": "Aufgrund der Dateigröße werden nur die ersten {max} Seiten geladen"
2829
}
2930
},
3031
"prompt": {

locales/en.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,8 @@
106106
"pdf_page_count_exceeded": "PDF must be less than {max} pages",
107107
"pdf_oversize": "PDF file size must be less than {size}",
108108
"too_many_pdfs": "Maximum {max} PDF file allowed",
109-
"pdf_text_extract_error": "Text extraction failed - this PDF may be scanned or image-based. We strongly recommend switching to Gemma3 for better results."
109+
"pdf_text_extract_error": "Text extraction failed - this PDF may be scanned or image-based. We strongly recommend switching to Gemma3 for better results.",
110+
"only_load_partial_pages": "Only the first {max} pages are loaded due to file size"
110111
}
111112
},
112113
"prompt": {

locales/es.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
"pdf_oversize": "El tamaño del archivo PDF debe ser menor que {size}",
1616
"pdf_text_extract_error": "La extracción de texto falló - este PDF puede estar escaneado o basado en imágenes. Recomendamos encarecidamente cambiar a Gemma3 para obtener mejores resultados.",
1717
"too_many_pdfs": "Máximo {max} archivo PDF permitido",
18-
"pdf_page_count_exceeded": "El PDF debe tener menos de {max} páginas"
18+
"pdf_page_count_exceeded": "El PDF debe tener menos de {max} páginas",
19+
"only_load_partial_pages": "Solo se cargan las primeras {max} páginas debido al tamaño del archivo"
1920
}
2021
},
2122
"messages": {

locales/fr.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"pdf_oversize": "La taille du fichier PDF doit être inférieure à {size}",
2525
"pdf_text_extract_error": "L'extraction de texte a échoué - ce PDF peut être numérisé ou basé sur des images. Nous recommandons fortement de passer à Gemma3 pour de meilleurs résultats.",
2626
"too_many_pdfs": "Maximum {max} fichier PDF autorisé",
27-
"pdf_page_count_exceeded": "Le PDF doit contenir moins de {max} pages"
27+
"pdf_page_count_exceeded": "Le PDF doit contenir moins de {max} pages",
28+
"only_load_partial_pages": "Seules les {max} premières pages sont chargées en raison de la taille du fichier"
2829
}
2930
},
3031
"prompt": {

locales/id.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"pdf_oversize": "Ukuran file PDF harus kurang dari {size}",
2525
"pdf_text_extract_error": "Ekstraksi teks gagal - PDF ini mungkin hasil scan atau berbasis gambar. Kami sangat merekomendasikan untuk beralih ke Gemma3 untuk hasil yang lebih baik.",
2626
"too_many_pdfs": "Maksimal {max} file PDF diizinkan",
27-
"pdf_page_count_exceeded": "PDF harus kurang dari {max} halaman"
27+
"pdf_page_count_exceeded": "PDF harus kurang dari {max} halaman",
28+
"only_load_partial_pages": "Hanya {max} halaman pertama yang dimuat karena ukuran file"
2829
}
2930
},
3031
"prompt": {

locales/ja.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@
2424
"pdf_oversize": "PDFファイルサイズは{size}未満である必要があります",
2525
"pdf_text_extract_error": "テキスト抽出に失敗しました - このPDFはスキャンされたものか画像ベースの可能性があります。より良い結果を得るためにGemma3への切り替えを強くお勧めします。",
2626
"too_many_pdfs": "最大{max}個のPDFファイルまで許可されています",
27-
"pdf_page_count_exceeded": "PDFは{max}ページ以下である必要があります"
27+
"pdf_page_count_exceeded": "PDFは{max}ページ以下である必要があります",
28+
"only_load_partial_pages": "ファイルサイズのため、最初の{max}ページのみが読み込まれました"
2829
}
2930
},
3031
"prompt": {

0 commit comments

Comments
 (0)