Skip to content

Commit 3f2a643

Browse files
committed
feat(pdf): Refactor PDF handling to extract text content and improve attachment structure
1 parent 4a26c6f commit 3f2a643

File tree

6 files changed

+64
-50
lines changed

6 files changed

+64
-50
lines changed

entrypoints/content/components/AttachmentSelector.vue

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -192,22 +192,23 @@ import Tag from '@/components/Tag.vue'
192192
import Button from '@/components/ui/Button.vue'
193193
import Divider from '@/components/ui/Divider.vue'
194194
import Text from '@/components/ui/Text.vue'
195+
import { useLogger } from '@/composables/useLogger'
195196
import { useTimeoutValue } from '@/composables/useTimeoutValue'
196197
import { AttachmentItem, ContextAttachment } from '@/types/chat'
197198
import { TabInfo } from '@/types/tab'
198-
import { fileToBase64 } from '@/utils/base64'
199+
import { hashFile } from '@/utils/hash'
199200
import { useI18n } from '@/utils/i18n'
200201
import { generateRandomId } from '@/utils/id'
201202
import { convertImageFileToJpegBase64 } from '@/utils/image'
202-
import { isModelSupportPDFToImages } from '@/utils/llm/models'
203-
import { checkReadablePdf, getDocumentProxy, getPdfPageCount } from '@/utils/pdf'
203+
import { extractPdfText, getDocumentProxy, getPdfPageCount } from '@/utils/pdf'
204204
import { c2bRpc, registerContentScriptRpcEvent } from '@/utils/rpc'
205205
import { ByteSize } from '@/utils/sizes'
206206
import { getUserConfig } from '@/utils/user-config'
207207
208208
import { getValidTabs } from '../utils/tabs'
209209
import ExternalImage from './ExternalImage.vue'
210210
211+
const logger = useLogger()
211212
const { t } = useI18n()
212213
213214
const cleanUpTabUpdatedListener = registerContentScriptRpcEvent('tabUpdated', async () => {
@@ -297,13 +298,6 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
297298
matchMimeType: (mimeType) => mimeType === 'application/pdf',
298299
validateFile: async ({ count }, file: File) => {
299300
const docProxy = await getDocumentProxy(file)
300-
if (!currentModel.value || !isModelSupportPDFToImages(currentModel.value)) {
301-
const readable = await checkReadablePdf(docProxy)
302-
if (!readable) {
303-
showErrorMessage(t('chat.input.attachment_selector.pdf_text_extract_error'))
304-
return false
305-
}
306-
}
307301
if (count >= MAX_PDF_COUNT) {
308302
showErrorMessage(t('chat.input.attachment_selector.too_many_pdfs', { max: MAX_PDF_COUNT }))
309303
return false
@@ -320,16 +314,21 @@ const SUPPORTED_ATTACHMENT_TYPES: AttachmentItem[] = [
320314
return true
321315
},
322316
convertFileToAttachment: async (file: File): Promise<ContextAttachment> => {
323-
const base64Data = await fileToBase64(file)
324-
return {
317+
const pdfText = await extractPdfText(file)
318+
const info: ContextAttachment = {
325319
type: 'pdf',
326320
value: {
327-
data: base64Data,
321+
type: 'text',
322+
pageCount: pdfText.pdfProxy.numPages,
323+
textContent: pdfText.textContent.text,
328324
id: generateRandomId(),
329-
size: file.size,
325+
fileSize: file.size,
326+
fileHash: await hashFile(file),
330327
name: file.name,
331328
},
332329
}
330+
logger.debug('extracted pdf content', info)
331+
return info
333332
},
334333
},
335334
]

entrypoints/content/utils/chat/chat.ts

Lines changed: 12 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,13 @@ import { CoreMessage } from 'ai'
22
import EventEmitter from 'events'
33
import { type Ref, ref } from 'vue'
44

5-
import { ContextAttachment } from '@/types/chat'
6-
import { Base64PDFData, PDFContentForModel } from '@/types/pdf'
5+
import { ContextAttachment, PDFAttachment } from '@/types/chat'
6+
import { PDFContentForModel } from '@/types/pdf'
77
import { nonNullable } from '@/utils/array'
8-
import { arrayBufferToBase64 } from '@/utils/base64'
98
import { parseDocument } from '@/utils/document-parser'
109
import { AbortError, AppError } from '@/utils/error'
1110
import { useGlobalI18n } from '@/utils/i18n'
12-
import { isModelSupportPDFToImages } from '@/utils/llm/models'
1311
import logger from '@/utils/logger'
14-
import { extractPdfText, renderPdfPagesAsImages } from '@/utils/pdf'
1512
import { chatWithPageContent, generateSearchKeywords, nextStep, Page, summarizeWithPageContent } from '@/utils/prompts'
1613
import { UserPrompt } from '@/utils/prompts/helpers'
1714
import { SearchingMessage } from '@/utils/search'
@@ -319,9 +316,11 @@ export class Chat {
319316
async checkNextStep(contextMsgs: { role: 'user' | 'assistant', content: string }[]) {
320317
log.debug('checkNextStep', contextMsgs)
321318
const relevantTabIds = this.contextTabs.map((tab) => tab.tabId)
319+
const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(this.contextPDFs[0]) : undefined
320+
if (this.contextPDFs.length > 1) log.warn('Multiple PDFs are attached, only the first one will be used for the chat context.')
322321
const pages = await getDocumentContentOfTabs(relevantTabIds)
323322
const abortController = this.createAbortController()
324-
const prompt = await nextStep(contextMsgs, pages.filter(nonNullable))
323+
const prompt = await nextStep(contextMsgs, pages.filter(nonNullable), relevantPDF)
325324
const next = await generateObjectInBackground({
326325
schema: 'nextStep',
327326
prompt: prompt.user.extractText(),
@@ -452,35 +451,18 @@ export class Chat {
452451
return await this.sendMessage(prompt.user, prompt.system, { autoDeleteEmptyResponseMsg: false })
453452
}
454453

455-
private async extractPDFContent(model: string | undefined, pdfData: Base64PDFData): Promise<PDFContentForModel> {
456-
// use vision model for better performance if available
457-
if (model && isModelSupportPDFToImages(model)) {
458-
const pdfContent = (await renderPdfPagesAsImages(pdfData.data))
459-
return {
460-
type: 'images',
461-
images: await Promise.all(pdfContent.images.map((img) => arrayBufferToBase64(img.image).then((base64) => {
462-
return {
463-
data: base64,
464-
type: 'image/png',
465-
}
466-
}))),
467-
pageCount: pdfContent.pdfProxy.numPages,
468-
} as const
469-
}
470-
else {
471-
const pdfContent = await extractPdfText(pdfData.data)
472-
return {
473-
type: 'text',
474-
textContent: pdfContent.textContent.text,
475-
pageCount: pdfContent.pdfProxy.numPages,
476-
} as const
454+
private async extractPDFContent(pdfData: PDFAttachment['value']): Promise<PDFContentForModel> {
455+
return {
456+
type: 'text',
457+
textContent: pdfData.textContent,
458+
pageCount: pdfData.pageCount,
459+
fileName: pdfData.name,
477460
}
478461
}
479462

480463
async ask(question: string) {
481464
using _s = this.statusScope('pending')
482465
const userConfig = await getUserConfig()
483-
const currentModel = userConfig.llm.model.get()
484466
const abortController = new AbortController()
485467
this.abortControllers.push(abortController)
486468

@@ -511,7 +493,7 @@ export class Chat {
511493
}
512494
const relevantTabIds = this.contextTabs.map((tab) => tab.tabId)
513495
const relevantImages = this.contextImages
514-
const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(currentModel, this.contextPDFs[0]) : undefined
496+
const relevantPDF = this.contextPDFs?.[0] ? await this.extractPDFContent(this.contextPDFs[0]) : undefined
515497
if (this.contextPDFs.length > 1) log.warn('Multiple PDFs are attached, only the first one will be used for the chat context.')
516498
const pages = await getDocumentContentOfTabs(relevantTabIds)
517499
const prompt = await chatWithPageContent(question, pages.filter(nonNullable), onlineResults, relevantImages, relevantPDF)

types/chat.ts

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import { PromiseOr } from './common'
22
import { Base64ImageData } from './image'
3-
import { Base64PDFData } from './pdf'
43
import { TabInfo } from './tab'
54

65
export type ImageAttachment = {
@@ -14,10 +13,14 @@ export type ImageAttachment = {
1413

1514
export type PDFAttachment = {
1615
type: 'pdf'
17-
value: Base64PDFData & {
16+
value: {
1817
id: string
1918
name: string
20-
size?: number
19+
textContent: string
20+
pageCount: number
21+
fileSize: number
22+
fileHash: string
23+
type: 'text'
2124
}
2225
}
2326

utils/hash.ts

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
/**
 * Computes the SHA-256 digest of the input and returns it as a lowercase hex string.
 *
 * @param message - A string (encoded with `TextEncoder` before hashing) or an
 *   `ArrayBuffer`, which is hashed as-is.
 * @returns The digest rendered as two hex characters per byte.
 */
export async function sha256(message: string | ArrayBuffer): Promise<string> {
  // crypto.subtle.digest only accepts binary input, so strings are encoded first.
  const data: BufferSource = typeof message === 'string'
    ? new TextEncoder().encode(message)
    : message
  const digest = await crypto.subtle.digest('SHA-256', data)
  // Zero-pad each byte so values below 0x10 still occupy two characters.
  return Array.from(new Uint8Array(digest), (byte) => byte.toString(16).padStart(2, '0')).join('')
}
14+
15+
/**
 * Reads the full contents of a `File` or `Blob` and returns their SHA-256 digest
 * as produced by `sha256`.
 *
 * @param file - The file or blob whose bytes are hashed.
 * @returns The value returned by `sha256` for the file's bytes.
 */
export async function hashFile(file: File | Blob): Promise<string> {
  // The whole payload is loaded into memory before hashing.
  const arrayBuffer = await file.arrayBuffer()
  return sha256(arrayBuffer)
}

utils/llm/models.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,8 @@ export function parseErrorMessageFromChunk(error: unknown): string | null {
101101
return null
102102
}
103103

104-
export function isModelSupportPDFToImages(model: string): boolean {
104+
export function isModelSupportPDFToImages(_model: string): boolean {
105105
// Currently only gemma3 models have the ability to understand PDF converted to images
106-
return model.startsWith('gemma3:')
106+
// but processing a large number of images is too slow, so this feature is temporarily disabled by always returning false here
107+
return false
107108
}

utils/prompts/index.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ Question: ${question}`.trim()
8282
return { user: new UserPrompt(user), system }
8383
})
8484

85-
export const nextStep = definePrompt(async (messages: { role: 'user' | 'assistant' | string, content: string }[], pages: Page[]) => {
85+
export const nextStep = definePrompt(async (messages: { role: 'user' | 'assistant' | string, content: string }[], pages: Page[], pdfInfo?: PDFContentForModel) => {
8686
const system = renderPrompt`You are a helpful assistant. Based on the conversation below and the current web page content, suggest the next step to take. You can suggest one of the following options:
8787
8888
1. search_online: ONLY if user requests the latest information or news that you don't already know. If you choose this option, you must also provide a list of search keywords.
@@ -112,13 +112,24 @@ ${new JSONBuilder({ action: 'chat' })}
112112
tabContextBuilder.insert(new TagBuilder('tab', { id: i + 1 }).insertContent(head, body))
113113
}
114114

115+
const pdfContextBuilder: TagBuilder = new TagBuilder('pdf_document')
116+
if (pdfInfo?.type === 'text') {
117+
const { fileName = '', pageCount } = pdfInfo
118+
pdfContextBuilder.insert(new TagBuilder('title').insertContent(fileName))
119+
pdfContextBuilder.insert(new TagBuilder('pages').insertContent(pageCount.toString()))
120+
}
121+
else {
122+
// ignore pdf images for now
123+
}
124+
115125
const conversationContextBuilder = new TagBuilder('conversation')
116126
for (const message of messages) {
117127
conversationContextBuilder.insertContent(`${message.role}: ${message.content}`)
118128
}
119129

120130
const user = renderPrompt`
121131
${tabContextBuilder}
132+
${pdfContextBuilder}
122133
123134
${conversationContextBuilder}
124135
`.trim()

0 commit comments

Comments
 (0)