In [5]:
import kagglehub
# Download the Kaggle dog-breeds dataset through kagglehub. `path` holds the
# value returned by dataset_download and is used to locate the CSV files.
dataset_handle = "sujaykapadnis/dog-breeds"
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

Path to dataset files: D:\AppData\kaggle\datasets\sujaykapadnis\dog-breeds\versions\2


In [10]:
import pandas as pd
import os

# Read the four CSV tables of the dataset from the download directory `path`.
breed_traits, breed_rank, breed_traits_long, trait_description = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in (
        "breed_traits.csv",
        "breed_rank.csv",
        "breed_traits_long.csv",
        "trait_description.csv",
    )
)

In [None]:
import requests
from pathlib import Path

# Build the list of PDF URLs to fetch, one per entry of breed_rank["links"].

links = breed_rank["links"].tolist()
pdf_base_url = "https://images.akc.org/pdf/breeds/standards/"
breed_names = []
pdf_urls = []
for link in links:
    # The breed slug is the last path segment of the link,
    # e.g. ".../golden-retriever/" -> "golden-retriever".
    breed_name = link.rstrip("/").split("/")[-1]
    breed_names.append(breed_name)
    # The PDF file name is the slug with each word capitalised and the
    # hyphens removed: "golden-retriever" -> "GoldenRetriever".
    pdf_name = "".join(word.capitalize() for word in breed_name.split("-"))
    # URLs always use "/" separators, so concatenate directly instead of
    # using os.path.join, whose separator depends on the operating system.
    pdf_urls.append(f"{pdf_base_url}{pdf_name}.pdf")

def download_breed_pdf(pdf_url, breed_name, timeout=30, verify=True):
    """Download one PDF and store it as ``breed_pdfs/<breed_name>.pdf``.

    Args:
        pdf_url: URL of the PDF to fetch.
        breed_name: File name (without extension) used inside ``breed_pdfs``.
        timeout: Seconds forwarded to ``requests.get`` so that a stalled
            server cannot block the notebook indefinitely.
        verify: Forwarded to ``requests.get``. TLS certificate verification
            is on by default; pass ``False`` only for a host whose
            certificate is known to be unverifiable in this environment.

    Returns:
        The path of the saved file as a string, or ``None`` when the request
        failed (the error is printed, not raised).
    """
    pdf_dir = Path("breed_pdfs")
    pdf_dir.mkdir(exist_ok=True)
    pdf_path = pdf_dir / f"{breed_name}.pdf"
    # Stream into a temporary sibling file first so that an interrupted
    # download never leaves a truncated PDF under the final name.
    tmp_path = pdf_dir / f"{breed_name}.pdf.part"

    try:
        # Download the PDF; the context manager releases the streamed
        # connection even if reading or writing fails part-way through.
        with requests.get(pdf_url, stream=True, timeout=timeout, verify=verify) as response:
            response.raise_for_status()  # raise on an unsuccessful HTTP status

            # Save the body chunk by chunk.
            with open(tmp_path, 'wb') as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        pdf_file.write(chunk)

        # Only a fully written file is moved to its final name.
        tmp_path.replace(pdf_path)
        print(f"成功下载PDF文件: {pdf_path}")
        return str(pdf_path)

    except requests.exceptions.RequestException as e:
        print(f"下载PDF时发生错误: {e}")
        # Remove whatever was written before the failure.
        if tmp_path.exists():
            tmp_path.unlink()
    return None

# Fetch the PDF of every breed; request errors are printed by
# download_breed_pdf instead of being raised, so the loop keeps going.
for breed_pdf_url, breed_slug in zip(pdf_urls, breed_names):
    download_breed_pdf(breed_pdf_url, breed_slug)

In [6]:
from datasets import load_dataset
# Load the dog-breed image dataset with the `datasets` library.
hf_dataset_id = "ajinkyakolhe112/dog_breed_classification_kaggle"
ds = load_dataset(hf_dataset_id)

In [1]:
import torch
from transformers import AutoModel

import numpy as np
from tqdm import tqdm
import pandas as pd

# Initialise the model (jina-clip-v2) used by the embedding helpers.
# NOTE(review): trust_remote_code=True allows transformers to execute Python
# code shipped in the model repository; consider pinning a `revision`.
clip_model_id = "jinaai/jina-clip-v2"
model = AutoModel.from_pretrained(clip_model_id, trust_remote_code=True)



检索时不分别返回图片和文本，而是返回一个同时包含图片和文本的文档（Document），其结构如下：
```typescript
interface Document {
    text: string;
    images: Image[];
    metadata: {
        source: string;
        index: number;
    };
}
```

待解决的问题：如何构造这个文档，使其同时包含图片以及位置信息。
marker-pdf 这类库似乎可以把图片链接嵌入到导出的 Markdown 文档中。如果 chunking 时把链接一并保留在文本块里，那么对应的 Document 就能包含图片：Markdown 中的链接应指向图片实际存储的位置，构建 Document 时再按该位置读取图片并存入 Document 内。

In [8]:
# Value passed as `truncate_dim` to model.encode_image / model.encode_text
# by the two feature-extraction helpers in this cell.
truncate_dim = 512

def extract_image_features(images):
    """Embed ``images`` with the module-level ``model``.

    Forwards ``images`` to ``model.encode_image`` together with the
    module-level ``truncate_dim`` and returns that call's result unchanged.
    """
    return model.encode_image(images, truncate_dim=truncate_dim)

def extract_text_features(texts):
    """Embed ``texts`` with the module-level ``model``.

    Forwards ``texts`` to ``model.encode_text`` together with the
    module-level ``truncate_dim`` and returns that call's result unchanged.
    """
    return model.encode_text(texts, truncate_dim=truncate_dim)

In [9]:
# === Build the image embedding database ===
# Embed the first 100 training images one at a time, stack the results
# row-wise and persist the matrix as a parquet file.
train_images = ds['train'][:100]['image']
image_features_list = [extract_image_features(img) for img in tqdm(train_images)]

image_features = np.vstack(image_features_list)
df = pd.DataFrame(image_features)
df.to_parquet(r"D:\HKU\Inno Wing RA\multimodal-rag-tutorial\db\image_features.parquet")

100%|██████████| 100/100 [02:04<00:00,  1.24s/it]


In [10]:
# === Build the text embedding database ===
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

# Context chunking: chunk_size=500 with an overlap of 50 between chunks.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
markdown_dir = Path(r"d:/HKU/Inno Wing RA/multimodal-rag-tutorial/md_outputs")

# The iteration variable is named `md_path` (not `path`) so it does not
# overwrite the dataset directory `path` that the CSV-loading cell relies on.
# sorted() fixes the file order, which rglob leaves to the filesystem, so the
# chunk order (and therefore the embedding row order) is reproducible.
documents = [
    md_path.read_text(encoding="utf-8")
    for md_path in sorted(markdown_dir.rglob("*.md"))
]
if not documents:
    # Fail early with a clear message instead of a cryptic np.vstack error.
    raise FileNotFoundError(f"No markdown files found under {markdown_dir}")

chunks = splitter.split_text("\n\n".join(documents))

# Embed each chunk individually; row i of text_features belongs to chunks[i].
text_features_list = []
for doc in tqdm(chunks):
    features = extract_text_features(doc)
    text_features_list.append(features)

text_features = np.vstack(text_features_list)
df = pd.DataFrame(text_features)
df.to_parquet(r"D:\HKU\Inno Wing RA\multimodal-rag-tutorial\db\text_features.parquet")

100%|██████████| 3173/3173 [25:19<00:00,  2.09it/s]


In [15]:
# Persist the chunk strings themselves as a single-column parquet file.
# NOTE(review): presumably row i is meant to match row i of
# text_features.parquet; that holds only if `chunks` is unchanged since
# the embeddings were computed.
text_chunks_file = r"D:\HKU\Inno Wing RA\multimodal-rag-tutorial\db\text_chunks.parquet"
df = pd.DataFrame(data=chunks)
df.to_parquet(text_chunks_file)

In [11]:
# Reload both embedding matrices from disk as plain NumPy arrays.
image_features_file = r"D:\HKU\Inno Wing RA\multimodal-rag-tutorial\db\image_features.parquet"
text_features_file = r"D:\HKU\Inno Wing RA\multimodal-rag-tutorial\db\text_features.parquet"

image_db = pd.read_parquet(image_features_file).to_numpy()
text_db = pd.read_parquet(text_features_file).to_numpy()

In [12]:
# Shape of the image embedding matrix reloaded from image_features.parquet.
image_db.shape

(100, 512)

In [13]:
# Shape of the text embedding matrix reloaded from text_features.parquet.
text_db.shape

(3173, 512)