In [None]:
# Limitation:
# 1. Text input is limited to 77 tokens (a fixed context length imposed by CLIP).

In [1]:
from zipfile import ZipFile
import os
import torch
from PIL import Image
import open_clip
from docx import Document
from io import BytesIO
from llama_index.readers.file.docs import DocxReader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType, Collection

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


In [None]:
def extract_text_with_docxreader(docx_path):
    """Read a .docx file and return its contents as LangChain-format documents.

    The file is loaded with ``DocxReader``; every document it yields is then
    converted with ``to_langchain_format()``.

    Args:
        docx_path: Location of the .docx file to read.

    Returns:
        A list holding the converted form of each loaded document, in the
        order the reader produced them.
    """
    loaded = DocxReader().load_data(docx_path)
    return [doc.to_langchain_format() for doc in loaded]

def extract_images_as_pil(docx_path, output_folder="images"):
    """Extract the images embedded in a .docx file as PIL images.

    A .docx file is a zip archive. Every archive member whose name ends in
    ``.png``, ``.jpg`` or ``.jpeg`` is read into memory and opened with PIL.
    The suffix is compared case-insensitively, so members such as
    ``image1.PNG`` or ``photo.JPG`` are picked up as well.

    Args:
        docx_path: The .docx file to read (anything ``ZipFile`` accepts).
        output_folder: Unused. Kept only so existing callers that pass it keep
            working; nothing is written to disk.

    Returns:
        A list with one opened image per matching archive member, in archive
        order. The list is empty when no member matches.
    """
    image_suffixes = (".png", ".jpg", ".jpeg")
    images = []
    with ZipFile(docx_path, 'r') as docx_zip:
        for member in docx_zip.namelist():
            if not member.lower().endswith(image_suffixes):
                continue
            # Read the bytes while the archive is open; the BytesIO buffer keeps
            # them available to PIL after the archive has been closed.
            image_data = docx_zip.read(member)
            images.append(Image.open(BytesIO(image_data)))
    return images

In [31]:
# Source document processed by the cells below.
# NOTE(review): absolute, machine-specific path — adjust it before running elsewhere.
DOCX_PATH = "/home/longquan/project/learning/LLM/docs/Autumn_Colour.docx"

In [32]:
# Pull the text and the embedded images out of the same .docx file.
docs = extract_text_with_docxreader(DOCX_PATH)
images = extract_images_as_pil(DOCX_PATH)

In [33]:
docs[0].page_content

'Elevate Your Garden with Autumn Colour\n\n\n\nAutumn is a season of transition, where the garden shifts from the lush greens of summer to warmer hues. Enjoy the last of the warmer weather, and fill your garden with vibrant colour that will thrive in the cooler months ahead.\n\nAutumn is nature’s planting time. Planting now gives your plants a chance to get established before winter arrives, and brings vibrant colour to your garden through those cooler months. Here are some of our top tips for creating a stunning autumn garden.\n\nCreate Depth\n\nTo make your garden feel lush and dynamic, incorporate plants at different heights. Start with taller feature trees like maples, renowned for their fiery red and orange foliage. Add mid-sized shrubs like camellias or azaleas to bring seasonal blooms, and finish with groundcovers such as alyssum or pansies to fill in the gaps.\n\nPlant Odd Numbers\n\nOdd numbers of plants create a natural look in your garden. Start by placing the largest struct

In [34]:
docs[0].metadata

{'file_name': 'Autumn_Colour.docx'}

In [35]:
images

[<PIL.PngImagePlugin.PngImageFile image mode=RGB size=748x417>]

In [36]:
# Splitter configured with chunk_size=1024 and chunk_overlap=100.
# NOTE(review): CLIP limits text to 77 tokens, so a chunk of this size is presumably
# mostly cut off when it is embedded — consider a much smaller chunk_size; confirm.
textSplitter = RecursiveCharacterTextSplitter(
    chunk_size=1024, chunk_overlap=100
)

In [37]:
# Apply the splitter to the loaded documents, then display the resulting chunks.
text_chunks = textSplitter.split_documents(docs)
text_chunks

[Document(metadata={'file_name': 'Autumn_Colour.docx'}, page_content='Elevate Your Garden with Autumn Colour\n\n\n\nAutumn is a season of transition, where the garden shifts from the lush greens of summer to warmer hues. Enjoy the last of the warmer weather, and fill your garden with vibrant colour that will thrive in the cooler months ahead.\n\nAutumn is nature’s planting time. Planting now gives your plants a chance to get established before winter arrives, and brings vibrant colour to your garden through those cooler months. Here are some of our top tips for creating a stunning autumn garden.\n\nCreate Depth\n\nTo make your garden feel lush and dynamic, incorporate plants at different heights. Start with taller feature trees like maples, renowned for their fiery red and orange foliage. Add mid-sized shrubs like camellias or azaleas to bring seasonal blooms, and finish with groundcovers such as alyssum or pansies to fill in the gaps.\n\nPlant Odd Numbers'),
 Document(metadata={'file_

In [38]:
# Build the 'ViT-B-32' CLIP model and its image preprocessing transform from a local
# checkpoint file; the middle return value is discarded.
# To list the available pretrained tags: open_clip.list_pretrained()
# Checkpoint source: https://huggingface.co/laion/CLIP-ViT-B-32-laion2B-s34B-b79K
# NOTE(review): the checkpoint path is absolute and machine-specific.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32', 
    pretrained='/home/longquan/model/CLIP-ViT-B-32-laion2B-s34B-b79K/open_clip_pytorch_model.bin', 
    load_weights_only=True
)

In [39]:
# Put the model in evaluation mode and fetch the tokenizer for the same 'ViT-B-32' architecture.
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [40]:
def pil_2_vec(image_pil):
    """Embed a single PIL image with the CLIP image encoder.

    Args:
        image_pil: The image to embed; it is passed through ``preprocess``.

    Returns:
        The image embedding with a leading batch dimension of 1, scaled to
        unit L2 norm along the last dimension. The tensor lives on the same
        device as ``model``.
    """
    # Run on whatever device the model lives on, instead of assuming the
    # preprocessed input and the model already share a device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    image = preprocess(image_pil).unsqueeze(0).to(device)
    # Only request CUDA mixed precision when the model is actually on a CUDA device.
    with torch.no_grad(), torch.autocast("cuda", enabled=device.type == "cuda"):
        image_features = model.encode_image(image)
    # Unit-length vectors, so cosine similarity reduces to a dot product.
    image_features /= image_features.norm(dim=-1, keepdim=True)
    return image_features

In [41]:
# Embed every extracted image and concatenate the per-image embeddings along dim 0
# (one row per image), then display the resulting shape.
# NOTE(review): torch.cat presumably raises on an empty list, i.e. when the document
# contains no images — confirm and guard if such documents are expected.
image_tensor = torch.cat(
    [pil_2_vec(image) for image in images], 
    dim=0
    )
image_tensor.shape

torch.Size([1, 512])

In [42]:
[i.page_content for i in text_chunks]

['Elevate Your Garden with Autumn Colour\n\n\n\nAutumn is a season of transition, where the garden shifts from the lush greens of summer to warmer hues. Enjoy the last of the warmer weather, and fill your garden with vibrant colour that will thrive in the cooler months ahead.\n\nAutumn is nature’s planting time. Planting now gives your plants a chance to get established before winter arrives, and brings vibrant colour to your garden through those cooler months. Here are some of our top tips for creating a stunning autumn garden.\n\nCreate Depth\n\nTo make your garden feel lush and dynamic, incorporate plants at different heights. Start with taller feature trees like maples, renowned for their fiery red and orange foliage. Add mid-sized shrubs like camellias or azaleas to bring seasonal blooms, and finish with groundcovers such as alyssum or pansies to fill in the gaps.\n\nPlant Odd Numbers',
 'Plant Odd Numbers\n\nOdd numbers of plants create a natural look in your garden. Start by pla

In [43]:
def text_2_vec(text: str):
    """Embed a piece of text with the CLIP text encoder.

    Args:
        text: The text to embed; it is tokenized with ``tokenizer``. CLIP
            limits text to 77 tokens, so long input is not fully represented.

    Returns:
        The text embedding produced by ``model.encode_text``, scaled to unit
        L2 norm along the last dimension. The tensor lives on the same device
        as ``model``.
    """
    # Run on whatever device the model lives on, instead of assuming the
    # token tensor and the model already share a device.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    text_token = tokenizer(text).to(device)
    # Only request CUDA mixed precision when the model is actually on a CUDA device.
    with torch.no_grad(), torch.autocast("cuda", enabled=device.type == "cuda"):
        text_features = model.encode_text(text_token)
    # Unit-length vectors, so cosine similarity reduces to a dot product.
    text_features /= text_features.norm(dim=-1, keepdim=True)
    return text_features

In [44]:
# Embed the page_content of every text chunk and concatenate the embeddings along dim 0
# (one row per chunk), then display the resulting shape.
# NOTE(review): torch.cat presumably raises on an empty list, i.e. when there are no
# text chunks — confirm and guard if empty documents are expected.
text_tensor = torch.cat(
    [text_2_vec(i.page_content) for i in text_chunks], 
    dim=0
    )
text_tensor.shape

torch.Size([3, 512])

In [45]:
# Open a Milvus client on a database file given by a relative path.
# NOTE(review): a '.db' URI presumably selects the embedded Milvus Lite backend, and the
# file persists between runs, so earlier inserts remain searchable — confirm.
connection_args = {'uri': './milvus_demo.db'}

vector_store = MilvusClient(**connection_args)

In [46]:
# Single collection that holds both the image and the text embeddings.
collection_name = "clip_collection"

In [47]:
# Create the collection only when it does not exist yet; an existing collection is left as is.
# dimension=512 matches the embedding width displayed for image_tensor and text_tensor.
# auto_id lets Milvus assign primary keys; enable_dynamic_field allows the extra
# "type" and "filename" keys stored alongside each vector.
if not vector_store.has_collection(collection_name):
    vector_store.create_collection(
            collection_name=collection_name,
            vector_field_name="vector",
            dimension=512,
            auto_id=True,
            enable_dynamic_field=True,
        )

In [48]:
# prepare data
clip_image_vector = image_tensor.squeeze(0).tolist()  # List[float]
data_image = {
    "vector": clip_image_vector,  
    "type": "image",    
    "filename": f"{os.path.basename(DOCX_PATH)}",  
}

vector_store.insert(collection_name=collection_name, data=data_image)

{'insert_count': 1, 'ids': [457366000468492294], 'cost': 0}

In [49]:
# Prepare one record per text-chunk embedding.
clip_text_vector = text_tensor  # one embedding row per text chunk
data_text = [
    {
        # Store a plain List[float] rather than a torch.Tensor row, so the client
        # receives the vector as ordinary Python floats.
        "vector": vector.tolist(),
        "type": "text",
        "filename": f"{os.path.basename(DOCX_PATH)}",
    }
    for vector in clip_text_vector
]

# One batched insert instead of one call per chunk; the returned summary is displayed.
# NOTE: re-running this cell inserts the same vectors again (ids are auto-generated).
vector_store.insert(collection_name=collection_name, data=data_text)

# search

In [53]:
def search_text(text: str, top_k: int = 5, entry_type: str = "text"):
    """Search the collection for the stored entries most similar to a text query.

    Args:
        text: Query text; it is embedded with ``text_2_vec``.
        top_k: Maximum number of hits to return.
        entry_type: Which kind of stored entry to search: ``"text"`` (the
            default, matching the previous behaviour) or ``"image"``.

    Returns:
        The result of ``vector_store.search`` for the single query vector;
        each hit carries the ``filename`` and ``type`` output fields.

    Raises:
        ValueError: If ``entry_type`` is neither ``"text"`` nor ``"image"``.
    """
    # Validate before interpolating into the filter expression below.
    if entry_type not in ("text", "image"):
        raise ValueError(f"entry_type must be 'text' or 'image', got {entry_type!r}")

    query_vector = text_2_vec(text).squeeze(0).tolist()  # List[float]

    results = vector_store.search(
        collection_name=collection_name,
        data=[query_vector],  # the query vector (a batch of one)
        anns_field="vector",  # name of the vector field to search
        search_params={"metric_type": "COSINE", "params": {"nprobe": 10}},  # search parameters
        limit=top_k,  # return at most top_k results
        output_fields=["filename", "type"],  # fields returned with each hit
        filter=f"type == '{entry_type}'",  # restrict the search to entries of the requested type
    )
    return results

In [54]:
results = search_text("Spring Bulbs")

In [55]:
results

data: ["[{'id': 457365964801441794, 'distance': 0.7986000776290894, 'entity': {'type': 'text', 'filename': 'Spring_Bulbs.docx'}}, {'id': 457365964809306116, 'distance': 0.782997727394104, 'entity': {'type': 'text', 'filename': 'Spring_Bulbs.docx'}}, {'id': 457366000565747720, 'distance': 0.5521793961524963, 'entity': {'type': 'text', 'filename': 'Autumn_Colour.docx'}}, {'id': 457366000574136330, 'distance': 0.5507391095161438, 'entity': {'type': 'text', 'filename': 'Autumn_Colour.docx'}}, {'id': 457366000579117068, 'distance': 0.38708963990211487, 'entity': {'type': 'text', 'filename': 'Autumn_Colour.docx'}}]"]