In [1]:
import pandas as pd 
from PIL import Image 
import matplotlib.pyplot as plt 
import os 
import json 


# Load the query set. The file holds Chinese text, so the encoding is pinned to
# UTF-8 instead of relying on the platform's default locale encoding.
with open("./data/query.json", "r", encoding="utf-8") as f:
    query = json.load(f)
df_query = pd.DataFrame(query)
df_train = pd.read_csv("./data/train_annotation.csv", sep="\t")
print(f"Shape of df_query: {df_query.shape}")
display(df_query.head(2))
print(f"Shape of df_train: {df_train.shape}")
display(df_train.head(2))

# Every file in the image directory. os.listdir() returns entries in arbitrary
# order; sorting keeps row positions (used later as embedding indices) stable
# from one run to the next.
df_image_all = pd.DataFrame({"image": sorted(os.listdir("./data/image"))})
print(f"Shape of df_image_all: {df_image_all.shape}")
display(df_image_all.head(2))

# Images that do not appear in the training annotations.
df_image_not_train = df_image_all.loc[~df_image_all["image"].isin(df_train["image"])]
print(f"Shape of df_image_not_train: {df_image_not_train.shape}")
display(df_image_not_train.head(2))

Shape of df_query: (1497, 3)


Unnamed: 0,question,related_image,answer
0,请对给定的图片进行描述。,vwsscflkvakdictzacfx.jpg,
1,这款运动裤是什么材质做的？,jjxjzgkbrfizjwfngwis.jpg,


Shape of df_train: (12768, 2)


Unnamed: 0,image,text
0,scqxwrymypdzdefummyj.jpg,无拘2019女夏新款衬衫裙夏装格纹收腰气质显瘦蕾丝腰带衬衫连衣裙
1,chvgdtmndrqwfkabrgoh.jpg,2019夏季新款高端气质不对称肩带chic修身显瘦日常V领连衣裙女潮


Shape of df_image_all: (14652, 1)


Unnamed: 0,image
0,aaaopsvfhtbgtpoybknx.jpg
1,aabaonefxwpacwmiijkx.jpg


Shape of df_image_not_train: (1884, 1)


Unnamed: 0,image
4,aacwkearrqavokzmhxat.jpg
6,aadfkitjvpjohztcbsrs.jpg


In [2]:
from transformers import ChineseCLIPProcessor, ChineseCLIPModel
import os
import torch 


device = "cuda" if torch.cuda.is_available() else "cpu"
# Hugging Face hub endpoint / cache settings.
# NOTE(review): these are assigned after `transformers` has been imported and the
# processor below is loaded from a local path, so they presumably have no effect
# in this session — confirm whether they are still needed.
os.environ["HF_ENDPOINT"] = "https://huggingface.co"
os.environ["TRANSFORMERS_CACHE"] = "hf-mirror"
clip_model_path = "/root/onethingai-tmp/models--OFA-Sys--chinese-clip-vit-huge-patch14/snapshots/503e16b560aff94c1922f13a86a7693d36957a4f"
# The checkpoint is a whole pickled model object (it is used directly as a model
# below), so it cannot be loaded with weights_only=True. Unpickling executes
# arbitrary code: only load checkpoints from a trusted source.
# map_location lets a checkpoint that was saved on GPU load on a CPU-only host.
model = torch.load("CLIP_LoRA_625_10.pth", map_location=device, weights_only=False).to(device)
processor = ChineseCLIPProcessor.from_pretrained(clip_model_path)
print("Loading Done!")

  model = torch.load("CLIP_LoRA_625_10.pth").to(device)
2024-09-02 00:40:46.866235: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 00:40:46.924310: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 00:40:46.953029: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 00:40:46.962940: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-02 00:4

Loading Done!




# 向量化

In [3]:
from datasets import Dataset
import numpy as np


# Inference only from here on: switch the CLIP model to evaluation mode.
model.eval()
# Re-read the training annotations and wrap them in a `datasets.Dataset`
# so the embedding functions can be applied batch-wise with `.map`.
df_train = pd.read_csv("./data/train_annotation.csv", sep="\t")
dataset_train = Dataset.from_pandas(df_train)
def get_image_embed(batch):
    """Add CLIP image features to a batch of training rows.

    The file names in ``batch["image"]`` are resolved against ``./data/image``;
    each picture is mirrored left-to-right, converted to RGB and encoded with
    the module-level ``processor`` / ``model`` on ``device``.

    Args:
        batch: mapping with an ``"image"`` entry holding image file names.

    Returns:
        The same ``batch`` with the features stored under ``"image_embeds"``.
    """
    # NOTE(review): every image is flipped horizontally before encoding —
    # confirm this matches how the checkpoint was fine-tuned.
    images = []
    for image_name in batch["image"]:
        picture = Image.open(os.path.join('./data/image', image_name))
        images.append(picture.transpose(Image.FLIP_LEFT_RIGHT).convert("RGB"))

    with torch.no_grad():
        encoded = processor(text=None, images=images, return_tensors="pt")
        pixel_values = encoded["pixel_values"].to(device)
        batch["image_embeds"] = model.get_image_features(pixel_values)
    return batch
        
def get_text_embed(batch):
    """Add CLIP text features to a batch of training rows.

    Args:
        batch: mapping with a ``"text"`` entry holding the captions.

    Returns:
        The same ``batch`` with the features stored under ``"text_embeds"``.
    """
    # Pad to the longest caption in the batch and cut anything beyond 52 tokens.
    tokenizer_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=52)
    with torch.no_grad():
        inputs = processor(text=batch["text"], images=None, **tokenizer_options).to(device)
        batch["text_embeds"] = model.get_text_features(**inputs)
    return batch

def _unit_rows_as_numpy(embeds):
    """Scale each row of a torch tensor to unit L2 norm and return a NumPy array."""
    unit = embeds / embeds.norm(dim=-1, keepdim=True)
    return unit.detach().cpu().numpy()


# Embed the images and the captions of the training set.
dataset_train = dataset_train.map(get_image_embed, batched=True, batch_size=256)
dataset_train = dataset_train.map(get_text_embed, batched=True, batch_size=512)
dataset_train.set_format("torch", columns=["image_embeds", "text_embeds"])
image_embeddings = _unit_rows_as_numpy(dataset_train["image_embeds"])
text_embeddings = _unit_rows_as_numpy(dataset_train["text_embeds"])

# Embed the images that are not part of the training set.
dataset_image_not_train = Dataset.from_pandas(df_image_not_train).map(
    get_image_embed, batched=True, batch_size=256
)
dataset_image_not_train.set_format("torch", columns=["image_embeds"])
image_embeddings_not_train = _unit_rows_as_numpy(dataset_image_not_train["image_embeds"])

Map:   0%|          | 0/12768 [00:00<?, ? examples/s]

Map:   0%|          | 0/12768 [00:00<?, ? examples/s]

Map:   0%|          | 0/1884 [00:00<?, ? examples/s]

# 文本匹配图片

In [4]:
import re 


# Text-to-image retrieval queries: the rows of df_query that carry no image.
has_no_image = df_query["related_image"] == ""
df_text2image = df_query.loc[has_no_image, ["question"]]
# Regular expression that captures the search phrase inside the fixed
# retrieval prompt template.
pattern = re.compile(r'请匹配到与 (.+) 最相关的图片。')


def get_text(x):
    """Extract the search phrase from a retrieval question.

    Args:
        x: the question string.

    Returns:
        The text captured by ``pattern`` (first match). If the question does
        not follow the template, ``x`` is returned unchanged instead of raising
        ``IndexError``, so one off-template query no longer aborts the run.
    """
    match = pattern.search(x)
    if match is None:
        return x
    return match.group(1)
# Replace each question by its search phrase. Series.map on the single
# "question" column yields the same one-column frame as DataFrame.map, but
# also works on pandas versions older than 2.1 (where DataFrame.map is missing).
df_text2image = df_text2image["question"].map(get_text).to_frame()


def get_text_embed_text2image(batch):
    """Add CLIP text features for a batch of retrieval phrases.

    Args:
        batch: mapping with a ``"question"`` entry holding the phrases.

    Returns:
        The same ``batch`` with the features stored under ``"text_embeds"``.
    """
    # Pad to the longest phrase in the batch and cut anything beyond 52 tokens.
    tokenizer_options = dict(return_tensors="pt", padding=True, truncation=True, max_length=52)
    with torch.no_grad():
        inputs = processor(text=batch["question"], images=None, **tokenizer_options).to(device)
        batch["text_embeds"] = model.get_text_features(**inputs)
    return batch

# Embed the retrieval phrases with the CLIP text encoder.
dataset_text2image = Dataset.from_pandas(df_text2image).map(
    get_text_embed_text2image, batched=True, batch_size=512
)
dataset_text2image.set_format("torch", columns=["text_embeds"])

# Scale every row to unit length on `device`, then hand the result back as NumPy.
query_text_tensor = dataset_text2image["text_embeds"].to(device)
query_text_norms = query_text_tensor.norm(dim=-1, keepdim=True)
text_embeddings_query = (query_text_tensor / query_text_norms).detach().cpu().numpy()

Map:   0%|          | 0/392 [00:00<?, ? examples/s]

# 图片匹配文本

In [5]:
# Image question-answering samples: the queries that come with an image.
has_image = df_query["related_image"] != ""
df_image2text = df_query.loc[has_image].copy()

def get_image_embed_image2text(batch):
    """Add CLIP image features for a batch of query rows.

    The file names in ``batch["related_image"]`` are resolved against
    ``./data/image``; each picture is mirrored left-to-right, converted to RGB
    and encoded with the module-level ``processor`` / ``model`` on ``device``.

    Args:
        batch: mapping with a ``"related_image"`` entry holding file names.

    Returns:
        The same ``batch`` with the features stored under ``"image_embeds"``.
    """
    # NOTE(review): every image is flipped horizontally before encoding —
    # confirm this matches how the checkpoint was fine-tuned.
    images = []
    for image_name in batch["related_image"]:
        picture = Image.open(os.path.join('./data/image', image_name))
        images.append(picture.transpose(Image.FLIP_LEFT_RIGHT).convert("RGB"))

    with torch.no_grad():
        encoded = processor(text=None, images=images, return_tensors="pt")
        pixel_values = encoded["pixel_values"].to(device)
        batch["image_embeds"] = model.get_image_features(pixel_values)
    return batch

# Embed the query images and scale each row to unit length.
dataset_image2text = Dataset.from_pandas(df_image2text)
dataset_image2text = dataset_image2text.map(get_image_embed_image2text, batched=True, batch_size=256)
dataset_image2text.set_format("torch", columns=["image_embeds"])
image_embeddings_query = dataset_image2text["image_embeds"]
image_embeddings_query = image_embeddings_query / image_embeddings_query.norm(dim=-1, keepdim=True)
image_embeddings_query = image_embeddings_query.detach().cpu().numpy()

# For every query image pick the training caption with the highest cosine
# similarity (both sides are unit-normalised, so the dot product is the cosine).
most_similar_index_image2text = (image_embeddings_query @ text_embeddings.T).argmax(axis=1)
# argmax yields row *positions* in text_embeddings, so the captions are looked up
# by position rather than by index label. df_image2text is still the untouched
# copy created at the top of this cell, so it is not re-created here.
df_image2text["answer"] = df_train["text"].to_numpy()[most_similar_index_image2text]

Map:   0%|          | 0/1105 [00:00<?, ? examples/s]

# 图片匹配图片然后对应到文本

In [6]:
# Samples for image-to-image matching: the queries that come with an image.
has_image = df_query["related_image"] != ""
df_image2image = df_query.loc[has_image].copy()

def get_image_embed_image2text(batch):
    """Add CLIP image features for a batch of query rows.

    NOTE(review): this re-defines the helper of the same name from the
    image-to-text cell with the same behaviour; one definition would suffice.

    The file names in ``batch["related_image"]`` are resolved against
    ``./data/image``; each picture is mirrored left-to-right, converted to RGB
    and encoded with the module-level ``processor`` / ``model`` on ``device``.

    Args:
        batch: mapping with a ``"related_image"`` entry holding file names.

    Returns:
        The same ``batch`` with the features stored under ``"image_embeds"``.
    """
    images = []
    for image_name in batch["related_image"]:
        picture = Image.open(os.path.join('./data/image', image_name))
        images.append(picture.transpose(Image.FLIP_LEFT_RIGHT).convert("RGB"))

    with torch.no_grad():
        encoded = processor(text=None, images=images, return_tensors="pt")
        pixel_values = encoded["pixel_values"].to(device)
        batch["image_embeds"] = model.get_image_features(pixel_values)
    return batch

# Embed the query images and scale each row to unit length.
dataset_image2image = Dataset.from_pandas(df_image2image)
dataset_image2image = dataset_image2image.map(get_image_embed_image2text, batched=True, batch_size=256)
dataset_image2image.set_format("torch", columns=["image_embeds"])
image_embeddings_query = dataset_image2image["image_embeds"]
image_embeddings_query = image_embeddings_query / image_embeddings_query.norm(dim=-1, keepdim=True)
image_embeddings_query = image_embeddings_query.detach().cpu().numpy()

# For every query image find the most similar training image (cosine similarity
# of unit-normalised embeddings) and borrow that image's caption as the answer.
most_similar_index_image2image = (image_embeddings_query @ image_embeddings.T).argmax(axis=1)
# argmax yields row *positions* in image_embeddings, so the captions are looked
# up by position rather than by index label. df_image2image is still the
# untouched copy created at the top of this cell, so it is not re-created here.
df_image2image["answer"] = df_train["text"].to_numpy()[most_similar_index_image2image]

Map:   0%|          | 0/1105 [00:00<?, ? examples/s]

In [7]:
# NOTE(review): this cell repeats the matching step at the end of the previous
# cell; it only needs to be run when that step is to be redone on its own.
# For every query image find the most similar training image.
most_similar_index_image2image = (image_embeddings_query @ image_embeddings.T).argmax(axis=1)
df_image2image = df_query.loc[df_query["related_image"]!=""].copy()
# argmax yields row *positions* in image_embeddings, so the captions are looked
# up by position rather than by index label.
df_image2image["answer"] = df_train["text"].to_numpy()[most_similar_index_image2image]

In [8]:
# Rebind df_image2text to the image-to-image result (same object, not a copy).
# The answer-generation loop and the submission cell read df_image2text, so from
# here on they use the captions found via image-to-image matching instead of
# the image-to-text retrieval result computed earlier.
df_image2text = df_image2image

In [None]:
# Save every embedding matrix to ./embeddings/<name>.npy and report its shape.
# np.save does not create missing directories, so make sure the target exists.
os.makedirs("./embeddings", exist_ok=True)
embeddings_to_save = {
    "image_embeddings": image_embeddings,                      # training-set images
    "text_embeddings": text_embeddings,                        # training-set captions
    "image_embeddings_not_train": image_embeddings_not_train,  # images outside the training set
    "image_embeddings_query": image_embeddings_query,          # images attached to queries
    "text_embeddings_query": text_embeddings_query,            # retrieval phrases of the queries
}
for embed_name, embed_array in embeddings_to_save.items():
    np.save(f"./embeddings/{embed_name}.npy", embed_array)
    print(f"Shape of {embed_name}: {embed_array.shape}")

# 使用Qwen2-VL结合RAG检索进行图文问答

In [9]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
from modelscope import snapshot_download
import torch


# Download (or reuse from the cache directory) the Qwen2-VL-7B-Instruct snapshot.
model_dir = snapshot_download("qwen/Qwen2-VL-7B-Instruct", cache_dir="/root/onethingai-tmp")

# NOTE: `model` and `processor` are rebound here; the CLIP objects loaded earlier
# are no longer reachable under these names after this cell has run.
# flash_attention_2 is recommended upstream for better speed and lower memory
# use, especially with multiple images or video.
qwen_load_options = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
model = Qwen2VLForConditionalGeneration.from_pretrained(model_dir, **qwen_load_options)
# Default processor that ships with the checkpoint.
processor = AutoProcessor.from_pretrained(model_dir)
print("Qwen2-VL Loading Done!")

You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Qwen2-VL Loading Done!


In [10]:
import Levenshtein

def get_similarity(question, answer):
    """Return the Levenshtein similarity ratio between a question and an answer.

    Args:
        question: the question string.
        answer: the answer string to compare against it.

    Returns:
        The value of ``Levenshtein.ratio(question, answer)``.
    """
    return Levenshtein.ratio(question, answer)


def improve_answer(question, related_image, answer):
    """Ask the vision-language model to rephrase an answer in the question's wording.

    A single-turn chat message holding the product image and a few-shot
    rewriting prompt is sent to the module-level ``model`` / ``processor``.

    Args:
        question: the original question.
        related_image: image file name, resolved against ``./data/image``.
        answer: the draft answer to rephrase.

    Returns:
        The decoded text generated by the model (prompt tokens removed).
    """
    prompt = f"""请根据问题和答案，将答案修改为问题的格式，例如：
问题：这套裙子是2019年的新款吗？
答案：是
润色后的答案：这套裙子是2019年的新款。
问题：这是什么系列的衣服？
答案：这款衣服属于夏季系列。
润色后的答案：这是夏季系列的衣服。

现在请你进行回答：
问题：{question}
答案: {answer}
润色后的答案："""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": os.path.join("./data/image", related_image),
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]
    # Build the chat-formatted prompt and the matching vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement instead of a hard-coded "cuda": the model
    # is loaded with device_map="auto", so its device is decided at load time.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + completion; keep only the newly generated part.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    return output_text

In [11]:
def chat(question, related_image, related_text, similarity_threshold=0.35):
    """Answer a question about a product image, using a retrieved title as context.

    The retrieved title is injected into a few-shot prompt together with the
    image. If the generated answer is too dissimilar from the question
    (``get_similarity`` below ``similarity_threshold``), it is passed through
    ``improve_answer`` to be rephrased in the question's wording.

    Args:
        question: the question to answer.
        related_image: image file name, resolved against ``./data/image``.
        related_text: retrieved product title used as context.
        similarity_threshold: minimum question/answer similarity for the first
            answer to be accepted as-is. Defaults to 0.35.

    Returns:
        The accepted or rephrased answer text.
    """
    prompt = f"""这件商品的名称是"{related_text}"，请根据商品图片和名称，回答问题，答案尽可能简洁且和问题格式保持一致。
例如：
问题：这件衣服是男款还是女款？
回答：这件衣服是男款。
问题：这是哪一年的衣服？
回答：这是20XX年的衣服。
问题：这件衣服是哪个季节的？
回答：这件衣服是X季的。

现在请你回答：
问题: {question}
回答："""
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": os.path.join("./data/image", related_image),
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]
    # Build the chat-formatted prompt and the matching vision inputs.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement instead of a hard-coded "cuda": the model
    # is loaded with device_map="auto", so its device is decided at load time.
    inputs = inputs.to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # generate() returns prompt + completion; keep only the newly generated part.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    answer = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    if get_similarity(question, answer) >= similarity_threshold:
        print(question, answer)
        return answer

    # The answer does not echo the question's wording closely enough: rephrase it.
    print("Similarity is so low!", question, answer)
    print("Improving Answer...")
    improved_answer = improve_answer(question, related_image, answer)
    print(question, improved_answer)
    return improved_answer

In [12]:
# One answer per row of df_image2text, in row order.
llm_answer = []
total = len(df_image2text)
# Select the columns by name so the unpacking below does not depend on column order.
rows = df_image2text[["question", "related_image", "answer"]].itertuples(index=False, name=None)
with torch.no_grad():
    for idx, (question, related_image, related_text) in enumerate(rows):
        # Pure description requests are answered with the retrieved title directly.
        if question == "请对给定的图片进行描述。":
            llm_answer.append(related_text)
            continue
        print(f"[{idx}|{total}]:")
        try:
            llm_answer.append(chat(question, related_image, related_text))
        except Exception as exc:
            # Best-effort fallback: keep the row count intact by echoing the
            # question, but report the failure instead of hiding it.
            # KeyboardInterrupt is deliberately not caught, so the loop can be stopped.
            print(f"chat() failed, falling back to the question text: {exc!r}")
            llm_answer.append(question)

[1|1105]:
这款运动裤是什么材质做的？ 这款运动裤是莫代尔面料做的。
[2|1105]:
这套衣服是什么材质的？ 这套衣服是棉质的。
[3|1105]:
Similarity is so low! 这款牛仔短裤是2019年的新款吗？ 不是。
Improving Answer...
这款牛仔短裤是2019年的新款吗？ 这款牛仔短裤不是2019年的新款。
[6|1105]:
这件连衣裙是什么季节穿的？ 这件连衣裙是夏季穿的。
[7|1105]:
这件连衣裙是什么季节的？ 这件连衣裙是夏季的。
[8|1105]:
这款帕波仕蒂男士皮衣是哪一年的新款？ 这款帕波仕蒂男士皮衣是2019年的新款。
[9|1105]:
这件孕妇连衣裙适合哪个季节穿？ 这件孕妇连衣裙适合夏季穿。
[10|1105]:
这款连衣裙是什么风格的？ 这款连衣裙是气质显瘦的一字领心机小黑裙长裙。
[13|1105]:
这款羽绒服是男款还是女款？ 这款羽绒服是女款。
[15|1105]:
这款连衣裙适合胖mm吗？ 这款连衣裙适合胖mm。
[16|1105]:
这件雪纺衫是什么季节穿的？ 这件雪纺衫是夏季穿的。
[17|1105]:
这款罗衣原创连衣裙是什么季节的新款？ 这款罗衣原创连衣裙是夏季的新款。
[18|1105]:
Similarity is so low! 这款裙子是2019年的新款吗？ 是
Improving Answer...
这款裙子是2019年的新款吗？ 这款裙子是2019年的新款。
[19|1105]:
这件连衣裙是什么风格的？ 这件连衣裙是中国风的。
[20|1105]:
这款连衣裙是什么材质制成的？ 这款连衣裙是100%桑蚕丝制成的。
[22|1105]:
这件条纹t恤是什么季节穿的？ 这件条纹t恤是夏季穿的。
[23|1105]:
这件孕妇装适合哪个季节穿？ 这件孕妇装适合夏季穿。
[24|1105]:
Similarity is so low! 这件克劳德乌鸦荷叶边印花衬衫是2019夏季款吗？ 是
Improving Answer...
这件克劳德乌鸦荷叶边印花衬衫是2019夏季款吗？ 这件克劳德乌鸦荷叶边印花衬衫是2019夏季款。
[25|1105]:
Similarity is so low! 这款裙子是2019年的新款吗？ 是
Improving Answer...

# 输出

In [22]:
# import json 
# import pandas as pd 

# # 创建一个提交副本
# df_submit = df_query.copy()
# # 对文本寻找最相似的图片
# most_similar_index_text2image = (text_embeddings_query @ image_embeddings_not_train.T).argmax(axis=1)
# # 填充文本匹配图片的结果
# df_submit.loc[df_text2image.index, "answer"] = df_image_not_train.reset_index(drop=True).loc[most_similar_index_text2image, "image"].values
# # 填充图片问答的结果
# df_submit.loc[df_image2text.index, "answer"] = llm_answer
# # 输出为json文件
# submit_json = []
# with open("LoRA_CLIP_Qwen2_VL_improved_image2image_justify.json", "w", encoding="utf-8") as f:
#     for line in df_submit.values:
#         question, related_image, answer = line
#         json_str = {"question": question, "related_image": related_image, "answer": answer}
#         submit_json.append(json_str)
#     json.dump(submit_json, f, ensure_ascii=False, indent=4)

# 匈牙利算法优化

In [14]:
import json 
import pandas as pd 
from scipy.optimize import linear_sum_assignment
import numpy as np


# Work on a copy so df_query itself is left unchanged.
df_submit = df_query.copy()

# Similarity between every text query (rows) and every image outside the
# training set (columns); both sides are unit-normalised embeddings.
most_similar_matrix_text2image = text_embeddings_query @ image_embeddings_not_train.T
# Hungarian algorithm: one-to-one assignment of queries to images that maximises
# the total similarity. maximize=True replaces the manual "max - similarity"
# cost inversion.
row_ind, col_ind = linear_sum_assignment(most_similar_matrix_text2image, maximize=True)

# Fill in the text-to-image results. Pairing through row_ind keeps query and
# image aligned even if not every query receives an assignment.
candidate_images = df_image_not_train["image"].to_numpy()
df_submit.loc[df_text2image.index[row_ind], "answer"] = candidate_images[col_ind]

# Fill in the image question-answering results.
if len(llm_answer) != len(df_image2text):
    raise ValueError(
        f"Expected {len(df_image2text)} generated answers, got {len(llm_answer)}"
    )
df_submit.loc[df_image2text.index, "answer"] = llm_answer

# Write the submission as a JSON list of {question, related_image, answer} records.
submit_json = df_submit[["question", "related_image", "answer"]].to_dict(orient="records")
with open("LoRA_CLIP_Qwen2_VL_improved_image2image_justify_匈牙利算法.json", "w", encoding="utf-8") as f:
    json.dump(submit_json, f, ensure_ascii=False, indent=4)