# Multimodal RAG

In [127]:
import warnings
warnings.filterwarnings("ignore")
import os
from openai import OpenAI
import base64
from langchain.chat_models import ChatOpenAI
from langchain.schema.messages import HumanMessage
from IPython.display import Markdown,Image,Latex, display
import requests
from sentence_transformers import SentenceTransformer
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import faiss
import torch
import pandas as pd

In [13]:
import json
file_path = '../dataset/multiDemo.json'
with open(file_path, 'r') as file:
    data = json.load(file)
query=data[-3]
error_log=data[0]
error_log

{'ID': 0,
 'Question Number': 1,
 'Share Context': 'Neshie Wakuluk is an investment strategist who develops capital market expectations for\nan investment firm that invests across asset classes and global markets. Wakuluk started her\ncareer when the global markets were experiencing significant volatility and poor returns; as a\nresult, she is now careful to base her conclusions on objective evidence and analytical\nprocedures to mitigate any potential biases.Wakuluk’s approach to economic forecasting\nutilizes a structural model in conjunction with a diffusion index to determine the current\nphase of a country’s business cycle. This approach has produced successful predictions in the\npast, thus Wakuluk has high confidence in the predictions. Wakuluk also determines whether any\nadjustments need to be made to her initial estimates of the respective aggregate economic\ngrowth trends based on historical rates of growth for Countries X and Y (both developed\nmarkets) and Country Z (a dev

In [32]:
# 加载CLIP模型和处理器
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")


In [73]:
def clipEmbedding(data):
    textdata="Question:"+data.get("Question Text")+ "Options:"+str(data.get("Options"))+ "Correct Answer:"+data.get("Answer")
    # "Wrong Answer:"+data.get("Wrong_reasoning_steps")+"Feedback"
    if data.get("Image")!='':
        print("we have image")
        image_path = '../dataset/'+ data.get("Image")
        image = Image.open(image_path)
        # 生成文本和图像的嵌入，添加 truncation=True 和 max_length=77
        inputs = processor(text=[textdata], images=image, return_tensors="pt", padding=True, truncation=True, max_length=77)
        # 使用CLIP模型生成嵌入
        outputs = model(**inputs)
        image_embedding = outputs.image_embeds  # 图像嵌入
        text_embedding = outputs.text_embeds  # 文本和选项嵌入
        
    else:
        # print("No image provided.")  # 如果没有提供图片，打印提示
        # 如果没有图像，生成文本嵌入
        inputs = processor(text=[textdata], return_tensors="pt", padding=True, truncation=True, max_length=77)
        text_embedding = model.get_text_features(**inputs)
        # 创建一个与图像嵌入维度相同的零向量
        image_embedding = torch.zeros((text_embedding.shape[0], 512))  # 假设图像嵌入维度是512
    combined_embedding = torch.cat((text_embedding, image_embedding), dim=-1) # 将文本和图像嵌入连接在一起
    return combined_embedding


# 获取嵌入的维度
embedding_dim = clipEmbedding(error_log).shape[1]  # 嵌入的最后一个维度表示向量维度
index = faiss.IndexFlatL2(embedding_dim)  # 使用 L2 距离度量
# 将生成的多模态嵌入添加到 Faiss 中
index.add(clipEmbedding(error_log).detach().numpy())

In [84]:
# 获取嵌入的维度
embedding_dim = clipEmbedding(error_log).shape[1]  # 嵌入的最后一个维度表示向量维度
index = faiss.IndexFlatL2(embedding_dim)  # 使用 L2 距离度量
# 将生成的多模态嵌入添加到 Faiss 中
index.add(clipEmbedding(error_log).detach().numpy())

In [85]:
# 查看Faiss索引中存储的向量数量
print("Number of vectors in the index:", index.ntotal)


Number of vectors in the index: 1


In [86]:
# 假设我们有一个查询向量 query_embedding
query_embedding = clipEmbedding(query).detach().numpy()

# 检索 Faiss 中最相似的 k 个向量
k = 1  # 你希望检索到最相似的 1 个向量
D, I = index.search(query_embedding, k)

# D 是距离，I 是对应向量的索引
print("Distances to closest vectors:", D)
print("Indices of closest vectors:", I)


we have image
Distances to closest vectors: [[92.592575]]
Indices of closest vectors: [[0]]


#### 余弦相似度

In [117]:
import faiss
import numpy as np

def clipEmbedding(data):
    textdata = "Question:" + data.get("Question Text") + " Options:" + str(data.get("Options")) + " Correct Answer:" + data.get("Answer")
    
    # 检查是否有图片
    if data.get("Image") != '':
        image_path = '../dataset/' + data.get("Image")
        image = Image.open(image_path)
        
        # 生成文本和图像的嵌入，添加 truncation=True 和 max_length=77
        inputs = processor(text=[textdata], images=image, return_tensors="pt", padding=True, truncation=True, max_length=77)
        
        # 使用CLIP模型生成嵌入
        outputs = model(**inputs)
        image_embedding = outputs.image_embeds  # 图像嵌入
        text_embedding = outputs.text_embeds  # 文本嵌入
    else:
        # 如果没有图像，生成文本嵌入
        inputs = processor(text=[textdata], return_tensors="pt", padding=True, truncation=True, max_length=77)
        text_embedding = model.get_text_features(**inputs)
        
        # 创建一个与图像嵌入维度相同的零向量
        image_embedding = torch.zeros((text_embedding.shape[0], 512))  # 假设图像嵌入维度是512
    
    # 将文本和图像嵌入拼接在一起
    combined_embedding = torch.cat((text_embedding, image_embedding), dim=-1)
    
    return combined_embedding


def normalize(embeddings):
    # 归一化函数，计算余弦相似度时将向量进行归一化
    norms = torch.norm(embeddings, dim=1, keepdim=True)  # 计算每个向量的范数
    return embeddings / norms  # 将向量归一化，使其范数变为1




In [147]:

for error_log in data:
    # 生成嵌入
    error_log_embedding = clipEmbedding(error_log)

    # 对嵌入进行归一化，以便计算余弦相似度
    error_log_embedding = normalize(error_log_embedding)

    # 获取嵌入的维度
    embedding_dim = error_log_embedding.shape[1]  # 嵌入的维度是最后一个维度
    if index.ntotal==0:
        # 初始化 Faiss 索引，使用内积 (dot product) 作为距离度量
        index = faiss.IndexFlatIP(embedding_dim)  # 使用内积度量

    # 将生成的多模态嵌入转换为numpy数组并添加到Faiss索引中
    error_log_embedding_np = error_log_embedding.detach().numpy()  # 确保转换为numpy格式
    index.add(error_log_embedding_np)  # 将嵌入添加到Faiss索引中

    # 检查索引中存储的向量数量
print("Number of vectors in the index:", index.ntotal)

Number of vectors in the index: 99


In [148]:
# 查询函数
def query_embedding_faiss(query_data, index, k=5):
    # 生成查询嵌入
    query_embedding = clipEmbedding(query_data)
    query_embedding = normalize(query_embedding)  # 归一化查询向量
    
    # 转换为 numpy 格式
    query_embedding_np = query_embedding.detach().numpy()
    
    # 检索 Faiss 中与查询向量最相似的 k 个向量
    D, I = index.search(query_embedding_np, k)  # D 是余弦相似度，I 是对应的索引
    
    print("Cosine Similarities:", D)  # 打印查询到的余弦相似度
    print("Indices of closest vectors:", I)  # 打印最相似向量的索引
    
    return D, I


# 查询最相似的5个嵌入
D, I = query_embedding_faiss(query, index, k=5)


Cosine Similarities: [[0.99999976 0.89624727 0.86308324 0.8596045  0.8551398 ]]
Indices of closest vectors: [[96 95 84 74 90]]


In [149]:
I[0][0]

96

In [125]:
query

{'ID': 2270,
 'Question Number': 107,
 'Share Context': '',
 'Share Image': '',
 'Question Text': 'Consider the expected returns and standard deviations for the following portfolios:',
 'Image': 'images/Foundationofriskmanagement1_images/107u.png',
 'Options': {'A': ' Portfolio 1',
  'B': ' Portfolio 2',
  'C': ' Portfolio 3',
  'D': ' Portfolio 4'},
 'Answer': 'A',
 'Explanation': 'Portfolio 1 is not efficient because it has a lower expected return and higher risk than Portfolios 2, 3, and 4. The portfolio is not mean variance efficient due to its suboptimal risk-return profile.',
 'QA Type': 'Knowledge reasoning QA',
 'Question Type': 'text+image',
 'Level of Difficulty': 'Easy',
 'Knowledge Topics': 'mean variance efficiency, portfolio analysis',
 'General Topics': 'Foundation of Risk Management',
 'Book Label': 'foundation of risk management1'}

In [151]:
data[I[0][1]]

{'ID': 2266,
 'Question Number': 103,
 'Share Context': '',
 'Share Image': '',
 'Question Text': 'Which of the following portfolios falls below the Markowitz efficient frontier?',
 'Image': 'images/Foundationofriskmanagement1_images/103u.png',
 'Options': {'A': ' Portfolio A',
  'B': ' Portfolio B',
  'C': ' Portfolio C',
  'D': ' Portfolio D'},
 'Answer': 'B',
 'Explanation': 'Portfolio B is inefficient (falls below the efficient frontier) because for the same risk level (8.7%), you could have portfolio C with a higher expected return (15.1% versus 14.2%). This makes portfolio B suboptimal for the given risk-return profile.',
 'QA Type': 'Knowledge reasoning QA',
 'Question Type': 'text+image',
 'Level of Difficulty': 'Easy',
 'Knowledge Topics': 'Markowitz efficient frontier, portfolio theory',
 'General Topics': 'Foundation of Risk Management',
 'Book Label': 'foundation of risk management1'}

In [146]:
# 清空Faiss索引中的所有向量
index.reset()

# 检查索引是否清空
print("Number of vectors after reset:", index.ntotal)


Number of vectors after reset: 0
