In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from PIL import Image
import open_clip
from open_clip import tokenizer
import subprocess
import os
import numpy as np
from transformers import BertTokenizer, BertForQuestionAnswering

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
# 判断并选择设备
def select_device():
       if torch.cuda.is_available():
              device = torch.device("cuda")  # 优先使用CUDA（NVIDIA GPU）
              print("Using CUDA (GPU)")
       elif torch.backends.mps.is_available():
              device = torch.device("mps")  # 如果CUDA不可用但MPS可用，使用MPS（Apple Silicon）
              print("Using MPS (Apple Silicon)")
       else:
              device = torch.device("cpu")  # 如果都不可用，使用CPU
              print("Using CPU")
       return device

device = select_device()

Using CUDA (GPU)


# openclip test

In [4]:
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k', device='cuda')
vit_tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [5]:
clip_preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_to_rgb at 0x7f84d931a830>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [6]:
clip_model.eval()
context_length = clip_model.context_length
vocab_size = clip_model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in clip_model.parameters()]):,}")
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Context length: 77
Vocab size: 49408


In [7]:
# vit_tokenizer.encode("Which object can be found in a jazz club")
from open_clip import tokenizer
tokenizer.tokenize("Which object can be found in a jazz club")

tensor([[49406,  1448, 14115,   753,   655,  1546,   530,   320,  4528,  1736,
         49407,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]])

In [49]:
image = Image.open('data/KG_VQA/fvqa/exp_data/images/images/COCO_val2014_000000000136.jpg').convert("RGB")
image_input = clip_preprocess(image).unsqueeze(0).to(device)  # Unsqueeze 添加一个批次维度
text_tokens = tokenizer.tokenize("Which object can be found in a jazz club").to(device)

with torch.no_grad():
    image_features = clip_model.encode_image(image_input).float()
    text_features = clip_model.encode_text(text_tokens).float()
    
image_features.shape, text_features.shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [9]:

image = clip_preprocess(Image.open('data/KG_VQA/fvqa/exp_data/images/images/COCO_val2014_000000000136.jpg')).unsqueeze(0).to(device)
text = vit_tokenizer(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = clip_model.encode_image(image)
    text_features = clip_model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

Label probs: tensor([[0.3589, 0.4755, 0.1656]], device='cuda:0')


# BertForQuestionAnswering test 

In [5]:
# 加载预训练的 BERT 模型和分词器
from transformers import BertTokenizerFast
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
bert = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [32]:
# test 
# 假设这是问题和上下文
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."

# 编码问题和上下文
inputs = bert_tokenizer(question, context, return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]

# 获取答案
with torch.no_grad():
    outputs = bert(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

# 找到答案的开始和结束位置
answer_start = torch.argmax(answer_start_scores)
answer_end = torch.argmax(answer_end_scores) + 1

# 转换回文本
answer = bert_tokenizer.convert_tokens_to_string(bert_tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: What is the capital of France?
Answer: paris


In [36]:
context.find('Paris')

0

In [12]:
# BertForQuestionAnswering训练数据格式
# {
#   "context": "文本上下文，包含问题的答案。",
#   "question": "问题文本？",
#   "answers": {
#     "text": ["答案文本"],
#     "answer_start": ['答案在上下文中的起始字符位置']
#   }
# }


# 自定义数据集

In [23]:
# class VQADataset(Dataset):
#     def __init__(self, questions, contexts, answers, images, clip_processor, clip_model, tokenizer=None):
#         super(VQADataset, self).__init__()
#         self.questions = questions
#         self.contexts = contexts
#         self.answers = answers
#         self.images = images
#         self.clip_model = clip_model
#         self.clip_processor = clip_processor
#         self.tokenizer = tokenizer
        
#     def __len__(self):
#         return len(self.questions)

#     def __getitem__(self, idx):
#         question = self.questions[idx]
#         context = self.contexts[idx]
#         answer = self.answers[idx]
#         image_path = self.images[idx]

#         # 处理文本
#         encoded_dict = self.tokenizer.encode_plus(
#             question, 
#             context,
#             max_length=512,
#             padding='max_length',
#             truncation=True,
#             return_tensors='pt'
#         )
#         text_tokens = tokenizer.tokenize(encoded_dict['input_ids'].squeeze(0))
#         # 处理图像
#         image = Image.open(image_path)
#         image = self.clip_processor(image).unsqueeze(0)
        
#         with torch.no_grad():
#             image_features = clip_model.encode_image(image).float()
#             text_features = clip_model.encode_text(text_tokens).float()
        
#         # 计算答案的 token 位置
#         answer_start = context.find(answer)
#         start_position = encoded_dict.char_to_token(0, answer_start)
#         end_position = encoded_dict.char_to_token(0, answer_start + len(answer) - 1)
        
#         # 如果找不到答案位置，则设置为最大长度
#         if start_position is None:
#             start_position = self.tokenizer.model_max_length
#         if end_position is None:
#             end_position = self.tokenizer.model_max_length

#         return {
#             'text_features': text_features.squeeze(0),
#             'attention_mask': encoded_dict['attention_mask'],
#             'start_positions': start_position,
#             'end_positions': end_position,
#             'image_features': image_features.squeeze(0)
#         }


In [21]:
class VQADataset(Dataset):
    def __init__(self, questions, contexts, answers, images, clip_processor, clip_model, clip_tokenizer):
        super(VQADataset, self).__init__()
        self.questions = questions
        self.contexts = contexts
        self.answers = answers
        self.images = images
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        # TODO: 选择合适的分词器
        # 采用  openclip 的分词器对 context 进行编码
        # 采用  bert 的分词器对 question 进行编码
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.contexts[idx]
        answer = self.answers[idx]
        image_path = self.images[idx]

        # 处理文本
        text_input = question + " " + context  # OpenCLIP可能需要不同的输入格式
        # TODO：考虑下不用 openclip 的分词器的话，attention_mask的处理
        # encoded_dict = self.tokenizer.encode_plus(
        #     question, 
        #     context,
        #     max_length=512,
        #     padding='max_length',
        #     truncation=True,
        #     return_tensors='pt'
        # )
        encoded_dict = self.tokenizer.tokenize(text_input).to(device)

        # 处理图像
        image = Image.open(image_path)
        image = self.clip_processor(image).unsqueeze(0).to(device)
        
        with torch.no_grad():
            image_features = self.clip_model.encode_image(image).float()
            text_features = self.clip_model.encode_text(encoded_dict['input_ids']).float()
        
        # 计算答案的 token 位置
        answer_start = context.find(answer)
        start_position = encoded_dict.char_to_token(0, answer_start)
        end_position = encoded_dict.char_to_token(0, answer_start + len(answer) - 1)

        # 如果找不到答案位置，则设置为最大长度
        if start_position is None:
            start_position = self.tokenizer.model_max_length
        if end_position is None:
            end_position = self.tokenizer.model_max_length

        return {
            'text_features': text_features.squeeze(0),
            'attention_mask': encoded_dict['attention_mask'],
            'start_positions': start_position,
            'end_positions': end_position,
            'image_features': image_features.squeeze(0)
        }


In [14]:
# test
# 使用自定义 Dataset
# questions = ["What is in the picture?"]
# contexts = ["There is a dog in the picture."]
# answers = ["dog"]
# images = ['data/KG_VQA/fvqa/exp_data/images/images/COCO_val2014_000000000136.jpg']

# dataset = VQADataset(questions, contexts, answers, images, clip_preprocess, clip_model, tokenizer)

# # 创建 DataLoader
# data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

# # 遍历 DataLoader
# for data in data_loader:
#     print(data)

In [29]:
import json
from open_clip import tokenizer

project_root = os.getcwd()
train_data_dir = project_root+'/data/KG_VQA/fvqa/exp_data/train_seen_data'
test_data_dir = project_root+'/data/KG_VQA/fvqa/exp_data/test_unseen_data'
img_dir = project_root+"/data/KG_VQA/fvqa/exp_data/images/images"
sub_folders_train = ['train0', 'train1', 'train2', 'train3', 'train4']
sub_folders_test = ['test0', 'test1', 'test2', 'test3', 'test4']

def load_datasets(data_dir, sub_folders, img_dir):
    questions = []
    contexts = []
    answers = []
    img_names = []
    for folder in sub_folders:
        json_file = os.path.join(data_dir, folder, 'all_qs_dict_release_train_500.json' if 'train' in data_dir else 'all_qs_dict_release_test_500.json')
        with open(os.path.join(train_data_dir, json_file)) as f:
            data = json.load(f)
            keys = list(data.keys())
            for key in keys:
                questions.append(data[key]['question'])
                contexts.append(data[key]['fact_surface'].replace("[[", "").replace("]]", ""))
                answers.append(data[key]['answer'])
                img_names.append(os.path.join(img_dir, data[key]['img_file']))
            
    dataset = VQADataset(questions, contexts, answers, img_names, clip_preprocess, clip_model, tokenizer)
    
    return dataset

In [30]:
train_dataset = load_datasets(train_data_dir, sub_folders_train, img_dir)
test_dataset = load_datasets(test_data_dir, sub_folders_test, img_dir)
print("Train dataset size:", len(train_dataset))
print("Test dataset size:", len(test_dataset))

Train dataset size: 13662
Test dataset size: 13798


In [31]:
train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=10, shuffle=False)

# vqa 模型

In [19]:
class VQAModel(nn.Module):
    def __init__(self, bert_model, feature_dim):
        super(VQAModel, self).__init__()
        self.bert_model = bert_model
        self.feature_transform = nn.Linear(feature_dim, bert_model.config.hidden_size)
    
    def forward(self, text_features, image_features, attention_mask, token_type_ids):
        # 将图像特征转换为与BERT隐藏层大小相同
        transformed_image_features = self.feature_transform(image_features)
        
        # 拼接图像特征和文本特征
        combined_features = torch.cat((text_features, transformed_image_features), dim=1)
        
        # 可能需要调整attention_mask和token_type_ids以适应拼接后的长度
        
        # 传递到BERT模型
        outputs = self.bert_model(input_ids=None, inputs_embeds=combined_features, 
                                  attention_mask=attention_mask, token_type_ids=token_type_ids)
        return outputs
    
    
    def forward(self, text_features, image_features, attention_mask, token_type_ids):
        # 拼接图像和文本特征
        combined_features = torch.cat((text_features, image_features), dim=1)
        
        # 转换特征
        transformed_features = self.feature_transform(combined_features)

        # 扩展 attention_mask 以包括图像特征
        # 假设图像特征的每个部分都应该被模型注意到
        image_attention_mask = torch.ones(image_features.size(0), image_features.size(1))
        combined_attention_mask = torch.cat((attention_mask, image_attention_mask), dim=1)
        
        # 传递到BERT模型
        outputs = self.bert_model(input_ids=None, inputs_embeds=transformed_features, 
                                attention_mask=combined_attention_mask, token_type_ids=token_type_ids)
        return outputs



# 训练

In [12]:
from transformers import AdamW

model = VQAModel(bert_model=bert, feature_dim=512)
optimizer = AdamW(model.parameters(), lr=5e-5)
# 损失函数
loss_fn = nn.CrossEntropyLoss()




In [13]:
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []

    total_accuracy = 0
    total_precision = 0
    total_recall = 0
    total_f1 = 0

    with torch.no_grad():
        for batch in dataloader:
            text_features = batch['text_features'].to(device)
            image_features = batch['image_features'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            outputs = model(text_features, image_features, attention_mask, token_type_ids)
            start_logits, end_logits = outputs.start_logits, outputs.end_logits

            # 取最大logit的索引为预测的开始和结束位置
            start_preds = torch.argmax(start_logits, dim=-1)
            end_preds = torch.argmax(end_logits, dim=-1)

            # 计算准确率和其他指标
            predictions.append((start_preds, end_preds))
            true_labels.append((start_positions, end_positions))

            # 转换为numpy数组以便使用sklearn计算性能指标
            start_preds = start_preds.cpu().numpy()
            end_preds = end_preds.cpu().numpy()
            start_true = start_positions.cpu().numpy()
            end_true = end_positions.cpu().numpy()

            total_accuracy += (accuracy_score(start_true, start_preds) + accuracy_score(end_true, end_preds)) / 2
            total_precision += (precision_score(start_true, start_preds, average='macro') + precision_score(end_true, end_preds, average='macro')) / 2
            total_recall += (recall_score(start_true, start_preds, average='macro') + recall_score(end_true, end_preds, average='macro')) / 2
            total_f1 += (f1_score(start_true, start_preds, average='macro') + f1_score(end_true, end_preds, average='macro')) / 2

    num_batches = len(dataloader)
    print(f"Average Accuracy: {total_accuracy / num_batches}")
    print(f"Average Precision: {total_precision / num_batches}")
    print(f"Average Recall: {total_recall / num_batches}")
    print(f"Average F1 Score: {total_f1 / num_batches}")

    return predictions, true_labels


In [25]:
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        outputs = model(text_features=batch['text_features'], image_features=batch['image_features'],
                        attention_mask=batch['attention_mask'], token_type_ids=batch['token_type_ids'])
        start_logits, end_logits = outputs.start_logits, outputs.end_logits
        loss = loss_fn(start_logits, batch['start_positions']) + loss_fn(end_logits, batch['end_positions'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch}: Loss {total_loss / len(train_loader)}")
    
    # 评估模型
    model.eval()
    evaluate_model(model, test_loader, device)


RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor