In [1]:
import torch
from PIL import Image
import open_clip
from open_clip import tokenizer
import subprocess
import os
import numpy as np

from transformers import BertTokenizer, BertForQuestionAnswering


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

# openclip

In [2]:
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k', device='cuda')
vit_tokenizer = open_clip.get_tokenizer('ViT-B-32')

In [6]:
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_to_rgb at 0x7f068e64e830>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [7]:
model.eval()
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Context length: 77
Vocab size: 49408


In [14]:
vit_tokenizer.encode("Which object can be found in a jazz club")

[1448, 14115, 753, 655, 1546, 530, 320, 4528, 1736]

In [27]:
image = Image.open('data/KG_VQA/fvqa/exp_data/images/images/COCO_val2014_000000000136.jpg').convert("RGB")
image_input = preprocess(image).unsqueeze(0)  # Unsqueeze 添加一个批次维度
text_tokens = tokenizer.tokenize("Which object can be found in a jazz club")

with torch.no_grad():
    image_features = model.encode_image(image_input).float()
    text_features = model.encode_text(text_tokens).float()
    
image_features.shape, text_features.shape

(torch.Size([1, 512]), torch.Size([1, 512]))

In [None]:

image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

# bert

In [3]:
# 加载预训练的 BERT 模型和分词器
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Question: What is the capital of France?
Answer: paris


In [4]:

# 假设这是问题和上下文
question = "What is the capital of France?"
context = "Paris is the capital and most populous city of France."

# 编码问题和上下文
inputs = tokenizer(question, context, return_tensors='pt')
input_ids = inputs['input_ids'].tolist()[0]

# 获取答案
with torch.no_grad():
    outputs = model(**inputs)
    answer_start_scores = outputs.start_logits
    answer_end_scores = outputs.end_logits

# 找到答案的开始和结束位置
answer_start = torch.argmax(answer_start_scores)
answer_end = torch.argmax(answer_end_scores) + 1

# 转换回文本
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

print(f"Question: {question}")
print(f"Answer: {answer}")


Question: What is the capital of France?
Answer: paris


# gpt2

In [3]:
from transformers import GPT2Model, GPT2Config

# 加载或创建适合任务的 Transformer 模型
configuration = GPT2Config.from_pretrained('gpt2', n_embd=4096)  # 确保输入尺寸匹配
transformer_model = GPT2Model(configuration)




OSError: We couldn't connect to 'https://huggingface.co' to load this file, couldn't find it in the cached files and it looks like gpt2 is not the path to a directory containing a file named config.json.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:

# 使用 Transformer 模型处理融合的特征
transformer_output = transformer_model(inputs_embeds=4096)
