In [1]:
# 1. Report library versions and GPU status
import torch
import transformers

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")

if cuda_ok:
    bytes_per_gib = 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")

    # Query device 0 through the torch.cuda memory APIs, one figure at a time:
    # the card's total capacity, then what PyTorch has allocated and reserved.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Total GPU Memory: {total_bytes/bytes_per_gib:.2f}GB")

    allocated_bytes = torch.cuda.memory_allocated(0)
    print(f"Allocated GPU Memory: {allocated_bytes/bytes_per_gib:.2f}GB")

    reserved_bytes = torch.cuda.memory_reserved(0)
    print(f"Reserved GPU Memory: {reserved_bytes/bytes_per_gib:.2f}GB")

PyTorch version: 2.5.1+cu121
Transformers version: 4.53.2
CUDA available: True
GPU: NVIDIA GeForce RTX 3090
Total GPU Memory: 24.00GB
Allocated GPU Memory: 0.00GB
Reserved GPU Memory: 0.00GB


In [3]:
# 2. Clean up GPU memory
import gc

# Run Python's garbage collector, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

if torch.cuda.is_available():
    bytes_per_gib = 1024**3

    allocated_gb = torch.cuda.memory_allocated(0) / bytes_per_gib
    print(f"Allocated Memory after cleanup: {allocated_gb:.2f}GB")

    reserved_gb = torch.cuda.memory_reserved(0) / bytes_per_gib
    print(f"Reserved Memory after cleanup: {reserved_gb:.2f}GB")

Allocated Memory after cleanup: 0.00GB
Reserved Memory after cleanup: 0.00GB


In [4]:
# 3. Try the smallest InstructBLIP model
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
from PIL import Image
import requests

# Use the flan-t5-xl variant (the smaller one)
model_name = "Salesforce/instructblip-flan-t5-xl"
print(f"Loading model: {model_name}")

# Load the weights as float16 to reduce memory usage
load_options = dict(
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model = InstructBlipForConditionalGeneration.from_pretrained(model_name, **load_options)

# The processor is loaded from the same checkpoint name as the model.
processor = InstructBlipProcessor.from_pretrained(model_name)

print("Model loaded successfully")

Loading model: Salesforce/instructblip-flan-t5-xl


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Model loaded successfully


In [6]:
# 4. Move the model to the GPU and check memory
# Fall back to the CPU when CUDA is not available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = model.to(device)
print(f"Model on {device}")

if device == "cuda":
    # Reported figure is the memory reserved by PyTorch on device 0.
    reserved_bytes = torch.cuda.memory_reserved(0)
    print(f" Memory after model loading: {reserved_bytes/1024**3:.2f}GB")

Model on cuda
 Memory after model loading: 9.10GB


In [24]:
# 5. Test with a simple image
# Alternative (disabled): fetch the LAVIS demo picture over HTTP instead of a local file.
# url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
# image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
image_path = "./RSGPT/dataset/RSIEval/images/P1384_0054.png"
image = Image.open(image_path).convert("RGB")
prompt = "All the cars on the road?"

# Prepare the inputs: encode image + prompt as PyTorch tensors, then move them
# to the same device as the model.
print("Processing inputs...")
encoded = processor(images=image, text=prompt, return_tensors="pt")
inputs = encoded.to(device)
print("Inputs prepared")

Processing inputs...
Inputs prepared


In [None]:
# 6. Generate an answer with the RSGPT-style sampling settings
import time

# Sampling (do_sample=True) draws from the global RNG, so seed it here to make
# re-running this cell reproduce the same text.
GENERATION_SEED = 42
torch.manual_seed(GENERATION_SEED)

# Decoding configurations tried earlier and replaced by the one below:
#   - greedy:   max_new_tokens=150, do_sample=False, num_beams=1
#   - beam:     max_new_tokens=80, do_sample=False, num_beams=3,
#               repetition_penalty=1.2, no_repeat_ngram_size=3, early_stopping=True
#   - sampling: max_new_tokens=50, do_sample=True, temperature=0.7, top_p=0.9,
#               repetition_penalty=1.5
generation_kwargs = dict(
    max_new_tokens=300,      # value RSGPT uses
    num_beams=1,             # RSGPT uses 1 for dialogue
    do_sample=True,          # RSGPT uses True for dialogue
    top_p=0.9,
    repetition_penalty=1.0,  # RSGPT uses 1.0 for dialogue
    temperature=1.0,
)

print("Starting generation...")
# perf_counter is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model.generate(**inputs, **generation_kwargs)

end_time = time.perf_counter()
print(f"Generation completed in {end_time - start_time:.2f} seconds")

# Decode the output: take the first (only) sequence and drop special tokens
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
print(f"\nGenerated text: {generated_text}")

Starting generation...
Generation completed in 0.14 seconds

Generated text: yes
