In [None]:
from transformers import Blip2ForConditionalGeneration, Blip2Processor, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training, PeftModel
from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
import torch
from PIL import Image


# The processor and the base model weights are pulled from the same checkpoint id.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b",
    device_map="auto",
)

# Device string that the inference cells move their input tensors to.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on {device}")

# Wrap the base model with the adapter saved on disk, then switch to eval mode.
peft_model = PeftModel.from_pretrained(
    model,
    "../blip2/Model_2/blip-saved-model",
)
peft_model.eval()


In [None]:
from datasets import DatasetDict, load_from_disk
# Read the two preprocessed splits back from disk and group them in a DatasetDict.
validation = load_from_disk('./PreprocessedData/validation')
train = load_from_disk('./PreprocessedData/train')

dataset_dict = DatasetDict(
    {
        'train': train,
        'validation': validation,
    }
)
dataset_dict

In [None]:
import os
import pandas as pd
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

def collate_fn(batch, pad_token_id=0, label_pad_token_id=-100):
    """Collate per-sample encodings into a single padded batch.

    Args:
        batch: Non-empty sequence of mappings, each holding tensors under the
            keys 'input_ids', 'attention_mask', 'pixel_values' and 'labels'.
            The three sequence tensors may differ in their first dimension;
            all 'pixel_values' tensors must share one shape.
        pad_token_id: Fill value for padded 'input_ids' positions. Defaults to
            0; padded positions also receive attention_mask 0. Pass the
            tokenizer's pad id (e.g. via functools.partial) if the model needs it.
        label_pad_token_id: Fill value for padded 'labels' positions. Defaults
            to -100, the default ``ignore_index`` of torch's cross-entropy
            loss, so padding does not contribute to the loss.

    Returns:
        dict with batch-first tensors under 'input_ids', 'attention_mask',
        'pixel_values' and 'labels'.

    Raises:
        ValueError: if ``batch`` is empty.
    """
    if not batch:
        raise ValueError("collate_fn received an empty batch")

    pad_sequence = torch.nn.utils.rnn.pad_sequence

    # Pad each variable-length field to the longest sequence in the batch.
    input_ids = pad_sequence(
        [item['input_ids'] for item in batch],
        batch_first=True,
        padding_value=pad_token_id,
    )
    attention_mask = pad_sequence(
        [item['attention_mask'] for item in batch],
        batch_first=True,
        padding_value=0,
    )
    # Labels must not be padded with a real token id, otherwise the padded
    # positions would be scored as if that token were the expected output.
    labels = pad_sequence(
        [item['labels'] for item in batch],
        batch_first=True,
        padding_value=label_pad_token_id,
    )

    # Images already share one shape, so they only need stacking.
    pixel_values = torch.stack([item['pixel_values'] for item in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'pixel_values': pixel_values,
        'labels': labels,
    }

# Dataset wrapper that turns raw VQA rows into processor encodings
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset yielding one processor encoding per VQA row.

    Every row of the wrapped dataset must provide the keys 'image_path',
    'question' and 'answer'.
    """

    def __init__(self, dataset, processor):
        """Store the row source and the processor used to encode each row.

        Args:
            dataset: Indexable, sized collection of rows (see class docstring).
            processor: Callable as ``processor(image, text, ...)`` and exposing
                a ``tokenizer`` with an ``encode`` method.
        """
        self.dataset = dataset
        self.processor = processor

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and encode it with its question and answer.

        Returns:
            The processor's encoding with an added 'labels' entry; the leading
            batch dimension is removed from every tensor.
        """
        row = self.dataset[idx]
        # Normalise backslash path separators to forward slashes.
        image_path = row['image_path'].replace('\\', '/')
        question = row['question']
        answer = row['answer']

        # Decode inside a context manager so the file handle is closed promptly.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # Use the BLIP2 processor to process image and question
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="longest"` replaces the deprecated `pad_to_max_length=True` flag.
        # NOTE(review): for a single sequence this adds no padding, so cross-sample
        # padding is left to collate_fn — confirm the installed transformers version
        # resolved the old flag the same way.
        labels = self.processor.tokenizer.encode(
            answer, padding="longest", return_tensors='pt'
        )
        encoding["labels"] = labels

        # Drop only the leading batch dimension. A bare squeeze() would also
        # collapse any other size-1 dimension (e.g. a one-token label sequence).
        for key in list(encoding.keys()):
            encoding[key] = encoding[key].squeeze(0)
        return encoding

# Batch size referenced by the (currently disabled) DataLoader lines below.
batch_size = 16

# Wrap each split so that indexing it yields a processor encoding.
validation_dataset = VQADataset(dataset_dict['validation'], processor)
train_dataset = VQADataset(dataset_dict['train'], processor)

# DataLoader construction is disabled in this notebook; kept for reference.
#train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, collate_fn=collate_fn)
#valid_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, collate_fn=collate_fn)

In [None]:
# Encode the first validation row via VQADataset and list the keys of the result.
validation_dataset[0].keys()

In [None]:
# Show the first raw validation row, as stored on disk (not passed through VQADataset).
dataset_dict['validation'][0]

In [None]:
# Sanity check: run the adapter-wrapped model on one validation row.
data = dataset_dict['validation'][100]

image_path = data['image_path'].replace('\\', '/')
# NOTE(review): the suffix is appended without a separating space and ends with a
# trailing space; VQADataset encodes the bare question without it. Confirm this is
# the prompt format the adapter was trained with.
question = data['question'] + "Answer: "
true_answer = data['answer']

# Load and process the image and question. The context manager closes the file
# handle once the RGB copy has been made.
with Image.open(image_path) as raw_image:
    image = raw_image.convert('RGB')
inputs = processor(image, question, return_tensors="pt").to(device)

# Generate prediction through `peft_model`, the adapter-wrapped model loaded in the
# setup cell, rather than relying on the adapter having been injected into `model`.
with torch.no_grad():
    out = peft_model.generate(**inputs, max_new_tokens=50)
predicted_answer = processor.decode(out[0], skip_special_tokens=True).strip()

# show the image
image.show()
print(f"Question: {question}")
print(f"True answer: {true_answer}")
print(f"Predicted answer: {predicted_answer}")


In [None]:
import random
import pandas as pd

# Evaluate on a reproducible random subset of the validation split.
num_samples = 1000
sample_seed = 42  # fixed seed so reruns draw the same rows

validation_split = dataset_dict['validation']
# Sample row indices instead of converting the whole split to a list, and cap the
# sample size at the split size so a small split does not raise ValueError.
sample_size = min(num_samples, len(validation_split))
sampled_indices = random.Random(sample_seed).sample(range(len(validation_split)), sample_size)
sampled_validation_data = [validation_split[row_idx] for row_idx in sampled_indices]

# List to store the results
results = []

# Run inference on each sample
for data in tqdm(sampled_validation_data):
    image_path = data['image_path'].replace('\\', '/')
    # NOTE(review): the suffix is appended without a separating space and ends with
    # a trailing space — confirm this matches the prompt format used in fine-tuning.
    question = data['question'] + "Answer: "
    true_answer = data['answer']

    # Load and process the image and question. The context manager releases the
    # file handle on every iteration instead of leaving it to garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert('RGB')
    inputs = processor(image, question, return_tensors="pt").to(device)

    # Check for empty input_ids
    if inputs['input_ids'].size(1) == 0:
        print(f"Skipping sample with empty input_ids for question: {question}")
        continue

    # Generate through `peft_model`, the adapter-wrapped model loaded in the setup
    # cell; max_new_tokens bounds the length of the generated answer.
    with torch.no_grad():
        out = peft_model.generate(**inputs, max_new_tokens=50)
    predicted_answer = processor.decode(out[0], skip_special_tokens=True).strip()

    # Append the result
    results.append({
        "question": question,
        "true_answer": true_answer,
        "predicted_answer": predicted_answer
    })

# Save results to a CSV file
df = pd.DataFrame(results)
df.to_csv("validation_results.csv", index=False)

print("Results saved to validation_results.csv")
