In [None]:
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

# Pretrained BLIP VQA checkpoint used for both the model and its processor
_blip_checkpoint = "Salesforce/blip-vqa-base"
model = BlipForQuestionAnswering.from_pretrained(_blip_checkpoint)
processor = BlipProcessor.from_pretrained(_blip_checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Free cached GPU memory and seed torch's RNG
torch.cuda.empty_cache()
torch.manual_seed(42)

class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that converts VQA dataframe rows into model inputs.

    Each row is read through its ``question`` and ``answer`` columns and its
    ``image`` column, whose ``['bytes']`` entry holds the encoded image.
    """

    def __init__(self, dataframe, processor, max_answer_length=8):
        """
        Args:
            dataframe: pandas DataFrame with ``question``, ``answer`` and ``image`` columns.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataframe = dataframe
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor encoding for row ``idx`` with a ``labels`` entry added."""
        row = self.dataframe.iloc[idx]
        question = row['question']
        answer = row['answer']
        image = Image.open(BytesIO(row['image']['bytes'])).convert("RGB")

        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also drop any other dimension of size 1.
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding

# Parquet shards of the VQA-RAD dataset; only the train shard is read here
splits = {'train': 'data/train-00000-of-00001-eb8844602202be60.parquet', 'test': 'data/test-00000-of-00001-e5bc3d208bb4deeb.parquet'}
df = pd.read_parquet("hf://datasets/flaviagiammarino/vqa-rad/" + splits["train"])

# Seeded 90% sample for training; the remaining rows form the validation set
train_df = df.sample(frac=0.9, random_state=42)
valid_df = df.drop(index=train_df.index)

# Wrap both frames in the dataset class
train_dataset, valid_dataset = (
    VQADataset(dataframe=frame, processor=processor) for frame in (train_df, valid_df)
)

# Loaders share batch size and pinned memory; only the training loader shuffles
batch_size = 12
_loader_options = dict(batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)


In [None]:
# Fine-tune with AdamW, an exponentially decaying learning rate, CUDA mixed
# precision and early stopping on the mean validation loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 10  # training stops once more than this many consecutive epochs fail to improve
min_eval_loss = float("inf")
early_stopping_hook = 0  # consecutive epochs without a new best validation loss
tracking_information = []  # one (train_loss, valid_loss, lr) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            # Questions are padded to a fixed length by the dataset, so the
            # attention mask must be passed to keep padding out of attention.
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        epoch_loss += loss.item()
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                # Same inputs as in training so both losses are comparable
                outputs = model(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

            eval_loss += loss.item()

    # Mean per-batch losses for this epoch
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    tracking_information.append((train_loss, valid_loss, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {train_loss:.4f} - Validation loss: {valid_loss:.4f} - LR: {optimizer.param_groups[0]['lr']:.4e}")

    scheduler.step()

    if valid_loss < min_eval_loss:
        # New best validation loss: checkpoint model and processor together
        model.save_pretrained("Model/blip-saved-model")
        processor.save_pretrained("Model/blip-saved-model")
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = valid_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered.")
            break

# Context manager guarantees the pickle file is flushed and closed
with open("tracking_information.pkl", "wb") as tracking_file:
    pickle.dump(tracking_information, tracking_file)
print("The finetuning process is complete!")


# ImageCLEF-2019 VQA-Med: Dataset Preprocessing and Fine-tuning

In [None]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

# Pretrained BLIP VQA checkpoint used for both the model and its processor
_blip_checkpoint = "Salesforce/blip-vqa-base"
model = BlipForQuestionAnswering.from_pretrained(_blip_checkpoint)
processor = BlipProcessor.from_pretrained(_blip_checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Free cached GPU memory and seed torch's RNG
torch.cuda.empty_cache()
torch.manual_seed(42)

# Define the dataset class
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs QA rows with ``<image_id>.jpg`` files.

    Each row is read through its ``image_id``, ``question`` and ``answer``
    columns; the image is opened from ``image_folder``.
    """

    def __init__(self, dataframe, image_folder, processor, max_answer_length=8):
        """
        Args:
            dataframe: pandas DataFrame with ``image_id``, ``question`` and ``answer`` columns.
            image_folder: directory containing one ``<image_id>.jpg`` per image.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor encoding for row ``idx`` with a ``labels`` entry added."""
        row = self.dataframe.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        answer = row['answer']

        # Load image
        image_path = os.path.join(self.image_folder, f"{image_id}.jpg")
        image = Image.open(image_path).convert("RGB")

        # Process image and text
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding

# Pipe-separated QA file and the folder holding the training images
data_path = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/All_QA_Pairs_train.txt"
image_folder = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/Train_images"

# Parse `image_id|question|answer` lines; lines without exactly three fields are skipped
_qa_columns = ('image_id', 'question', 'answer')
with open(data_path, 'r') as file:
    _split_lines = (line.strip().split('|') for line in file)
    data = [dict(zip(_qa_columns, parts)) for parts in _split_lines if len(parts) == 3]

df = pd.DataFrame(data)

# Seeded 90% sample for training; the remaining rows form the validation set
train_df = df.sample(frac=0.9, random_state=42)
valid_df = df.drop(index=train_df.index)

# Wrap both frames in the dataset class
train_dataset, valid_dataset = (
    VQADataset(dataframe=frame, image_folder=image_folder, processor=processor)
    for frame in (train_df, valid_df)
)

# Loaders share batch size and pinned memory; only the training loader shuffles
batch_size = 12
_loader_options = dict(batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)


In [None]:
# Fine-tune with AdamW, an exponentially decaying learning rate, CUDA mixed
# precision and early stopping on the mean validation loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 4  # training stops once more than this many consecutive epochs fail to improve
min_eval_loss = float("inf")
early_stopping_hook = 0  # consecutive epochs without a new best validation loss
tracking_information = []  # one (train_loss, valid_loss, lr) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            # Pass the attention mask in training as well: validation already
            # uses it, and without it the padded question tokens are attended to.
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        epoch_loss += loss.item()
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

            eval_loss += loss.item()

    # Mean per-batch losses for this epoch
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    tracking_information.append((train_loss, valid_loss, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {train_loss:.4f} - Validation loss: {valid_loss:.4f} - LR: {optimizer.param_groups[0]['lr']:.4e}")

    scheduler.step()

    if valid_loss < min_eval_loss:
        # New best validation loss: checkpoint model and processor together
        model.save_pretrained("Model_2/blip-saved-model")
        processor.save_pretrained("Model_2/blip-saved-model")
        print("Saved model to Model_2/blip-saved-model")
        min_eval_loss = valid_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered.")
            break

# NOTE(review): the VQA-RAD training cell writes the same file name, so this
# run replaces its history — confirm that is intended.
with open("tracking_information.pkl", "wb") as tracking_file:
    pickle.dump(tracking_information, tracking_file)
print("The finetuning process is complete!")


In [None]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

# Checkpoint directory that the training cells of this notebook save to
_blip_checkpoint = "Model/blip-saved-model"
model = BlipForQuestionAnswering.from_pretrained(_blip_checkpoint)
processor = BlipProcessor.from_pretrained(_blip_checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Free cached GPU memory and seed torch's RNG
torch.cuda.empty_cache()
torch.manual_seed(42)

# Define the dataset class
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs QA rows with ``<image_id>.jpg`` files.

    Each row is read through its ``image_id``, ``question`` and ``answer``
    columns; the image is opened from ``image_folder``.
    """

    def __init__(self, dataframe, image_folder, processor, max_answer_length=8):
        """
        Args:
            dataframe: pandas DataFrame with ``image_id``, ``question`` and ``answer`` columns.
            image_folder: directory containing one ``<image_id>.jpg`` per image.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor encoding for row ``idx`` with a ``labels`` entry added."""
        row = self.dataframe.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        answer = row['answer']

        # Load image
        image_path = os.path.join(self.image_folder, f"{image_id}.jpg")
        image = Image.open(image_path).convert("RGB")

        # Process image and text
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding

# Pipe-separated QA file (plane category) and the folder holding the training images
data_path = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/QAPairsByCategory/C2_Plane_train.txt"
image_folder = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/Train_images"

# Parse `image_id|question|answer` lines; lines without exactly three fields are skipped
_qa_columns = ('image_id', 'question', 'answer')
with open(data_path, 'r') as file:
    _split_lines = (line.strip().split('|') for line in file)
    data = [dict(zip(_qa_columns, parts)) for parts in _split_lines if len(parts) == 3]

df = pd.DataFrame(data)

# Seeded 90% sample for training; the remaining rows form the validation set
train_df = df.sample(frac=0.9, random_state=42)
valid_df = df.drop(index=train_df.index)

# Wrap both frames in the dataset class
train_dataset, valid_dataset = (
    VQADataset(dataframe=frame, image_folder=image_folder, processor=processor)
    for frame in (train_df, valid_df)
)

# Loaders share batch size and pinned memory; only the training loader shuffles
batch_size = 12
_loader_options = dict(batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)


In [None]:
# Fine-tune with AdamW, an exponentially decaying learning rate, CUDA mixed
# precision and early stopping on the mean validation loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 3  # training stops once more than this many consecutive epochs fail to improve
min_eval_loss = float("inf")
early_stopping_hook = 0  # consecutive epochs without a new best validation loss
tracking_information = []  # one (train_loss, valid_loss, lr) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            # Pass the attention mask in training as well: validation already
            # uses it, and without it the padded question tokens are attended to.
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        epoch_loss += loss.item()
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

            eval_loss += loss.item()

    # Mean per-batch losses for this epoch
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    tracking_information.append((train_loss, valid_loss, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {train_loss:.4f} - Validation loss: {valid_loss:.4f} - LR: {optimizer.param_groups[0]['lr']:.4e}")

    scheduler.step()

    if valid_loss < min_eval_loss:
        # New best validation loss: checkpoint model and processor together
        model.save_pretrained("Model/blip-saved-model")
        processor.save_pretrained("Model/blip-saved-model")
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = valid_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered.")
            break

# Context manager guarantees the pickle file is flushed and closed
with open("tracking_information_1.pkl", "wb") as tracking_file:
    pickle.dump(tracking_information, tracking_file)
print("The finetuning process is complete!")


In [None]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

# Checkpoint directory that the training cells of this notebook save to
_blip_checkpoint = "Model/blip-saved-model"
model = BlipForQuestionAnswering.from_pretrained(_blip_checkpoint)
processor = BlipProcessor.from_pretrained(_blip_checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Free cached GPU memory and seed torch's RNG
torch.cuda.empty_cache()
torch.manual_seed(42)

# Define the dataset class
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs QA rows with ``<image_id>.jpg`` files.

    Each row is read through its ``image_id``, ``question`` and ``answer``
    columns; the image is opened from ``image_folder``.
    """

    def __init__(self, dataframe, image_folder, processor, max_answer_length=8):
        """
        Args:
            dataframe: pandas DataFrame with ``image_id``, ``question`` and ``answer`` columns.
            image_folder: directory containing one ``<image_id>.jpg`` per image.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor encoding for row ``idx`` with a ``labels`` entry added."""
        row = self.dataframe.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        answer = row['answer']

        # Load image
        image_path = os.path.join(self.image_folder, f"{image_id}.jpg")
        image = Image.open(image_path).convert("RGB")

        # Process image and text
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding

# Pipe-separated QA file (organ category) and the folder holding the training images
data_path = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/QAPairsByCategory/C3_Organ_train.txt"
image_folder = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/Train_images"

# Parse `image_id|question|answer` lines; lines without exactly three fields are skipped
_qa_columns = ('image_id', 'question', 'answer')
with open(data_path, 'r') as file:
    _split_lines = (line.strip().split('|') for line in file)
    data = [dict(zip(_qa_columns, parts)) for parts in _split_lines if len(parts) == 3]

df = pd.DataFrame(data)

# Seeded 90% sample for training; the remaining rows form the validation set
train_df = df.sample(frac=0.9, random_state=42)
valid_df = df.drop(index=train_df.index)

# Wrap both frames in the dataset class
train_dataset, valid_dataset = (
    VQADataset(dataframe=frame, image_folder=image_folder, processor=processor)
    for frame in (train_df, valid_df)
)

# Loaders share batch size and pinned memory; only the training loader shuffles
batch_size = 12
_loader_options = dict(batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)


In [None]:
# Fine-tune with AdamW, an exponentially decaying learning rate, CUDA mixed
# precision and early stopping on the mean validation loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 3  # training stops once more than this many consecutive epochs fail to improve
min_eval_loss = float("inf")
early_stopping_hook = 0  # consecutive epochs without a new best validation loss
tracking_information = []  # one (train_loss, valid_loss, lr) tuple per epoch
scaler = torch.cuda.amp.GradScaler()

loss_val = []  # loss of every training step across all epochs (plotted in a later cell)
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            # Pass the attention mask in training as well: validation already
            # uses it, and without it the padded question tokens are attended to.
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        step_loss = loss.item()
        epoch_loss += step_loss
        loss_val.append(step_loss)
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

            eval_loss += loss.item()

    # Mean per-batch losses for this epoch
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    tracking_information.append((train_loss, valid_loss, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {train_loss:.4f} - Validation loss: {valid_loss:.4f} - LR: {optimizer.param_groups[0]['lr']:.4e}")

    scheduler.step()

    if valid_loss < min_eval_loss:
        # New best validation loss: checkpoint model and processor together
        model.save_pretrained("Model/blip-saved-model")
        processor.save_pretrained("Model/blip-saved-model")
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = valid_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered.")
            break

# Context manager guarantees the pickle file is flushed and closed
with open("tracking_information_2.pkl", "wb") as tracking_file:
    pickle.dump(tracking_information, tracking_file)
print("The finetuning process is complete!")


In [None]:
import matplotlib.pyplot as plt
# Dump every recorded training-step loss to disk, one value per line
with open('loss_list_blip2.csv', 'w') as f:
    f.writelines(f"{item}\n" for item in loss_val)

# Plot every `step`-th loss from the selected window of training steps
# NOTE(review): the 240:481 window and the "Epoch 1" title are hard-coded —
# confirm they match the number of batches per epoch of the run being plotted.
step = 2
loss_epoch = loss_val[240:481]
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(loss_epoch[::step], color='blue', linewidth=1.0)
ax.set_title("Epoch 1")
ax.set_xlabel("Steps")
ax.set_ylabel("Loss")
plt.show()

In [None]:
import os
import pandas as pd
from transformers import BlipProcessor, BlipForQuestionAnswering
from datasets import Dataset
import torch
from PIL import Image
from torch.utils.data import DataLoader
from tqdm import tqdm
import pickle
from io import BytesIO

# Checkpoint directory that the training cells of this notebook save to
_blip_checkpoint = "Model/blip-saved-model"
model = BlipForQuestionAnswering.from_pretrained(_blip_checkpoint)
processor = BlipProcessor.from_pretrained(_blip_checkpoint)

# Use the GPU when one is visible, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

# Free cached GPU memory and seed torch's RNG
torch.cuda.empty_cache()
torch.manual_seed(42)

# Define the dataset class
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs QA rows with ``<image_id>.jpg`` files.

    Each row is read through its ``image_id``, ``question`` and ``answer``
    columns; the image is opened from ``image_folder``.
    """

    def __init__(self, dataframe, image_folder, processor, max_answer_length=8):
        """
        Args:
            dataframe: pandas DataFrame with ``image_id``, ``question`` and ``answer`` columns.
            image_folder: directory containing one ``<image_id>.jpg`` per image.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataframe = dataframe
        self.image_folder = image_folder
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of rows in the underlying dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the processor encoding for row ``idx`` with a ``labels`` entry added."""
        row = self.dataframe.iloc[idx]
        image_id = row['image_id']
        question = row['question']
        answer = row['answer']

        # Load image
        image_path = os.path.join(self.image_folder, f"{image_id}.jpg")
        image = Image.open(image_path).convert("RGB")

        # Process image and text
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding

# Pipe-separated QA file (abnormality category) and the folder holding the training images
data_path = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/QAPairsByCategory/C4_Abnormality_train.txt"
image_folder = "data/ImageClef-2019-VQA-Med/ImageClef-2019-VQA-Med-Training/Train_images"

# Parse `image_id|question|answer` lines; lines without exactly three fields are skipped
_qa_columns = ('image_id', 'question', 'answer')
with open(data_path, 'r') as file:
    _split_lines = (line.strip().split('|') for line in file)
    data = [dict(zip(_qa_columns, parts)) for parts in _split_lines if len(parts) == 3]

df = pd.DataFrame(data)

# Seeded 90% sample for training; the remaining rows form the validation set
train_df = df.sample(frac=0.9, random_state=42)
valid_df = df.drop(index=train_df.index)

# Wrap both frames in the dataset class
train_dataset, valid_dataset = (
    VQADataset(dataframe=frame, image_folder=image_folder, processor=processor)
    for frame in (train_df, valid_df)
)

# Loaders share batch size and pinned memory; only the training loader shuffles
batch_size = 12
_loader_options = dict(batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)


# BLIP2 - 227k Dataset

In [None]:
import pandas as pd
from PIL import Image
from datasets import DatasetDict, load_from_disk
import os
from transformers import ViltConfig
import torch
import io
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from tqdm import tqdm

In [None]:
# Read the two preprocessed splits back from disk
train = load_from_disk('./PreprocessedData/train')
validation = load_from_disk('./PreprocessedData/validation')

# Bundle both splits under their split names and display the result
dataset_dict = DatasetDict(train=train, validation=validation)
dataset_dict

In [None]:
# Ensure image paths are processed correctly
class VQADataset(torch.utils.data.Dataset):
    """Map-style dataset that converts records of an indexable dataset into model inputs.

    Each record is read through its ``image_path``, ``question`` and
    ``answer`` keys.
    """

    def __init__(self, dataset, processor, max_answer_length=8):
        """
        Args:
            dataset: indexable collection of records with ``image_path``,
                ``question`` and ``answer`` keys.
            processor: callable applied to ``(image, question)``; its ``tokenizer``
                attribute encodes the answer.
            max_answer_length: fixed token length of the encoded answer
                (defaults to 8, the value that used to be hard-coded).
        """
        self.dataset = dataset
        self.processor = processor
        self.max_answer_length = max_answer_length

    def __len__(self):
        """Return the number of records in the underlying dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the processor encoding for record ``idx`` with a ``labels`` entry added."""
        data = self.dataset[idx]
        # Stored paths may use backslashes; normalise them to forward slashes
        image_path = data['image_path'].replace('\\', '/')
        question = data['question']
        answer = data['answer']
        image = Image.open(image_path).convert('RGB')

        # Use the BLIP2 processor to process image and question
        encoding = self.processor(image, question, padding="max_length", truncation=True, return_tensors="pt")
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        # `truncation=True` is required so that long answers are cut to the fixed
        # length; otherwise label tensors differ in size and cannot be batched.
        labels = self.processor.tokenizer.encode(
            answer,
            max_length=self.max_answer_length,
            padding="max_length",
            truncation=True,
            return_tensors='pt',
        )
        # The T5 language-model loss only ignores label id -100, so padding
        # positions are masked to keep them out of the loss.
        pad_id = self.processor.tokenizer.pad_token_id
        if pad_id is not None:
            labels[labels == pad_id] = -100
        encoding["labels"] = labels
        # Remove only the leading batch dimension added by return_tensors="pt"
        for k, v in encoding.items():
            encoding[k] = v.squeeze(0)
        return encoding
    
# The BLIP2 processor has to exist before the datasets are built. Without this
# line the cell would pick up whatever `processor` an earlier cell left behind
# (the BLIP processor), so the data would be encoded for the wrong model.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")

# Load datasets
train_dataset = VQADataset(dataset=dataset_dict['train'], processor=processor)
validation_dataset = VQADataset(dataset=dataset_dict['validation'], processor=processor)

# Create DataLoader (only the training loader shuffles)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=2)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load BLIP2 processor and model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
# Half precision is only used on the GPU; on CPU fall back to float32.
# The model is moved to the detected `device` instead of a hard-coded 'cuda',
# which would fail on a machine without a GPU.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl", torch_dtype=model_dtype).to(device)



In [None]:
# Training loop
import pickle  # needed for the final dump; not imported by this section's import cell

optimizer = torch.optim.AdamW(model.parameters(), lr=4e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

num_epochs = 100
patience = 10  # training stops once more than this many consecutive epochs fail to improve
min_eval_loss = float("inf")
early_stopping_hook = 0  # consecutive epochs without a new best validation loss
tracking_information = []  # one (train_loss, valid_loss, lr) tuple per epoch
# GradScaler refuses to unscale gradients of float16 parameters, so it is only
# enabled when the weights are float32. When disabled, `scale` returns the loss
# unchanged and `step` simply calls `optimizer.step()`.
# NOTE(review): stepping AdamW directly on float16 weights can be numerically
# unstable — consider loading the model in float32/bfloat16 for fine-tuning.
params_are_fp32 = next(model.parameters()).dtype == torch.float32
scaler = torch.cuda.amp.GradScaler(enabled=params_are_fp32)

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        input_ids = batch['input_ids'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with torch.cuda.amp.autocast():
            # Prompts are padded to a fixed length by the dataset, so the
            # attention mask must be passed to keep padding out of attention.
            outputs = model(
                input_ids=input_ids,
                pixel_values=pixel_values,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

        epoch_loss += loss.item()
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        # The optimizer step was commented out, so the weights never changed
        scaler.step(optimizer)
        scaler.update()

    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(valid_dataloader, desc=f"Validating Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            pixel_values = batch['pixel_values'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with torch.cuda.amp.autocast():
                outputs = model(
                    input_ids=input_ids,
                    pixel_values=pixel_values,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

            eval_loss += loss.item()

    # Mean per-batch losses for this epoch
    train_loss = epoch_loss / len(train_dataloader)
    valid_loss = eval_loss / len(valid_dataloader)
    tracking_information.append((train_loss, valid_loss, optimizer.param_groups[0]["lr"]))
    print(f"Epoch: {epoch+1} - Training loss: {train_loss:.4f} - Validation loss: {valid_loss:.4f} - LR: {optimizer.param_groups[0]['lr']:.4e}")

    scheduler.step()

    if valid_loss < min_eval_loss:
        # NOTE(review): this is the same directory the BLIP training cells save
        # to, so a BLIP2 checkpoint replaces the BLIP one — confirm or rename.
        model.save_pretrained("Model/blip-saved-model")
        processor.save_pretrained("Model/blip-saved-model")
        print("Saved model to Model/blip-saved-model")
        min_eval_loss = valid_loss
        early_stopping_hook = 0
    else:
        early_stopping_hook += 1
        if early_stopping_hook > patience:
            print("Early stopping triggered.")
            break

# Context manager guarantees the pickle file is flushed and closed
with open("tracking_information.pkl", "wb") as tracking_file:
    pickle.dump(tracking_information, tracking_file)
print("The finetuning process is complete!")

## Inference

In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering
from transformers import BlipProcessor, BlipForQuestionAnswering
import requests
from PIL import Image
import json, os, csv
import logging
from tqdm import tqdm
import torch

# Set the path to your test data directory
test_data_dir = "Data/test_data/test_data"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = BlipProcessor.from_pretrained("Model/blip-saved-model")
model = BlipForQuestionAnswering.from_pretrained("Model/blip-saved-model").to(device)

# Create a list to store the results as (image_id, generated_text) tuples
results = []

# Iterate through each sample folder in the test data directory
# (sorted so the output row order is deterministic)
for filename in tqdm(sorted(os.listdir(test_data_dir)), desc="Processing"):
    # Build the sample path from the same root used for the images below
    sample_path = os.path.join(test_data_dir, filename)

    # Read the json file
    json_path = os.path.join(sample_path, "data.json")
    with open(json_path, "r") as json_file:
        data = json.load(json_file)
    question = data["question"]
    image_id = data["id"]

    # Read the corresponding image
    image_path = os.path.join(test_data_dir, f"{image_id}", "image.png")
    image = Image.open(image_path).convert("RGB")

    # Prepare inputs on the model's own device and dtype; casting them to
    # float16 while the model is float32 causes a dtype mismatch.
    encoding = processor(image, question, return_tensors="pt").to(model.device, model.dtype)

    out = model.generate(**encoding)
    generated_text = processor.decode(out[0], skip_special_tokens=True)

    results.append((image_id, generated_text))

# Write the results to a CSV file, creating the output folder if needed
csv_file_path = "Results/results.csv"
os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)
with open(csv_file_path, mode="w", newline="") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["ID", "Label"])  # Write header
    csv_writer.writerows(results)

print(f"Results saved to {csv_file_path}")

In [None]:
# pip install accelerate
import matplotlib.pyplot as plt
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from transformers import BlipProcessor, BlipForQuestionAnswering

# "Model_2/blip-saved-model" is written by the BLIP training cell together with
# a BlipProcessor, so it must be loaded with the BLIP (not BLIP2) processor.
processor = BlipProcessor.from_pretrained("Model_2/blip-saved-model")
model = BlipForQuestionAnswering.from_pretrained("Model_2/blip-saved-model").to("cuda")

In [None]:
from io import BytesIO

img_url = 'https://prod-images-static.radiopaedia.org/images/17054297/07b3ca19d485b21a30bd8412dbbc33_big_gallery.jpeg'
# Fail fast on network problems or HTTP errors instead of hanging or handing
# an error page to PIL; decode from the fully downloaded body.
response = requests.get(img_url, timeout=30)
response.raise_for_status()
# Force three channels so grayscale or palette images are handled uniformly
raw_image = Image.open(BytesIO(response.content)).convert("RGB")

txt = "is this person sick?"
# NOTE(review): the training cells feed the raw question text; confirm that the
# "Question: ... Answer:" prompt wrapper is wanted for this fine-tuned model.
question = f"Question: {txt} Answer:"
inputs = processor(images=raw_image, text=question, return_tensors="pt").to(model.device)

# Plot the image inline (this replaces the external viewer opened by Image.show())
plt.imshow(raw_image)
plt.axis('off')
plt.show()

generated_ids = model.generate(**inputs, max_length=512)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(generated_text)