In [4]:
import os
import cv2
import gc
import json
import numpy as np
import pandas as pd
import itertools
from tqdm.autonotebook import tqdm
import albumentations as A
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.nn.functional as F
import timm
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer


  check_for_updates()


In [5]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForPreTraining

In [6]:
# Step 1: Mount Google Drive to access the dataset.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import os
import json
import cv2
import torch
import numpy as np
import pandas as pd
import albumentations as A
import torch.nn as nn
import torch.nn.functional as F
import timm
import itertools
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoConfig
import matplotlib.pyplot as plt

# Add get_lr function
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

# First, update the CFG class with optimized parameters
class CFG:
    debug = False
    # Dataset paths remain the same
    image_path1 = '/content/drive/MyDrive/Bangla Image dataset with caption/Flickr8k_Dataset/Flicker8k_Dataset'
    captions_path1 = '/content/drive/MyDrive/Bangla Image dataset with caption/Flickr8k_Dataset'
    image_path2 = '/content/drive/MyDrive/Bangla Image dataset with caption/BNATURE/Pictures'
    captions_path2 = '/content/drive/MyDrive/Bangla Image dataset with caption/BNATURE/caption/captions.json'
    image_path3 = '/content/drive/MyDrive/Bangla Image dataset with caption/Bangla Lekha 2.0/images'
    captions_path3 = '/content/drive/MyDrive/Bangla Image dataset with caption/Bangla Lekha 2.0/captions.json'

    # Optimized training parameters
    batch_size = 64  # Increased batch size
    gradient_accumulation_steps = 2  # Reduced accumulation steps
    num_workers = 4  # Increased workers
    pin_memory = True
    mixed_precision = True

    # Optimized learning rates
    image_encoder_lr = 2e-4  # Increased learning rate
    text_encoder_lr = 2e-4   # Increased learning rate
    head_lr = 5e-4          # Increased learning rate
    weight_decay = 0.01

    # Early stopping and scheduler settings
    patience = 2
    factor = 0.7
    epochs = 10
    warmup_ratio = 0.05

    # Model parameters
    model_name = 'resnet50'
    image_embedding = 2048
    text_encoder_model = "csebuetnlp/banglabert"
    text_embedding = 768
    text_tokenizer = "csebuetnlp/banglabert"
    max_length = 128  # Reduced max length
    pretrained = True
    trainable = True
    temperature = 0.07  # Adjusted temperature
    size = 224
    projection_dim = 256
    dropout = 0.1

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Model parameters
    model_name = 'resnet50'
    image_embedding = 2048
    text_encoder_model = "csebuetnlp/banglabert"
    text_embedding = 768
    text_tokenizer = "csebuetnlp/banglabert"
    max_length = 200
    pretrained = True
    trainable = True
    temperature = 1.0
    size = 224
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1

In [8]:
def load_captions():
    """Load captions with improved validation"""
    captions_list = []

    # Load Flickr8k dataset
    try:
        with open(os.path.join(CFG.captions_path1, 'BAN-Cap_captiondata.json'), 'r', encoding='utf-8') as f:
            captions_data1 = json.load(f)

        for entry in captions_data1:
            if isinstance(entry, dict) and 'caption_id' in entry and 'bengali_caption' in entry:
                filename = str(entry['caption_id']).split('#')[0]
                caption = str(entry['bengali_caption'])
                if caption and filename:
                    captions_list.append({
                        "image": filename.strip(),
                        "caption": caption.strip()
                    })
    except Exception as e:
        print(f"Error loading Flickr8k dataset: {str(e)}")

    # Load BNATURE dataset
    try:
        with open(CFG.captions_path2, 'r', encoding='utf-8') as f:
            captions_data2 = json.load(f)

        for entry in captions_data2:
            if isinstance(entry, dict) and 'caption_id' in entry and 'bengali_caption' in entry:
                filename = str(entry['caption_id'])
                caption = str(entry['bengali_caption'])
                if caption and filename:
                    captions_list.append({
                        "image": filename.strip(),
                        "caption": caption.strip()
                    })
    except Exception as e:
        print(f"Error loading BNATURE dataset: {str(e)}")

    # Load Bangla Lekha dataset with improved handling
    try:
        with open(CFG.captions_path3, 'r', encoding='utf-8') as f:
            captions_data3 = json.load(f)

        if isinstance(captions_data3, list):
            for entry in captions_data3:
                if isinstance(entry, dict) and 'filename' in entry and 'caption' in entry:
                    filename = str(entry['filename'])
                    caption = str(entry['caption'])
                    if caption and filename:
                        captions_list.append({
                            "image": filename.strip(),
                            "caption": caption.strip()
                        })
    except Exception as e:
        print(f"Error loading Bangla Lekha dataset: {str(e)}")

    df = pd.DataFrame(captions_list)
    df = df.dropna()
    df = df.drop_duplicates()
    df['id'] = df.index // 5

    print(f"Loaded {len(df)} valid caption entries")
    return df

In [9]:
class CLIPDataset(torch.utils.data.Dataset):
    def __init__(self, image_filenames, captions, tokenizer, transforms):
        self.image_filenames = image_filenames
        self.captions = list(captions)
        self.encoded_captions = tokenizer(
            list(captions),
            padding='max_length',
            truncation=True,
            max_length=CFG.max_length,
            return_tensors='pt'
        )
        self.transforms = transforms

        self.valid_indices = []
        for idx in range(len(self.image_filenames)):
            try:
                image_found = False
                for path in [CFG.image_path1, CFG.image_path2, CFG.image_path3]:
                    if os.path.exists(os.path.join(path, self.image_filenames[idx])):
                        image_found = True
                        break

                if image_found:
                    self.valid_indices.append(idx)
            except Exception as e:
                continue

        print(f"Found {len(self.valid_indices)} valid images out of {len(image_filenames)}")

    def __getitem__(self, idx):
        try:
            actual_idx = self.valid_indices[idx]

            item = {
                'input_ids': self.encoded_captions['input_ids'][actual_idx],
                'attention_mask': self.encoded_captions['attention_mask'][actual_idx],
            }

            image_path = None
            for path in [CFG.image_path1, CFG.image_path2, CFG.image_path3]:
                if os.path.exists(os.path.join(path, self.image_filenames[actual_idx])):
                    image_path = path
                    break

            if image_path is None:
                raise FileNotFoundError(f"Image {self.image_filenames[actual_idx]} not found in any path")

            image = cv2.imread(os.path.join(image_path, self.image_filenames[actual_idx]))
            if image is None:
                raise ValueError(f"Failed to load image: {self.image_filenames[actual_idx]}")

            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            image = self.transforms(image=image)['image']
            item['image'] = torch.tensor(image).permute(2, 0, 1).float()
            item['caption'] = self.captions[actual_idx]

            return item
        except Exception as e:
            print(f"Error processing item {idx}: {str(e)}")
            raise e

    def __len__(self):
        return len(self.valid_indices)

In [10]:


def build_loaders(dataframe, tokenizer, mode):
    """
    Build data loaders with error handling
    """
    transforms = get_transforms(mode=mode)

    try:
        dataset = CLIPDataset(
            image_filenames=dataframe["image"].values,
            captions=dataframe["caption"].values,
            tokenizer=tokenizer,
            transforms=transforms
        )

        # Custom collate function to handle potential None values
        def collate_fn(batch):
            # Filter out None values
            batch = [item for item in batch if item is not None]
            if len(batch) == 0:
                raise RuntimeError("Empty batch after filtering")

            return {
                'image': torch.stack([item['image'] for item in batch]),
                'input_ids': torch.stack([item['input_ids'] for item in batch]),
                'attention_mask': torch.stack([item['attention_mask'] for item in batch]),
                'caption': [item['caption'] for item in batch]
            }

        dataloader = torch.utils.data.DataLoader(
            dataset,
            batch_size=CFG.batch_size,
            num_workers=CFG.num_workers,
            shuffle=True if mode == "train" else False,
            collate_fn=collate_fn,
            drop_last=True  # Drop incomplete batches
        )

        return dataloader

    except Exception as e:
        print(f"Error building dataloader: {str(e)}")
        raise e

In [11]:

class ImageEncoder(nn.Module):
    def __init__(
        self, model_name=CFG.model_name, pretrained=CFG.pretrained, trainable=CFG.trainable
    ):
        super().__init__()
        self.model = timm.create_model(
            model_name, pretrained, num_classes=0, global_pool="avg"
        )
        for p in self.model.parameters():
            p.requires_grad = trainable

    def forward(self, x):
        return self.model(x)

from transformers import AutoTokenizer, AutoModel, AutoConfig

class TextEncoder(nn.Module):
    def __init__(self, model_name=CFG.text_encoder_model, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        if pretrained:
            self.model = AutoModel.from_pretrained(model_name)
        else:
            self.model = AutoModel(config=AutoConfig.from_pretrained(model_name))

        for p in self.model.parameters():
            p.requires_grad = trainable
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = output.last_hidden_state
        return last_hidden_state[:, self.target_token_idx, :]

class ProjectionHead(nn.Module):
    def __init__(
        self,
        embedding_dim,
        projection_dim=CFG.projection_dim,
        dropout=CFG.dropout
    ):
        super().__init__()
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        projected = self.projection(x)
        x = self.gelu(projected)
        x = self.fc(x)
        x = self.dropout(x)
        x = x + projected
        x = self.layer_norm(x)
        return x


In [12]:
class CLIPModel(nn.Module):
    def __init__(
        self,
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding,
    ):
        super().__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
        self.temperature = temperature

    def forward(self, batch):
        image_features = self.image_encoder(batch["image"])
        text_features = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)

        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        targets = F.softmax(
            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
        )
        texts_loss = F.cross_entropy(logits, targets, reduction='none')
        images_loss = F.cross_entropy(logits.T, targets.T, reduction='none')
        loss = (images_loss + texts_loss) / 2.0
        return loss.mean()


In [13]:

def cross_entropy(preds, targets, reduction='none'):
    log_softmax = nn.LogSoftmax(dim=-1)
    loss = (-targets * log_softmax(preds)).sum(1)
    if reduction == "none":
        return loss
    elif reduction == "mean":
        return loss.mean()

In [14]:

def get_transforms(mode="train"):
    if mode == "train":
        return A.Compose([
            A.Resize(CFG.size, CFG.size, always_apply=True),
            A.HorizontalFlip(p=0.5),
            A.RandomBrightnessContrast(p=0.5),
            A.Normalize(max_pixel_value=255.0, always_apply=True),
        ])
    else:
        return A.Compose([
            A.Resize(CFG.size, CFG.size, always_apply=True),
            A.Normalize(max_pixel_value=255.0, always_apply=True),
        ])


In [15]:

def make_train_valid_dfs():
    dataframe = load_captions()
    max_id = dataframe["id"].max() + 1 if not CFG.debug else 100
    image_ids = np.arange(0, max_id)
    np.random.seed(42)
    valid_ids = np.random.choice(
        image_ids, size=int(0.2 * len(image_ids)), replace=False
    )
    train_ids = [id_ for id_ in image_ids if id_ not in valid_ids]
    train_dataframe = dataframe[dataframe["id"].isin(train_ids)].reset_index(drop=True)
    valid_dataframe = dataframe[dataframe["id"].isin(valid_ids)].reset_index(drop=True)
    return train_dataframe, valid_dataframe

In [16]:

def build_loaders(dataframe, tokenizer, mode):
    transforms = get_transforms(mode=mode)
    dataset = CLIPDataset(
        dataframe["image"].values,
        dataframe["caption"].values,
        tokenizer=tokenizer,
        transforms=transforms,
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=True if mode == "train" else False,
    )
    return dataloader



In [17]:
class AvgMeter:
    """
    Computes and stores the average and current value
    """

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [18]:
import torch
import itertools
from tqdm import tqdm
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np

In [19]:
import torch
import itertools
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import numpy as np
import gc

# Memory optimizations
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.enable_mem_efficient_sdp(True)

# Configure PyTorch memory allocator
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,garbage_collection_threshold:0.8,expandable_segments:True'


In [22]:
import torch
import itertools
from tqdm import tqdm
from transformers import AutoTokenizer, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F

# Set memory and speed optimizations
torch.cuda.empty_cache()
torch.backends.cudnn.benchmark = True
if torch.cuda.is_available():
    torch.backends.cuda.enable_mem_efficient_sdp(True)

# Configure PyTorch memory allocator
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512,expandable_segments:True'

def compute_cosine_similarity(embeddings_a, embeddings_b, temperature=0.07):
    # Process in chunks to save memory
    chunk_size = 256
    num_chunks = (embeddings_a.size(0) + chunk_size - 1) // chunk_size
    similarity_chunks = []

    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, embeddings_a.size(0))
        chunk_a = embeddings_a[start_idx:end_idx]

        # Normalize chunks
        chunk_a = F.normalize(chunk_a, p=2, dim=-1)
        chunk_b = F.normalize(embeddings_b, p=2, dim=-1)

        # Compute similarity for chunk
        chunk_sim = torch.mm(chunk_a, chunk_b.t()) / temperature
        similarity_chunks.append(chunk_sim)

    return torch.cat(similarity_chunks, dim=0)

def compute_recall_at_k(similarities, k):
    argsort = torch.argsort(similarities, dim=-1, descending=True)
    diagonal = torch.arange(similarities.size(0), device=similarities.device)
    topk_indices = argsort[:, :k]
    recall_at_k = (topk_indices == diagonal.view(-1, 1)).any(dim=-1).float().mean()
    return recall_at_k.item()

def compute_precision_at_k(similarities, k):
    """
    Compute precision@k for the similarities matrix
    """
    argsort = torch.argsort(similarities, dim=-1, descending=True)
    diagonal = torch.arange(similarities.size(0), device=similarities.device)
    topk_indices = argsort[:, :k]

    # Count correct matches in top-k
    correct = (topk_indices == diagonal.view(-1, 1)).float().sum(dim=1)

    # Precision is number of correct matches divided by k
    precision_at_k = (correct / k).mean()

    return precision_at_k.item()

class AvgMeter:
    """
    Compute and store the average and current value
    """
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def get_lr(optimizer):
    """
    Get the current learning rate from the optimizer
    """
    for param_group in optimizer.param_groups:
        return param_group['lr']

In [28]:
def train(model, train_loader, optimizer, lr_scheduler, scaler, epoch):
    model.train()
    loss_meter = AvgMeter()
    similarity_meter = AvgMeter()

    # Add meters for recall tracking during training
    recall_meters = {k: AvgMeter() for k in [1, 5, 10]}

    tqdm_object = tqdm(train_loader, total=len(train_loader))

    # Store embeddings for recall calculation
    image_embeddings_list = []
    text_embeddings_list = []

    for batch_idx, batch in enumerate(tqdm_object):
        batch = {k: v.to(CFG.device, non_blocking=True) for k, v in batch.items() if k != "caption"}

        accumulation_steps = 2

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            loss = model(batch)

            # Calculate embeddings and similarities
            with torch.no_grad():
                image_features = model.image_encoder(batch["image"])
                text_features = model.text_encoder(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                )
                image_embeddings = model.image_projection(image_features)
                text_embeddings = model.text_projection(text_features)

                # Store embeddings for recall calculation
                image_embeddings_list.append(image_embeddings.detach().cpu())
                text_embeddings_list.append(text_embeddings.detach().cpu())

                similarity = F.cosine_similarity(
                    image_embeddings.unsqueeze(1),
                    text_embeddings.unsqueeze(0),
                    dim=-1
                ).mean()

            loss = loss / accumulation_steps

        scaler.scale(loss).backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2.0)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            lr_scheduler.step()

        loss_meter.update(loss.item() * accumulation_steps, batch["image"].size(0))
        similarity_meter.update(similarity.item(), batch["image"].size(0))

        # Calculate recall metrics every 100 batches
        if batch_idx % 100 == 0 and batch_idx > 0:
            image_embeddings = torch.cat(image_embeddings_list)
            text_embeddings = torch.cat(text_embeddings_list)

            i2t_similarities = compute_cosine_similarity(image_embeddings, text_embeddings)
            t2i_similarities = i2t_similarities.t()

            for k in [1, 5, 10]:
                recall = compute_recall_at_k(i2t_similarities, k)
                recall_meters[k].update(recall, image_embeddings.size(0))

            # Clear lists to save memory
            image_embeddings_list = []
            text_embeddings_list = []

        if batch_idx % 50 == 0:
            torch.cuda.empty_cache()

        # Fixed the syntax error in the postfix dictionary
        tqdm_object.set_postfix({
            'loss': f"{loss_meter.avg:.4f}",
            'sim': f"{similarity_meter.avg:.4f}",
            'recall_5': f"{recall_meters[5].avg:.4f}",  # Changed from r@5 to recall_5
            'lr': f"{get_lr(optimizer):.6f}"
        })

    train_recalls = {k: meter.avg for k, meter in recall_meters.items()}
    return loss_meter.avg, similarity_meter.avg, train_recalls

def validate(model, valid_loader):
    model.eval()
    loss_meter = AvgMeter()
    similarity_meter = AvgMeter()
    precision_meters = {k: AvgMeter() for k in [1, 5, 10]}

    image_embeddings_list = []
    text_embeddings_list = []

    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(valid_loader)):
            batch = {k: v.to(CFG.device, non_blocking=True) for k, v in batch.items() if k != "caption"}

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
                loss = model(batch)

                image_features = model.image_encoder(batch["image"])
                text_features = model.text_encoder(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                )
                image_embeddings = model.image_projection(image_features)
                text_embeddings = model.text_projection(text_features)

                similarity = F.cosine_similarity(
                    image_embeddings.unsqueeze(1),
                    text_embeddings.unsqueeze(0),
                    dim=-1
                ).mean()

                image_embeddings_list.append(image_embeddings.cpu())
                text_embeddings_list.append(text_embeddings.cpu())

            loss_meter.update(loss.item(), batch["image"].size(0))
            similarity_meter.update(similarity.item(), batch["image"].size(0))

            if batch_idx % 20 == 0:
                torch.cuda.empty_cache()

    image_embeddings = torch.cat(image_embeddings_list)
    text_embeddings = torch.cat(text_embeddings_list)

    i2t_similarities = compute_cosine_similarity(image_embeddings, text_embeddings)
    t2i_similarities = i2t_similarities.t()

    metrics = {
        'val_similarity': similarity_meter.avg,
        'val_loss': loss_meter.avg
    }

    for k in [1, 5, 10]:
        recall_i2t = compute_recall_at_k(i2t_similarities, k)
        recall_t2i = compute_recall_at_k(t2i_similarities, k)
        precision_i2t = compute_precision_at_k(i2t_similarities, k)
        precision_t2i = compute_precision_at_k(t2i_similarities, k)

        metrics[f'image_to_text_recall@{k}'] = recall_i2t
        metrics[f'text_to_image_recall@{k}'] = recall_t2i
        metrics[f'image_to_text_precision@{k}'] = precision_i2t
        metrics[f'text_to_image_precision@{k}'] = precision_t2i

    return metrics

def plot_training_progress(metrics_history, epoch):
    plt.figure(figsize=(20, 15))

    # Plot 1: Losses
    plt.subplot(3, 2, 1)
    plt.plot(metrics_history['train_loss'], label='Train Loss', marker='o')
    plt.plot(metrics_history['val_loss'], label='Val Loss', marker='x')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)

    # Plot 2: Training Recalls
    plt.subplot(3, 2, 2)
    for k in [1, 5, 10]:
        plt.plot(
            metrics_history['train_recalls'][f'R@{k}'],
            label=f'Train R@{k}',
            marker='o'
        )
    plt.title('Training Recalls')
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.legend()
    plt.grid(True)

    # Plot 3: Validation Recalls
    plt.subplot(3, 2, 3)
    for k in [1, 5, 10]:
        plt.plot(
            [m[f'image_to_text_recall@{k}'] for m in metrics_history['val_metrics']],
            label=f'Val R@{k}',
            marker='o'
        )
    plt.title('Validation Recalls')
    plt.xlabel('Epoch')
    plt.ylabel('Recall')
    plt.legend()
    plt.grid(True)

    # Plot 4: Precision
    plt.subplot(3, 2, 4)
    for k in [1, 5, 10]:
        plt.plot(
            [m[f'image_to_text_precision@{k}'] for m in metrics_history['val_metrics']],
            label=f'P@{k}',
            marker='o'
        )
    plt.title('Image-to-Text Precision')
    plt.xlabel('Epoch')
    plt.ylabel('Precision')
    plt.legend()
    plt.grid(True)

    # Plot 5: Similarities
    plt.subplot(3, 2, 5)
    plt.plot(metrics_history['train_similarity'], label='Train Similarity', marker='o')
    plt.plot([m['val_similarity'] for m in metrics_history['val_metrics']],
             label='Val Similarity', marker='x')
    plt.title('Cosine Similarities')
    plt.xlabel('Epoch')
    plt.ylabel('Similarity')
    plt.legend()
    plt.grid(True)

    # Plot 6: Learning Rate
    plt.subplot(3, 2, 6)
    plt.plot(metrics_history['learning_rates'], label='Learning Rate', marker='o')
    plt.title('Learning Rate Progress')
    plt.xlabel('Epoch')
    plt.ylabel('Learning Rate')
    plt.yscale('log')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig(f'training_progress_epoch_{epoch+1}.png')
    plt.close()

def main():
    print("Starting optimized training...")

    # Set memory optimization flags
    torch.backends.cudnn.benchmark = True
    if torch.cuda.is_available():
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

    train_df, valid_df = make_train_valid_dfs()
    tokenizer = AutoTokenizer.from_pretrained(CFG.text_tokenizer)

    train_loader = build_loaders(train_df, tokenizer, mode="train")
    valid_loader = build_loaders(valid_df, tokenizer, mode="valid")

    model = CLIPModel(
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding
    )

    # Load checkpoint with memory optimization
    checkpoint = torch.load('/content/best_model_epoch_8.pth', map_location='cpu', weights_only=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    best_val_loss = checkpoint['best_val_loss']
    best_recall = checkpoint['best_recall']
    del checkpoint['model_state_dict']  # Free memory

    # Move model to GPU after loading weights
    model = model.to(CFG.device)

    # Apply memory optimizations to model
    model.half()  # Convert to half precision
    for param in model.parameters():
        param.grad = None  # Clear gradients

    # Modified optimizer setup with memory-efficient settings
    params = [
        {"params": model.image_encoder.parameters(), "lr": CFG.image_encoder_lr * 0.5},
        {"params": model.text_encoder.parameters(), "lr": CFG.text_encoder_lr * 0.5},
        {"params": itertools.chain(
            model.image_projection.parameters(),
            model.text_projection.parameters()
        ), "lr": CFG.head_lr * 0.7}
    ]

    optimizer = torch.optim.AdamW(params, weight_decay=CFG.weight_decay)
    scaler = torch.amp.GradScaler()

    num_training_steps = len(train_loader) * CFG.epochs
    num_warmup_steps = int(num_training_steps * 0.05)

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )

    early_stopping_counter = 0

    # Modified metrics history initialization
    metrics_history = checkpoint.get('metrics_history', {
        'train_loss': [],
        'val_loss': [],
        'train_similarity': [],
        'val_metrics': [],
        'learning_rates': [],
        'train_recalls': {k: [] for k in [1, 5, 10]}
    })

    del checkpoint  # Free memory
    torch.cuda.empty_cache()

    for epoch in range(CFG.epochs):
        print(f"\nEpoch {epoch + 1}")

        # Clear cache before each epoch
        torch.cuda.empty_cache()
        gc.collect()

        # Training with recall tracking
        train_loss, train_similarity, train_recalls = train(
            model, train_loader, optimizer, scheduler, scaler, epoch
        )

        # Clear cache before validation
        torch.cuda.empty_cache()

        # Validation
        val_metrics = validate(model, valid_loader)

        # Update metrics history
        metrics_history['train_loss'].append(train_loss)
        metrics_history['val_loss'].append(val_metrics['val_loss'])
        metrics_history['train_similarity'].append(train_similarity)
        metrics_history['val_metrics'].append(val_metrics)
        metrics_history['learning_rates'].append(get_lr(optimizer))

        for k in [1, 5, 10]:
            metrics_history['train_recalls'][k].append(train_recalls[k])

        # Plot progress
        plot_training_progress(metrics_history, epoch)

        # Print metrics
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_metrics['val_loss']:.4f}")
        print(f"Train Similarity: {train_similarity:.4f}")
        print(f"Val Similarity: {val_metrics['val_similarity']:.4f}")
        for k in [1, 5, 10]:
            print(f"Train R@{k}: {train_recalls[k]:.4f}")
            print(f"Val R@{k}: {val_metrics[f'image_to_text_recall@{k}']:.4f}")
            print(f"Val P@{k}: {val_metrics[f'image_to_text_precision@{k}']:.4f}")

        # Save best model with memory optimization
        current_recall = val_metrics['image_to_text_recall@5']
        if val_metrics['val_loss'] < best_val_loss or current_recall > best_recall:
            best_val_loss = min(val_metrics['val_loss'], best_val_loss)
            best_recall = max(current_recall, best_recall)

            torch.cuda.empty_cache()  # Clear cache before saving

            save_dict = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'best_val_loss': best_val_loss,
                'best_recall': best_recall,
                'metrics_history': metrics_history
            }

            torch.save(save_dict, f'best_model_epoch_{epoch+1}.pth')
            del save_dict  # Free memory

            print(f"Saved best model - Val Loss: {best_val_loss:.4f}, R@5: {best_recall:.4f}")
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= CFG.patience:
                print("Early stopping triggered")
                break

        # Memory cleanup
        torch.cuda.empty_cache()
        gc.collect()

if __name__ == "__main__":
    main()

Starting optimized training...
Loaded 88641 valid caption entries
Found 70919 valid images out of 70919
Found 17722 valid images out of 17722

Epoch 1


  0%|          | 1/4433 [00:03<4:31:39,  3.68s/it, loss=0.1929, sim=0.3906, recall_5=0.0000, lr=0.000000]


ValueError: Attempting to unscale FP16 gradients.

In [None]:
import torch

# Define the Google Drive path for saving the model
save_path = "/content/drive/MyDrive/Bangla Image dataset with caption/banglaclipcombinedfinalnowok22pro.pt"

# Define your model class
model = CLIPModel().to(CFG.device)

# Load the checkpoint
checkpoint_path = "/content/best_model_epoch_20.pth"
checkpoint = torch.load(checkpoint_path, weights_only=True)  # Add weights_only=True to address the warning

# Extract just the model state dict from the checkpoint
if "model_state_dict" in checkpoint:
    # If the checkpoint contains the full training state
    model.load_state_dict(checkpoint["model_state_dict"])
else:
    # If the checkpoint contains only the model state
    model.load_state_dict(checkpoint)

# Save the model's state dictionary to the specified Google Drive path
torch.save(model.state_dict(), save_path)

print(f"Model saved successfully at: {save_path}")

#Interface

In [None]:
import torch
from transformers import AutoTokenizer
from tqdm import tqdm

def get_image_embeddings(dataframe, model_path):
    """
    Loads a model and generates image embeddings for the provided dataset.
    Parameters:
    - dataframe: DataFrame containing image file names.
    - model_path: Path to the saved model file.
    Returns:
    - model: Loaded CLIPModel instance.
    - image_embeddings: Tensor of image embeddings for the dataset.
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(CFG.text_tokenizer)
    model = CLIPModel().to(CFG.device)

    # Load model with weights_only=True
    checkpoint = torch.load(model_path, map_location=CFG.device, weights_only=True)
    if "model_state_dict" in checkpoint:
        model.load_state_dict(checkpoint["model_state_dict"])
    else:
        model.load_state_dict(checkpoint)

    model.eval()

    # Prepare data loader
    transforms = get_transforms(mode="valid")
    dataset = CLIPDataset(
        dataframe["image"].values,
        dataframe["caption"].values,
        tokenizer=tokenizer,
        transforms=transforms,
    )
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=False
    )

    # Generate image embeddings
    image_embeddings_list = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating image embeddings"):
            images = batch["image"].to(CFG.device)
            image_features = model.image_encoder(images)
            image_embeddings = model.image_projection(image_features)
            image_embeddings_list.append(image_embeddings)

            # Free up memory
            if len(image_embeddings_list) % 10 == 0:
                torch.cuda.empty_cache()

    # Concatenate all embeddings
    image_embeddings = torch.cat(image_embeddings_list, dim=0)
    return model, image_embeddings

def find_matches(model, image_embeddings, query, image_filenames, n=9):
    """
    Find matching images for a given text query.
    Parameters:
    - model: Loaded CLIPModel instance
    - image_embeddings: Pre-computed image embeddings
    - query: Text query (in Bangla)
    - image_filenames: List of image filenames
    - n: Number of matches to return
    """
    tokenizer = AutoTokenizer.from_pretrained(CFG.text_tokenizer)

    # Tokenize the query
    inputs = tokenizer(
        query,
        padding=True,
        truncation=True,
        max_length=CFG.max_length,
        return_tensors="pt"
    )

    # Move inputs to device
    inputs = {k: v.to(CFG.device) for k, v in inputs.items()}

    with torch.no_grad():
        # Get text features
        text_features = model.text_encoder(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"]
        )
        text_embeddings = model.text_projection(text_features)

        # Calculate similarities
        similarities = torch.cosine_similarity(
            text_embeddings.unsqueeze(1),
            image_embeddings.unsqueeze(0),
            dim=-1
        )

        # Get top matches
        best_matches = torch.topk(similarities[0], k=n)
        match_indices = best_matches.indices.cpu().numpy()
        match_scores = best_matches.values.cpu().numpy()

    return [(image_filenames[idx], score) for idx, score in zip(match_indices, match_scores)]

def main():
    # Load validation data
    train_df, valid_df = make_train_valid_dfs()
    print(f"Loaded {len(valid_df)} valid caption entries")

    # Set the correct model path
    model_path = "/content/best_model_epoch_20.pth"

    try:
        # Generate the model and image embeddings
        model, image_embeddings = get_image_embeddings(valid_df, model_path)

        # Define multiple Bangla queries
        queries = [
            "কক্সবাজারের সমুদ্র সৈকত",  # Cox's Bazar sea beach
            "ঢাকার ঐতিহাসিক স্থাপত্য",   # Historical architecture of Dhaka
            "সুন্দরবনের রয়্যাল বেঙ্গল টাইগার",  # Royal Bengal Tiger of Sundarbans
            "পার্বত্য চট্টগ্রামের প্রাকৃতিক দৃশ্য",  # Natural scenery of Chittagong Hill Tracts
            "শাহজালাল আন্তর্জাতিক বিমানবন্দর"  # Shahjalal International Airport
        ]

        # Process each query and display results
        for query in queries:
            print(f"\n{'='*50}")
            print(f"Query: {query}")
            print(f"{'='*50}")

            matches = find_matches(
                model=model,
                image_embeddings=image_embeddings,
                query=query,
                image_filenames=valid_df["image"].values,
                n=5  # Showing top 5 matches for each query
            )

            # Print results with detailed similarity scores
            print("\nTop matches:")
            print(f"{'Rank':<6}{'Similarity':<12}{'Image Path'}")
            print("-" * 60)

            for idx, (image_path, score) in enumerate(matches, 1):
                similarity_percentage = score * 100
                print(f"{idx:<6}{similarity_percentage:.2f}%{' '*4}{image_path}")

    except Exception as e:
        print(f"Error occurred: {str(e)}")

    finally:
        # Clean up memory
        torch.cuda.empty_cache()

if __name__ == "__main__":
    main()

In [None]:
# Cell 1: Install required packages
!pip install huggingface-hub transformers

# Cell 2: Import necessary libraries
from huggingface_hub import HfApi, login
from transformers import AutoProcessor, CLIPProcessor
import torch
import os
import json

# Cell 3: Create configuration file
config = {
    "image_size": CFG.size,
    "projection_dim": CFG.projection_dim,
    "temperature": CFG.temperature,
    "image_encoder": CFG.model_name,
    "text_encoder": CFG.text_encoder_model,
    "max_length": CFG.max_length,
    "image_embedding": CFG.image_embedding,
    "text_embedding": CFG.text_embedding
}
with open("config.json", "w") as f:
    json.dump(config, f, indent=2)

# Cell 4: Load and prepare model with error handling
try:
    model = CLIPModel().to('cpu')
    checkpoint = torch.load("/content/best_model_epoch_20.pth",  # Updated path to match your model
                          map_location='cpu',
                          weights_only=True)  # Added weights_only parameter

    if "model_state_dict" in checkpoint:
        model.load_state_dict(checkpoint["model_state_dict"])
    else:
        model.load_state_dict(checkpoint)

    model.eval()
    print("Model loaded successfully")
except Exception as e:
    print(f"Error loading model: {str(e)}")

# Cell 5: Create directories and save model components with error handling
try:
    model_dir = "BanglaCLIP14"  # Updated version number
    os.makedirs(model_dir, exist_ok=True)

    # Save model components
    torch.save(model.image_encoder.state_dict(), f"{model_dir}/image_encoder.pt")
    torch.save(model.text_encoder.state_dict(), f"{model_dir}/text_encoder.pt")
    torch.save(model.image_projection.state_dict(), f"{model_dir}/image_projection.pt")
    torch.save(model.text_projection.state_dict(), f"{model_dir}/text_projection.pt")

    # Also save config
    with open(f"{model_dir}/config.json", "w") as f:
        json.dump(config, f, indent=2)

    print("Model components saved successfully")
except Exception as e:
    print(f"Error saving model components: {str(e)}")

# Cell 6: Create model card
model_card = f"""# BanglaCLIP14

This is a CLIP model trained on Bengali (Bangla) text and images. The model can be used for:
- Image-text similarity matching
- Zero-shot image classification
- Text-to-image retrieval
- Image-to-text retrieval

## Model Details
- Trained on combined dataset of Flickr8k-Bengali, BNATURE, and Bangla Lekha
- Image Encoder: ResNet50
- Text Encoder: BanglaBERT
- Projection Dimension: {CFG.projection_dim}
- Training Temperature: {CFG.temperature}

## Usage
```python
from transformers import AutoProcessor, CLIPModel
import torch

# Load model
model = CLIPModel.from_pretrained("Mansuba/BanglaCLIP14")
processor = AutoProcessor.from_pretrained("Mansuba/BanglaCLIP14")

# Example usage code will go here