In [1]:
# Step 1: Mount Google Drive to access the dataset.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Data Pre PROCESSING

In [2]:
# Dataset for Bangla CLIP
from google.colab import drive
import json
from pathlib import Path
from tqdm import tqdm
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

# Base path for the dataset
base_path = "/content/drive/MyDrive/Bangla Image dataset with caption"

# Paths for individual datasets
bnature_images_path = f"{base_path}/BNATURE/Pictures"
bnature_caption_file = f"{base_path}/BNATURE/caption/caption.txt"

bangla_lekha_images_path = f"{base_path}/Bangla Lekha 2.0/images"
bangla_lekha_caption_file = f"{base_path}/Bangla Lekha 2.0/captions.json"

flickr_images_path = f"{base_path}/Flickr8k_Dataset/Flicker8k_Dataset"
flickr_caption_file = f"{base_path}/Flickr8k_Dataset/BAN-Cap_captiondata.csv"

# Initialize dataframes
train_dataframe = {'caption': [], 'image': []}
valid_dataframe = {'caption': [], 'image': []}

# Bangla Lekha 2.0 Dataset
with open(bangla_lekha_caption_file, encoding='utf-8') as fh:
    data = json.load(fh)

trn_split = int(0.8 * len(data))
for sample in tqdm(data[:trn_split]):
    fn = sample['filename']
    cp = sample['caption']
    my_file = Path(f"{bangla_lekha_images_path}/{fn}")
    if my_file.is_file():
        for tc in cp:
            tc = tc.replace(',', ' ')
            train_dataframe['caption'].append(tc)
            train_dataframe['image'].append(f"{bangla_lekha_images_path}/{fn}")

for sample in tqdm(data[trn_split:]):
    fn = sample['filename']
    cp = sample['caption']
    my_file = Path(f"{bangla_lekha_images_path}/{fn}")
    if my_file.is_file():
        for vc in cp:
            vc = vc.replace(',', ' ')
            valid_dataframe['caption'].append(vc)
            valid_dataframe['image'].append(f"{bangla_lekha_images_path}/{fn}")

# BNature Dataset
lines = open(bnature_caption_file, "r").readlines()

trn_split = int(0.8 * len(lines))
for line in tqdm(lines[:trn_split]):
    parts = line.split()
    if len(parts) < 2:  # Ensure the line has at least two elements
        continue
    fn = parts[0].strip()
    cp = ' '.join(part.strip() for part in parts[1:])
    my_file = Path(f"{bnature_images_path}/{fn}")
    if my_file.is_file():
        cp = cp.replace(',', ' ')
        train_dataframe['caption'].append(cp)
        train_dataframe['image'].append(f"{bnature_images_path}/{fn}")

for line in tqdm(lines[trn_split:]):
    parts = line.split()
    if len(parts) < 2:  # Ensure the line has at least two elements
        continue
    fn = parts[0].strip()
    cp = ' '.join(part.strip() for part in parts[1:])
    my_file = Path(f"{bnature_images_path}/{fn}")
    if my_file.is_file():
        cp = cp.replace(',', ' ')
        valid_dataframe['caption'].append(cp)
        valid_dataframe['image'].append(f"{bnature_images_path}/{fn}")

# Flickr8k Bangla Translation Dataset
ban_caps = pd.read_csv(flickr_caption_file)
cap_ids = list(ban_caps['caption_id'])
ban_trans = list(ban_caps['bengali_caption'])

trn_split = int(0.8 * len(cap_ids))
for j in tqdm(range(len(cap_ids[:trn_split]))):
    ci = cap_ids[j].split("#")[0]
    bt = ban_trans[j]
    fn = ci
    cp = bt
    my_file = Path(f"{flickr_images_path}/{fn}")
    if my_file.is_file():
        cp = cp.replace(',', ' ')
        train_dataframe['caption'].append(cp)
        train_dataframe['image'].append(f"{flickr_images_path}/{fn}")

for j in tqdm(range(len(cap_ids[trn_split:]))):
    ci = cap_ids[j].split("#")[0]
    bt = ban_trans[j]
    fn = ci
    cp = bt
    my_file = Path(f"{flickr_images_path}/{fn}")
    if my_file.is_file():
        cp = cp.replace(',', ' ')
        valid_dataframe['caption'].append(cp)
        valid_dataframe['image'].append(f"{flickr_images_path}/{fn}")

# Convert to DataFrame and save to CSV
train_dataframe = pd.DataFrame(train_dataframe)
valid_dataframe = pd.DataFrame(valid_dataframe)

print(train_dataframe.head())
print(valid_dataframe.head())

# Save the datasets to Google Drive
train_dataframe.to_csv(f'{base_path}/train_df_bang.csv', index=False)
valid_dataframe.to_csv(f'{base_path}/valid_df_bang.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 7323/7323 [00:37<00:00, 194.77it/s] 
100%|██████████| 1831/1831 [00:00<00:00, 2914.97it/s]
100%|██████████| 31284/31284 [00:41<00:00, 748.77it/s] 
100%|██████████| 7822/7822 [00:03<00:00, 2508.17it/s]
100%|██████████| 32364/32364 [00:47<00:00, 683.27it/s] 
100%|██████████| 8091/8091 [00:03<00:00, 2632.81it/s]


                                             caption  \
0  তিন জন মেয়ে মানুষ আছে। এক জন দাড়িয়ে আছে আর দুই...   
1  একটি হলুদ জামা পায়জামা পরা মহিলা দাড়িয়ে হাতে এ...   
2                           অনেক মেয়ে মানুষ বসে আছে।   
3  একটি নীল জামা পরা মহিলা একটি নীল ল্যাপটপ এর দি...   
4                    অনেক মানুষ একসাথে বসে কাজ করছে।   

                                               image  
0  /content/drive/MyDrive/Bangla Image dataset wi...  
1  /content/drive/MyDrive/Bangla Image dataset wi...  
2  /content/drive/MyDrive/Bangla Image dataset wi...  
3  /content/drive/MyDrive/Bangla Image dataset wi...  
4  /content/drive/MyDrive/Bangla Image dataset wi...  
                                             caption  \
0                                     একটি শিশু আছে।   
1  একটি গাড়ির আয়না ধরে মুখ দেখছেন একজন টোকাতে বাচ...   
2                          দুইজন মানুষ হেঁটে যাচ্ছে।   
3  কাধে মালামাল নিয়ে হেটে যাচ্ছেন ২ জন পুরুষ মানু...   
4                             একজন পুরুষ জুতা সারছে। 

In [3]:
import os
from google.colab import drive

drive.mount('/content/drive')
base_path = "/content/drive/MyDrive/Bangla Image dataset with caption"

print("Checking files in Google Drive directory:")
print(os.listdir(base_path))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checking files in Google Drive directory:
['Flickr8k_Dataset', 'mscoco_images 10k', 'Bangla Lekha 2.0', 'BNATURE', 'valid_df_bang.csv', 'train_df_bang.csv']


#CFG

In [4]:
import torch  # Importing torch module

class CFG:
    # Debugging mode
    debug = True

    # Dataset paths
    dataset_root = "/content/drive/MyDrive/Bangla Image dataset with caption"
    train_json = f"{dataset_root}/train_df_bang.csv"
    val_json = f"{dataset_root}/valid_df_bang.csv"

    # Hyperparameters
    batch_size = 200
    num_workers = 4
    head_lr = 1e-3
    image_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 2
    factor = 0.8
    epochs = 350
    lr = 1e-3

    # GPU/CPU configuration
    gpu = 1
    device = torch.device(f"cuda:{gpu}" if torch.cuda.is_available() else "cpu")

    # Model configuration
    model_name = 'resnet50'
    image_embedding = 1000  # Adjusted for ResNet50
    text_encoder_model = "csebuetnlp/banglabert"
    text_tokenizer = "csebuetnlp/banglabert"
    max_length = 100

    # Model tags for logging and saving
    model_tag = f"{model_name}_{text_encoder_model.replace('/', '_')}_aug"
    log_tag = model_tag

    # Pretraining and training settings
    pretrained = True
    trainable = True
    temperature = 1.0

    # Image input size
    size = 224

    # Projection head configuration
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1


In [5]:
CFG = CFG()

#prepare_datafiles

In [6]:
import json
from glob import glob
from tqdm import tqdm
import os
import pandas as pd
from pathlib import Path
import torch

# Debugging mode
debug = True

# Dataset paths updated to match Google Drive structure
base_path = "/content/drive/MyDrive/Bangla Image dataset with caption"

# Subdataset paths
datasets = [
    {
        "name": "BNature",
        "image_dir": f"{base_path}/BNATURE/Pictures",
        "caption_file": f"{base_path}/BNATURE/caption/caption.txt",
        "caption_format": "text"  # Captions are in plain text
    },
    {
        "name": "Bangla Lekha 2.0",
        "image_dir": f"{base_path}/Bangla Lekha 2.0/images",
        "caption_file": f"{base_path}/Bangla Lekha 2.0/captions.json",
        "caption_format": "json"  # Captions are in JSON
    },
    {
        "name": "Flickr8k",
        "image_dir": f"{base_path}/Flickr8k_Dataset/Flicker8k_Dataset",
        "caption_file": f"{base_path}/Flickr8k_Dataset/BAN-Cap_captiondata.csv",
        "caption_format": "csv"  # Captions are in CSV
    }
]

# Initialize dataframes
train_dataframe = {"caption": [], "image": []}
valid_dataframe = {"caption": [], "image": []}

for dataset in datasets:
    print(f"Processing dataset: {dataset['name']}")
    image_dir = dataset["image_dir"]
    caption_file = dataset["caption_file"]
    caption_format = dataset["caption_format"]

    # Process captions based on format
    captions = []
    if caption_format == "json":
        with open(caption_file, encoding="utf-8") as fh:
            captions = json.load(fh)
    elif caption_format == "text":
        with open(caption_file, encoding="utf-8") as fh:
            captions = [{"filename": line.split(",")[0], "caption": line.strip()} for line in fh.readlines()]
    elif caption_format == "csv":
        # Dynamically fetch the column names
        caption_df = pd.read_csv(caption_file)
        if debug:
            print(f"Columns in {caption_file}: {caption_df.columns.tolist()}")
        # Infer filename and caption columns
        filename_col = caption_df.columns[0]  # Assume the first column is for filenames
        caption_col = caption_df.columns[1]  # Assume the second column is for captions
        captions = caption_df.rename(columns={filename_col: "filename", caption_col: "caption"}).to_dict(orient="records")

    # Split into train/validation
    trn_split = int(0.8 * len(captions))
    for sample in tqdm(captions[:trn_split]):
        fn = sample["filename"]
        cp = sample["caption"]
        img_path = Path(f"{image_dir}/{fn}")
        if img_path.is_file():
            train_dataframe["caption"].append(cp)
            train_dataframe["image"].append(str(img_path))

    for sample in tqdm(captions[trn_split:]):
        fn = sample["filename"]
        cp = sample["caption"]
        img_path = Path(f"{image_dir}/{fn}")
        if img_path.is_file():
            valid_dataframe["caption"].append(cp)
            valid_dataframe["image"].append(str(img_path))

# Convert to pandas DataFrame
train_dataframe = pd.DataFrame(train_dataframe)
valid_dataframe = pd.DataFrame(valid_dataframe)

# Save to CSV
train_csv_path = f"{base_path}/train_df_bang.csv"  # Output path for train CSV
val_csv_path = f"{base_path}/valid_df_bang.csv"   # Output path for validation CSV
train_dataframe.to_csv(train_csv_path, index=False)
valid_dataframe.to_csv(val_csv_path, index=False)

print(f"Training DataFrame saved to {train_csv_path}")
print(f"Validation DataFrame saved to {val_csv_path}")
print(train_dataframe.head())
print(valid_dataframe.head())


Processing dataset: BNature


100%|██████████| 31284/31284 [00:11<00:00, 2638.66it/s]
100%|██████████| 7822/7822 [00:03<00:00, 2523.23it/s]


Processing dataset: Bangla Lekha 2.0


100%|██████████| 7323/7323 [00:03<00:00, 2415.19it/s]
100%|██████████| 1831/1831 [00:00<00:00, 2408.71it/s]


Processing dataset: Flickr8k
Columns in /content/drive/MyDrive/Bangla Image dataset with caption/Flickr8k_Dataset/BAN-Cap_captiondata.csv: ['caption_id', 'english_caption', 'bengali_caption']


100%|██████████| 32364/32364 [00:11<00:00, 2722.36it/s]
100%|██████████| 8091/8091 [00:02<00:00, 2847.65it/s]


Training DataFrame saved to /content/drive/MyDrive/Bangla Image dataset with caption/train_df_bang.csv
Validation DataFrame saved to /content/drive/MyDrive/Bangla Image dataset with caption/valid_df_bang.csv
                                             caption  \
0  [তিন জন মেয়ে মানুষ আছে। এক জন দাড়িয়ে আছে আর দু...   
1  [অনেক মেয়ে মানুষ বসে আছে।, একটি নীল জামা পরা ম...   
2  [অনেক মানুষ একসাথে বসে কাজ করছে।, ২ টি  ছোট ছে...   
3  [ছয় জন মানুষ দাড়িয়ে আছে।, ৬ জন মানুষ এলোমেলো দ...   
4  [এক জন মেয়ে মানুষ মাথায় ঘোমটা দিয়ে কাজ করছে। ম...   

                                               image  
0  /content/drive/MyDrive/Bangla Image dataset wi...  
1  /content/drive/MyDrive/Bangla Image dataset wi...  
2  /content/drive/MyDrive/Bangla Image dataset wi...  
3  /content/drive/MyDrive/Bangla Image dataset wi...  
4  /content/drive/MyDrive/Bangla Image dataset wi...  
                                             caption  \
0  [একটি শিশু আছে।, একটি গাড়ির আয়না ধরে মুখ দেখছে...   
1  [দুইজন মান

#Dataset

In [7]:
!pip install git+https://github.com/csebuetnlp/normalizer

Collecting git+https://github.com/csebuetnlp/normalizer
  Cloning https://github.com/csebuetnlp/normalizer to /tmp/pip-req-build-4ew8vlin
  Running command git clone --filter=blob:none --quiet https://github.com/csebuetnlp/normalizer /tmp/pip-req-build-4ew8vlin
  Resolved https://github.com/csebuetnlp/normalizer to commit d405944dde5ceeacb7c2fd3245ae2a9dea5f35c9
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.4.2 (from normalizer==0.0.1)
  Downloading emoji-1.4.2.tar.gz (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m185.0/185.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy==6.0.3 (from normalizer==0.0.1)
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nor

In [8]:
from normalizer import normalize

In [9]:
# PyTorch Dataset for CLIP Bangla
from logging import config
import os
import cv2
import torch
import albumentations as A
import pandas as pd

from normalizer import normalize  # pip install git+https://github.com/csebuetnlp/normalizer


class CLIPDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, transforms):
        """
        Dataset for CLIP Bangla
        """
        # Use DataFrame directly instead of reading CSV
        self.image_filenames = dataframe['image'].tolist()
        self.captions = [normalize(cap_sen) for cap_sen in dataframe['caption'].tolist()]

        self.transforms = transforms

    def __getitem__(self, idx):
        image_path = self.image_filenames[idx]
        image = cv2.imread(image_path)
        if image is None:
            raise FileNotFoundError(f"Image not found at {image_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = self.transforms(image=image)['image']

        image = torch.tensor(image).permute(2, 0, 1).float()
        caption = self.captions[idx]

        return image, caption

    def __len__(self):
        return len(self.captions)



def get_transforms(mode="train"):
    if mode == "train":
        config = {
            'aug_prob': 0.2
        }
        return A.Compose(
            [
                A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=config['aug_prob']),
                A.RandomBrightnessContrast(brightness_limit=(-0.1, 0.1), contrast_limit=(-0.1, 0.1), p=config['aug_prob']),
                A.CoarseDropout(p=config['aug_prob']),
                A.GaussNoise(p=config['aug_prob']),
                A.ZoomBlur(p=config['aug_prob']),
                A.RandomFog(p=config['aug_prob']),
                A.Rotate((-20., 20.), p=0.5),
                A.MotionBlur(p=config['aug_prob']),
                A.Resize(CFG.size, CFG.size, always_apply=True),  # Use CFG.size
                A.Normalize(max_pixel_value=255.0, always_apply=True),
            ]
        )
    else:
        return A.Compose(
            [
                A.Resize(CFG.size, CFG.size, always_apply=True),  # Use CFG.size
                A.Normalize(max_pixel_value=255.0, always_apply=True),
            ]
        )


if __name__ == "__main__":
    # Paths to the train and validation CSV files
    train_df = pd.read_csv(CFG.train_json).dropna()  # Ensure no missing data
    val_df = pd.read_csv(CFG.val_json).dropna()

    # Initialize tokenizer and transforms
    tokenizer = None  # Add tokenizer logic if required
    train_transforms = get_transforms(mode="train")
    val_transforms = get_transforms(mode="val")

    # Create datasets
    train_dataset = CLIPDataset(dataframe=train_df, tokenizer=tokenizer, transforms=train_transforms)
    val_dataset = CLIPDataset(dataframe=val_df, tokenizer=tokenizer, transforms=val_transforms)

    # Sample usage
    print(f"Number of training samples: {len(train_dataset)}")
    print(f"Number of validation samples: {len(val_dataset)}")

    # Fetch a single sample for debugging
    image, caption = train_dataset[0]
    print(f"Sample caption: {caption}")
    print(f"Image tensor shape: {image.shape}")


  check_for_updates()


Number of training samples: 7323
Number of validation samples: 1831
Sample caption: ['তিন জন মেয়ে মানুষ আছে। এক জন দাড়িয়ে আছে আর দুই জন বসে আছে।', 'একটি হলুদ জামা পায়জামা পরা মহিলা দাড়িয়ে হাতে একটি বেত নিয়ে পিটানোর ভাব দেখাচ্ছে আর ছোট একটি মেয়ে পিছনে ব্যাগ নিয়ে বসে কাঁদছে। ']
Image tensor shape: torch.Size([3, 224, 224])


#Clip model

In [10]:
import torch
from torch import nn
import torch.nn.functional as F
from torchvision import models, transforms
from transformers import AutoTokenizer, AutoModel
import cv2

# Define the configuration as a dictionary or object
class CFG:
    image_embedding = 1408  # Output size of EfficientNet-B2
    text_encoder_model = "bert-base-uncased"  # Replace with your Bangla model if available
    text_max_length = 200
    batch_size = 16

class CLIPModel(nn.Module):
    """CLIP model for Bangla"""
    def __init__(self):
        super(CLIPModel, self).__init__()
        self.image_encoder = models.efficientnet_b2(weights="EfficientNet_B2_Weights.DEFAULT")
        self.image_encoder.classifier = nn.Identity()  # Corrected to EfficientNet-B2's attribute

        self.image_out = nn.Sequential(
            nn.Linear(CFG.image_embedding, 256), nn.ReLU(), nn.Linear(256, 256)
        )

        self.text_encoder = AutoModel.from_pretrained(CFG.text_encoder_model)
        self.target_token_idx = 0

        self.text_out = nn.Sequential(
            nn.Linear(768, 256), nn.ReLU(), nn.Linear(256, 256)
        )

    def forward(self, image, text, mask):
        image_vec = self.image_encoder(image)
        image_vec = self.image_out(image_vec)

        text_out = self.text_encoder(text, attention_mask=mask)
        last_hidden_states = text_out.last_hidden_state

        last_hidden_states = last_hidden_states[:, self.target_token_idx, :]
        text_vec = self.text_out(last_hidden_states)

        return image_vec, text_vec

    def get_image_embeddings(self, image):
        image_vec = self.image_encoder(image)
        image_vec = self.image_out(image_vec)

        return image_vec

    def get_text_embeddings(self, text, mask):
        text_out = self.text_encoder(text, attention_mask=mask)
        last_hidden_states = text_out.last_hidden_state

        last_hidden_states = last_hidden_states[:, self.target_token_idx, :]
        text_vec = self.text_out(last_hidden_states)

        return text_vec


if __name__ == '__main__':
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Random dummy inputs for testing
    images = torch.randn(40, 3, 224, 224).to(device)
    input_ids = torch.randint(5, 300, size=(40, CFG.text_max_length)).to(device)
    attention_mask = torch.ones(40, CFG.text_max_length).to(device)

    print("Building CLIP")
    clip_model = CLIPModel().to(device)
    print(clip_model)

    img_vec, text_vec = clip_model(images, input_ids, attention_mask)
    print(img_vec.shape)
    print(text_vec.shape)


Building CLIP


Downloading: "https://download.pytorch.org/models/efficientnet_b2_rwightman-c35c1473.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b2_rwightman-c35c1473.pth
100%|██████████| 35.2M/35.2M [00:00<00:00, 55.9MB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

CLIPModel(
  (image_encoder): EfficientNet(
    (features): Sequential(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): SiLU(inplace=True)
      )
      (1): Sequential(
        (0): MBConv(
          (block): Sequential(
            (0): Conv2dNormActivation(
              (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
              (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
              (2): SiLU(inplace=True)
            )
            (1): SqueezeExcitation(
              (avgpool): AdaptiveAvgPool2d(output_size=1)
              (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
              (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
              (activation): SiLU(inplace=True)
              (sca

#Tokenizer

In [11]:
from transformers import AutoTokenizer, AutoModelForPreTraining

# Load tokenizer and model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")
model = AutoModelForPreTraining.from_pretrained("csebuetnlp/banglabert")

# Example captions
captions = [
    'একটি দমকা লাল জ্যাকেটের একটি মহিলা একটি আইস স্কেটিং রিংতে ছবির জন্য পোজ দিচ্ছেন। একটি দমকা লাল জ্যাকেটের একটি মহিলা একটি আইস স্কেটিং রিংতে ছবির জন্য পোজ দিচ্ছেন।'
]

# Encode captions
encoded_captions = tokenizer(
    captions, padding=True, truncation=True, max_length=5, return_tensors="pt"
)
print("Encoded captions:", encoded_captions)

# Tokenize without encoding for inspection
tokens = tokenizer.tokenize(captions[0])
print("Tokenized caption:", tokens)


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Encoded captions: {'input_ids': tensor([[    2,   990, 23482,  2087,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Tokenized caption: ['একটি', 'দমকা', 'লাল', 'জ্যাকেট', '##ের', 'একটি', 'মহিলা', 'একটি', 'আইস', 'স্কে', '##টিং', 'রিং', '##তে', 'ছবির', 'জন্য', 'পো', '##জ', 'দিচ্ছেন', '।', 'একটি', 'দমকা', 'লাল', 'জ্যাকেট', '##ের', 'একটি', 'মহিলা', 'একটি', 'আইস', 'স্কে', '##টিং', 'রিং', '##তে', 'ছবির', 'জন্য', 'পো', '##জ', 'দিচ্ছেন', '।']


#Training Model

In [12]:
import os

print("Current directory:", os.getcwd())
print("Files in the directory:", os.listdir('/content'))


Current directory: /content
Files in the directory: ['.config', 'drive', 'sample_data']


In [13]:
print(vars(CFG))


{'__module__': '__main__', 'image_embedding': 1408, 'text_encoder_model': 'bert-base-uncased', 'text_max_length': 200, 'batch_size': 16, '__dict__': <attribute '__dict__' of 'CFG' objects>, '__weakref__': <attribute '__weakref__' of 'CFG' objects>, '__doc__': None}


In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from transformers import AutoTokenizer
from PIL import Image
import time
from google.colab import drive
import os

# Mount Google Drive
if not os.path.ismount('/content/drive'):
    drive.mount('/content/drive')

# Paths to CSV files
train_csv_path = '/content/drive/MyDrive/Bangla Image dataset with caption/train_df_bang.csv'
valid_csv_path = '/content/drive/MyDrive/Bangla Image dataset with caption/valid_df_bang.csv'

# Load the training and validation dataframes
train_df = pd.read_csv(train_csv_path, encoding='utf8').dropna()
valid_df = pd.read_csv(valid_csv_path, encoding='utf8').dropna()

# Ensure the tokenizer and CFG setup is consistent
tokenizer = AutoTokenizer.from_pretrained(CFG.text_encoder_model)

# Collate Function for DataLoader
def custom_collate_fn(samples):
    img, caption = zip(*samples)  # Unzip images and captions
    img = torch.stack(img)  # Stack images into a batch
    captions = list(caption)  # Convert tuple to list
    token_list = tokenizer(captions, padding=True, truncation=True, max_length=CFG.text_max_length, return_tensors="pt")
    return img, token_list["input_ids"], token_list["attention_mask"]

import os
import tempfile

# Ensure CFG class has all required attributes
class CFG:
    # Debugging mode
    debug = True

    # Dataset paths
    dataset_root = "/content/drive/MyDrive/Bangla Image dataset with caption"
    train_json = f"{dataset_root}/train_df_bang.csv"
    val_json = f"{dataset_root}/valid_df_bang.csv"

    # Hyperparameters
    batch_size = 200
    num_workers = 4  # Correctly defined num_workers
    head_lr = 1e-3
    image_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 2
    factor = 0.8
    epochs = 350

    # GPU/CPU configuration
    gpu = 1
    device = torch.device(f"cuda:{gpu}" if torch.cuda.is_available() else "cpu")

    # Model configuration
    model_name = 'resnet50'
    image_embedding = 1000  # Adjusted for ResNet50
    text_encoder_model = "csebuetnlp/banglabert"
    text_tokenizer = "csebuetnlp/banglabert"
    text_max_length = 100  # Updated from max_length for consistency

    # Model tags for logging and saving
    model_tag = f"{model_name}_{text_encoder_model.replace('/', '_')}_aug"
    log_tag = model_tag

    # Pretraining and training settings
    pretrained = True
    trainable = True
    temperature = 1.0

    # Image input size
    size = 224

    # Projection head configuration
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1

# Training Function
def train_and_val_model(model, criterion, train_loader, val_loader, optimizer, num_epochs=10, scheduler=None):
    since = time.time()
    best_loss = float('inf')
    pbar = tqdm(range(num_epochs))

    for epoch in pbar:
        model.train()
        running_loss = 0.0

        for input, texts, masks in train_loader:
            input, texts, masks = input.to(CFG.device), texts.to(CFG.device), masks.to(CFG.device)
            optimizer.zero_grad()
            image_vec, text_vec = model(input, texts, masks)
            logits = torch.matmul(text_vec, image_vec.T)
            targets = torch.arange(logits.size(0)).long().to(CFG.device)
            texts_loss = criterion(logits, targets)
            images_loss = criterion(logits.T, targets)
            loss = (images_loss + texts_loss) / 2.0
            loss.backward()
            optimizer.step()
            if scheduler:
                scheduler.step()
            running_loss += loss.item()

        train_loss = running_loss
        model.eval()
        running_loss = 0.0
        with torch.no_grad():
            for input, texts, masks in val_loader:
                input, texts, masks = input.to(CFG.device), texts.to(CFG.device), masks.to(CFG.device)
                image_vec, text_vec = model(input, texts, masks)
                logits = torch.matmul(text_vec, image_vec.T)
                targets = torch.arange(logits.size(0)).long().to(CFG.device)
                texts_loss = criterion(logits, targets)
                images_loss = criterion(logits.T, targets)
                loss = (images_loss + texts_loss) / 2.0
                running_loss += loss.item()

        val_loss = running_loss
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_path = f"saved_models/{CFG.model_tag}_best_epoch_{epoch}.pt"
            torch.save(model.state_dict(), best_model_path)
            print("Saved Best Model!")

        print(f"Epoch {epoch} :: Train Loss: {train_loss:.4f} :: Val Loss: {val_loss:.4f}")
        pbar.set_description(f"train loss {train_loss:.4f} val loss {val_loss:.4f}")

    time_elapsed = time.time() - since
    print(f"Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    return model
# Function to build DataLoaders
def build_data_loaders(dataframe, tokenizer, mode="train"):
    # Define transformations for images
    transform = transforms.Compose([
        transforms.Resize((CFG.size, CFG.size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # Create dataset
    dataset = CLIPDataset(dataframe, tokenizer, transform=transform)

    # DataLoader
    dataloader = DataLoader(
        dataset,
        batch_size=CFG.batch_size,
        shuffle=(mode == "train"),
        num_workers=CFG.num_workers,
        collate_fn=custom_collate_fn,  # Use your custom collate function
    )

    return dataloader

# Build DataLoaders
train_loader = build_data_loaders(train_df, tokenizer, mode="train")
valid_loader = build_data_loaders(valid_df, tokenizer, mode="valid")

# Initialize the model
model = CLIPModel().to(CFG.device)

# Define the loss function (criterion)
criterion = nn.CrossEntropyLoss()  # For classification tasks

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=CFG.head_lr)

# Train the model
num_epochs = 1  # Reduced for quick testing
model = train_and_val_model(
    model, criterion, train_loader, valid_loader, optimizer, num_epochs=num_epochs, scheduler=None
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

TypeError: CLIPDataset.__init__() got an unexpected keyword argument 'transform'