In [1]:
!pip install transformers torch torchvision scikit-learn pandas pillow tqdm openpyxl


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
!pip install ftfy


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m37.0 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [None]:
import sys
sys.path.append("/kaggle/input/dataset2")




import os
import argparse
from pathlib import Path
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
import torchvision.models as models
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from normalizer import normalize




class MemeDataset(Dataset):
    """Dataset that pairs each meme image with its caption text and label.

    Every item is a dict with the keys:
      * 'image'          -- the image after resize / ToTensor / Normalize,
      * 'input_ids'      -- token ids, padded/truncated to ``max_length``,
      * 'attention_mask' -- attention mask matching ``input_ids``,
      * 'label'          -- the row's label as a ``torch.long`` scalar tensor.

    Args:
        df: DataFrame with the columns 'image_file_name', 'text' and 'label'.
            Its index is reset so rows can be addressed by 0..len-1.
        images_dir: Directory that contains the image files.
        tokenizer: Tokenizer called with ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors='pt'``.
        max_length: Sequence length the text is padded/truncated to.
        image_size: Side length of the square the image is resized to.
        use_normalizer: When True, non-empty text is passed through
            ``normalize`` before tokenization.
    """

    def __init__(self, df, images_dir, tokenizer, max_length=128, image_size=224, use_normalizer=True):
        self.df = df.reset_index(drop=True)
        self.images_dir = Path(images_dir)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_size = image_size
        self.use_normalizer = use_normalizer

        # Resize to a square, convert to a tensor, then standardize per channel.
        # NOTE(review): mean/std look like the usual ImageNet statistics — confirm.
        self.transform = T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def _load_image(self, img_path):
        """Load ``img_path`` as an RGB image, or a black placeholder on failure.

        The file is opened in a ``with`` block so its handle is closed as soon
        as the pixel data has been converted, instead of staying open until the
        image object is garbage collected. A failure stays non-fatal (one bad
        file should not abort an epoch) but is reported instead of being
        silently replaced.
        """
        try:
            with Image.open(img_path) as handle:
                return handle.convert('RGB')
        except Exception as e:
            print(f"Warning: Could not load image {img_path}: {e}; using a blank image instead")
            return Image.new('RGB', (self.image_size, self.image_size), color=(0, 0, 0))

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx`` (see the class docstring)."""
        row = self.df.loc[idx]
        img = self.transform(self._load_image(self.images_dir / row['image_file_name']))

        # Missing text (NaN/None) is treated as an empty caption.
        text = str(row['text']) if pd.notna(row['text']) else ""

        # Normalize text using the normalizer
        if self.use_normalizer and text:
            try:
                text = normalize(text)
            except Exception as e:
                # Fall back to the original text if normalization fails
                print(f"Warning: Normalization failed for text at index {idx}: {e}")

        tok = self.tokenizer(text, truncation=True, padding='max_length',
                             max_length=self.max_length, return_tensors='pt')
        # The tokenizer output is squeezed on dim 0 to drop the batch dimension.
        input_ids = tok['input_ids'].squeeze(0)
        attention_mask = tok['attention_mask'].squeeze(0)

        label = int(row['label'])
        return {
            'image': img,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }




class MultimodalClassifier(nn.Module):
    """Classifier that fuses a text branch and an image branch by concatenation.

    The text branch is a pretrained ``AutoModel`` followed by a linear
    projection to ``text_feat_dim``. The image branch is the feature stack of
    torchvision's ConvNeXt-Base followed by global average pooling, flattening
    and a 1024 -> 1024 linear projection. Both feature vectors are concatenated
    and fed to a two-layer MLP that produces ``num_labels`` logits.

    Args:
        text_model_name: Name/path handed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits.
        text_feat_dim: Output width of the text projection.
        hidden_dim: Width of the hidden layer in the classifier head.
        dropout: Dropout probability inside the classifier head.
        freeze_text: If True, the text encoder's parameters get
            ``requires_grad = False``.
        freeze_image: If True, the ConvNeXt feature parameters get
            ``requires_grad = False``.
    """

    def __init__(self, text_model_name='csebuetnlp/banglishbert',
                 num_labels=3, text_feat_dim=768, hidden_dim=512,
                 dropout=0.2, freeze_text=False, freeze_image=False):
        super().__init__()

        # ---- text branch: pretrained encoder + linear projection ----
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        if freeze_text:
            for weight in self.text_encoder.parameters():
                weight.requires_grad = False
        # The projection's input width is read from the encoder's own config.
        encoder_width = self.text_encoder.config.hidden_size
        self.text_proj = nn.Linear(encoder_width, text_feat_dim)

        # ---- image branch: ConvNeXt-Base features + pooling + projection ----
        backbone = models.convnext_base(weights=models.ConvNeXt_Base_Weights.DEFAULT).features
        if freeze_image:
            for weight in backbone.parameters():
                weight.requires_grad = False
        # Re-wrap the feature stages and append pooling + flatten so the
        # encoder emits one vector per image.
        stages = list(backbone.children())
        stages.append(nn.AdaptiveAvgPool2d(1))
        stages.append(nn.Flatten(1))
        self.image_encoder = nn.Sequential(*stages)

        image_feat_dim = 1024  # ConvNeXt‑Base has 1024 channels
        self.image_proj = nn.Linear(image_feat_dim, image_feat_dim)

        # ---- fusion head operating on [text ; image] ----
        head_layers = [
            nn.Linear(text_feat_dim + image_feat_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, images):
        """Return classification logits for a batch of (text, image) pairs.

        The text representation is the encoder's ``pooler_output`` when it is
        present and not None; otherwise it is the attention-mask-weighted mean
        of ``last_hidden_state`` over dimension 1.
        """
        encoded = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)

        sentence_vec = getattr(encoded, 'pooler_output', None)
        if sentence_vec is None:
            # Masked mean: padded positions contribute nothing to the sum, and
            # the clamp guards against division by zero for an all-zero mask.
            weights = attention_mask.unsqueeze(-1).float()
            token_sum = (encoded.last_hidden_state * weights).sum(1)
            sentence_vec = token_sum / weights.sum(1).clamp(min=1e-9)

        text_features = self.text_proj(sentence_vec)
        image_features = self.image_proj(self.image_encoder(images))  # [B, 1024]

        fused = torch.cat([text_features, image_features], dim=1)
        return self.classifier(fused)




def find_discrepancies(df, images_dir):
    """Compare the file names referenced in ``df`` with the files on disk.

    Args:
        df: DataFrame with an 'image_file_name' column (values are compared
            as strings).
        images_dir: Directory whose direct children are inspected; only
            regular files are considered.

    Returns:
        ``(missing, orphan)`` -- two sorted lists of file names: names that are
        referenced in ``df`` but absent from ``images_dir``, and files present
        in ``images_dir`` that ``df`` never references.
    """
    wanted = set(df['image_file_name'].astype(str))
    on_disk = {entry.name for entry in Path(images_dir).glob('*') if entry.is_file()}
    return sorted(wanted - on_disk), sorted(on_disk - wanted)




def prepare_dataframe(path, images_dir, drop_label_value=2):
    """Load the metadata spreadsheet and turn it into a clean training frame.

    Steps, in order:
      1. read ``path`` with ``pd.read_excel`` and check the required columns;
      2. drop rows whose label equals ``drop_label_value``;
      3. drop rows whose label is missing (NaN) -- such rows pass the filter in
         step 2 and would otherwise become an extra, bogus class;
      4. strip whitespace from the image file names;
      5. drop rows whose image file is not present in ``images_dir`` and report
         files in ``images_dir`` that no row references;
      6. remap the remaining label values to 0..K-1 in sorted order.

    Args:
        path: Path of the Excel metadata file.
        images_dir: Directory containing the image files.
        drop_label_value: Label value whose rows are discarded.

    Returns:
        ``(df, orphan, label_map)`` -- the cleaned frame (with remapped
        'label'), the sorted list of unreferenced image file names, and the
        dict mapping each original label value to its new index.

    Raises:
        AssertionError: if a required column is missing.
    """
    df = pd.read_excel(path)
    assert 'image_file_name' in df.columns and 'text' in df.columns and 'label' in df.columns, \
        "metadata.xlsx must contain columns: image_file_name, text, label"

    df = df[df['label'] != drop_label_value].copy()

    # NaN != drop_label_value is True, so unlabeled rows survive the filter
    # above; remove them before they leak into the label mapping.
    unlabeled = df['label'].isna()
    if unlabeled.any():
        print(f"Dropping {int(unlabeled.sum())} rows with a missing label")
        df = df[~unlabeled].copy()

    df['image_file_name'] = df['image_file_name'].astype(str).str.strip()

    missing, orphan = find_discrepancies(df, images_dir)
    if missing:
        print(f"Missing images for {len(missing)} metadata entries")
        df = df[~df['image_file_name'].isin(missing)].copy()

    if orphan:
        print(f"Found {len(orphan)} orphan image files not in metadata:")
        # Only the first 20 names are listed to keep the output short.
        for o in orphan[:20]:
            print("  -", o)
        if len(orphan) > 20:
            print("  ... and more")

    # Remap the surviving label values to consecutive indices 0..K-1.
    unique_labels = sorted(df['label'].unique().tolist())
    label_map = {orig: idx for idx, orig in enumerate(unique_labels)}
    df['label'] = df['label'].map(label_map)
    print("Label mapping:", label_map)
    return df, orphan, label_map




def compute_class_weights(df):
    """Return one sampling weight per row: the inverse frequency of its label.

    Despite the name, the result is per *sample*, not per class: element ``i``
    is ``1 / count(label of row i)``, which is the form
    ``WeightedRandomSampler`` consumes.

    The count is looked up by label value rather than by position in a sorted
    count array, so the result is also correct when the labels in ``df`` are
    not exactly ``0..K-1`` (for example when a class is absent from a split).

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        1-D numpy array of floats with ``len(df)`` entries.
    """
    labels = df['label']
    per_label_count = labels.value_counts()
    # Series.map with a Series argument looks each label up in the counts' index.
    return (1.0 / labels.map(per_label_count)).to_numpy()




def collate_fn(batch):
    """Stack a list of per-sample dicts into one batched dict.

    Args:
        batch: List of dicts with the tensor entries 'image', 'input_ids',
            'attention_mask' and 'label'.

    Returns:
        Dict with 'image', 'input_ids', 'attention_mask' and 'labels' (note
        the plural), each stacked along a new leading dimension.
    """
    def _stacked(field):
        # Gather one field from every sample and stack along a new dim 0.
        return torch.stack([sample[field] for sample in batch])

    return {
        'image': _stacked('image'),
        'input_ids': _stacked('input_ids'),
        'attention_mask': _stacked('attention_mask'),
        'labels': _stacked('label'),
    }




def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimization pass over ``dataloader`` and return the mean loss.

    The loss is cross-entropy. The returned value is the per-sample average
    over the samples that were actually processed, so it stays correct when
    the loader yields a different number of samples than
    ``len(dataloader.dataset)`` (e.g. a sampler with its own ``num_samples``
    or ``drop_last=True``).

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., images=...)``
            and returning logits.
        dataloader: Iterable of batches with the keys 'image', 'input_ids',
            'attention_mask' and 'labels'.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean training loss as a float; ``nan`` if the loader yielded no samples.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    samples_seen = 0
    for batch in tqdm(dataloader, desc="Train"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        batch_size = images.size(0)
        loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    if samples_seen == 0:
        return float('nan')
    return loss_sum / samples_seen




@torch.no_grad()
def evaluate(model, dataloader, device, label_map):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., images=...)``
            and returning logits; it is switched to eval mode.
        dataloader: Iterable of batches with the keys 'image', 'input_ids',
            'attention_mask' and 'labels'.
        device: Device the model inputs are moved to.
        label_map: Accepted for interface compatibility; not used here.

    Returns:
        ``(acc, report, trues, preds)`` -- the accuracy, the
        ``classification_report`` text (4 digits), and the lists of true and
        predicted class indices in loader order.
    """
    model.eval()
    predicted, expected = [], []
    for batch in tqdm(dataloader, desc="Eval"):
        logits = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            images=batch['image'].to(device),
        )
        # The arg-max over dim 1 is the predicted class index per sample.
        predicted += logits.argmax(dim=1).tolist()
        expected += batch['labels'].tolist()

    return (
        accuracy_score(expected, predicted),
        classification_report(expected, predicted, digits=4),
        expected,
        predicted,
    )




def main(args):
    """Train, validate and test the multimodal classifier end to end.

    Pipeline: load and clean the metadata, optionally delete unreferenced
    image files, make a stratified train/val/test split, build datasets and
    loaders (the training loader draws samples with inverse-class-frequency
    weights), train for ``args.epochs`` epochs while checkpointing the model
    with the best validation accuracy, then evaluate on the test split and
    write the predictions to ``test_predictions.csv`` in ``args.out_dir``.

    Args:
        args: Namespace providing ``data``, ``images_dir``, ``out_dir``,
            ``epochs``, ``batch_size``, ``lr``, ``text_model``, ``max_length``,
            ``image_size``, ``val_size``, ``test_size``, ``text_feat_dim``,
            ``hidden_dim``, ``dropout``, ``freeze_text``, ``freeze_image``,
            ``delete_orphans`` and ``disable_normalizer``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    df, orphan_files, label_map = prepare_dataframe(args.data, args.images_dir, drop_label_value=2)

    # Optional cleanup of image files that no metadata row references.
    if args.delete_orphans and orphan_files:
        for fname in orphan_files:
            p = Path(args.images_dir) / fname
            try:
                p.unlink()
            except Exception as e:
                print("Could not delete:", p, e)
        print("Deleted orphans.")

    # Stratified split: first carve out the test set, then the validation set
    # from what remains (so val_size is a fraction of the non-test rows).
    train_df, test_df = train_test_split(df, test_size=args.test_size, stratify=df['label'], random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=args.val_size, stratify=train_df['label'], random_state=42)
    print("Train / Val / Test sizes:", len(train_df), len(val_df), len(test_df))

    tokenizer = AutoTokenizer.from_pretrained(args.text_model)

    use_normalizer = not args.disable_normalizer
    if use_normalizer:
        print("Text normalization enabled")
    else:
        print("Text normalization disabled")

    train_dataset = MemeDataset(train_df, args.images_dir, tokenizer,
                                max_length=args.max_length, image_size=args.image_size,
                                use_normalizer=use_normalizer)
    val_dataset = MemeDataset(val_df, args.images_dir, tokenizer,
                              max_length=args.max_length, image_size=args.image_size,
                              use_normalizer=use_normalizer)
    test_dataset = MemeDataset(test_df, args.images_dir, tokenizer,
                               max_length=args.max_length, image_size=args.image_size,
                               use_normalizer=use_normalizer)

    # Draw training samples with replacement, weighted by inverse class frequency.
    sample_weights = compute_class_weights(train_df)
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(train_dataset), replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

    num_labels = len(label_map)
    model = MultimodalClassifier(text_model_name=args.text_model,
                                 num_labels=num_labels,
                                 text_feat_dim=args.text_feat_dim,
                                 hidden_dim=args.hidden_dim,
                                 dropout=args.dropout,
                                 freeze_text=args.freeze_text,
                                 freeze_image=args.freeze_image)
    model.to(device)

    # Only parameters that still require gradients are handed to the optimizer.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    os.makedirs(args.out_dir, exist_ok=True)
    ckpt_path = os.path.join(args.out_dir, "best_model.pt")

    # Start below any reachable accuracy so the first epoch always writes a
    # checkpoint, and remember whether THIS run wrote one: a file left in
    # out_dir by an earlier run must never be mistaken for the current best.
    best_val_acc = float('-inf')
    checkpoint_saved = False

    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}/{args.epochs}")
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        print("Train loss:", train_loss)
        val_acc, val_report, _, _ = evaluate(model, val_loader, device, label_map)
        print("Validation Acc:", val_acc)
        print("Validation report:\n", val_report)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({'model_state_dict': model.state_dict(), 'label_map': label_map}, ckpt_path)
            checkpoint_saved = True
            print("Saved best model.")

    print("Testing best model …")
    if checkpoint_saved:
        ckpt = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
    else:
        # Happens when no epoch ran (e.g. epochs == 0): there is nothing to restore.
        print("No checkpoint was saved during this run; testing the current model weights.")

    test_acc, test_report, trues, preds = evaluate(model, test_loader, device, label_map)
    print("Test Acc:", test_acc)
    print("Test report:\n", test_report)

    # Store predictions next to the test rows, both as class index and as the
    # original label value (inverse of label_map).
    pred_path = os.path.join(args.out_dir, "test_predictions.csv")
    out = test_df.reset_index(drop=True).copy()
    out['pred_idx'] = preds
    inv_map = {v: k for k, v in label_map.items()}
    out['pred_orig'] = out['pred_idx'].map(inv_map)
    out.to_csv(pred_path, index=False)
    print("Saved test predictions to", pred_path)




if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword options), registered
    # in this order.
    cli_options = [
        ('--data', dict(type=str, default='/kaggle/input/dataset2/metadata.xlsx')),
        ('--images_dir', dict(type=str, default='/kaggle/input/dataset2/images')),
        ('--out_dir', dict(type=str, default='/kaggle/working/output')),
        ('--epochs', dict(type=int, default=30)),
        ('--batch_size', dict(type=int, default=16)),
        ('--lr', dict(type=float, default=2e-5)),
        ('--text_model', dict(type=str, default='csebuetnlp/banglishbert')),
        ('--max_length', dict(type=int, default=64)),
        ('--image_size', dict(type=int, default=224)),
        ('--val_size', dict(type=float, default=0.1)),
        ('--test_size', dict(type=float, default=0.1)),
        ('--text_feat_dim', dict(type=int, default=768)),  # match BERT hidden
        ('--hidden_dim', dict(type=int, default=512)),
        ('--dropout', dict(type=float, default=0.2)),
        ('--freeze_text', dict(action='store_true')),
        ('--freeze_image', dict(action='store_true')),
        ('--delete-orphans', dict(action='store_true')),
        ('--disable-normalizer', dict(action='store_true',
                                      help='Disable text normalization (enabled by default)')),
    ]

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    # An explicit empty argument list is parsed, so every option takes its
    # default value regardless of the process's own command line.
    # NOTE(review): presumably done because the notebook kernel's sys.argv
    # holds unrelated arguments — confirm.
    args = parser.parse_args([])
    main(args)



Device: cuda
Found 4 orphan image files not in metadata:
  - FB_IMG_1751540473613.jpg
  - FB_IMG_1751739942837.jpg
  - FB_IMG_1754929300743.jpg
  - FB_IMG_1755921270397.jpg
Label mapping: {0: 0, 1: 1, 3: 2}
Train / Val / Test sizes: 5508 612 680


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Text normalization enabled


2025-11-21 19:08:39.205868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763752119.392460      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763752119.441696      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/convnext_base-6075fbad.pth" to /root/.cache/torch/hub/checkpoints/convnext_base-6075fbad.pth
100%|██████████| 338M/338M [00:01<00:00, 198MB/s] 


Epoch 1/30


Train: 100%|██████████| 345/345 [08:38<00:00,  1.50s/it]


Train loss: 0.9141876508381026


Eval: 100%|██████████| 39/39 [00:26<00:00,  1.50it/s]


Validation Acc: 0.6356209150326797
Validation report:
               precision    recall  f1-score   support

           0     0.5870    0.4709    0.5226       172
           1     0.5056    0.6818    0.5806       132
           2     0.7365    0.7078    0.7219       308

    accuracy                         0.6356       612
   macro avg     0.6097    0.6202    0.6084       612
weighted avg     0.6447    0.6356    0.6354       612

Saved best model.
Epoch 2/30


Train: 100%|██████████| 345/345 [08:31<00:00,  1.48s/it]


Train loss: 0.6367476462625297


Eval: 100%|██████████| 39/39 [00:23<00:00,  1.70it/s]


Validation Acc: 0.6601307189542484
Validation report:
               precision    recall  f1-score   support

           0     0.6491    0.4302    0.5175       172
           1     0.5714    0.5758    0.5736       132
           2     0.6959    0.8247    0.7548       308

    accuracy                         0.6601       612
   macro avg     0.6388    0.6102    0.6153       612
weighted avg     0.6559    0.6601    0.6490       612

Saved best model.
Epoch 3/30


Train: 100%|██████████| 345/345 [08:28<00:00,  1.48s/it]


Train loss: 0.43975068153996094


Eval: 100%|██████████| 39/39 [00:23<00:00,  1.64it/s]


Validation Acc: 0.6421568627450981
Validation report:
               precision    recall  f1-score   support

           0     0.5959    0.5058    0.5472       172
           1     0.4944    0.6667    0.5677       132
           2     0.7569    0.7078    0.7315       308

    accuracy                         0.6422       612
   macro avg     0.6157    0.6268    0.6155       612
weighted avg     0.6550    0.6422    0.6444       612

Epoch 4/30


Train: 100%|██████████| 345/345 [08:26<00:00,  1.47s/it]


Train loss: 0.2799917005432652


Eval: 100%|██████████| 39/39 [00:22<00:00,  1.72it/s]


Validation Acc: 0.6617647058823529
Validation report:
               precision    recall  f1-score   support

           0     0.6446    0.4535    0.5324       172
           1     0.5645    0.5303    0.5469       132
           2     0.7003    0.8344    0.7615       308

    accuracy                         0.6618       612
   macro avg     0.6365    0.6061    0.6136       612
weighted avg     0.6554    0.6618    0.6508       612

Saved best model.
Epoch 5/30


Train: 100%|██████████| 345/345 [08:20<00:00,  1.45s/it]


Train loss: 0.17970946411782782


Eval: 100%|██████████| 39/39 [00:22<00:00,  1.76it/s]


Validation Acc: 0.6503267973856209
Validation report:
               precision    recall  f1-score   support

           0     0.5696    0.5233    0.5455       172
           1     0.5966    0.5379    0.5657       132
           2     0.7075    0.7695    0.7372       308

    accuracy                         0.6503       612
   macro avg     0.6246    0.6102    0.6161       612
weighted avg     0.6448    0.6503    0.6463       612

Epoch 6/30


Train: 100%|██████████| 345/345 [08:17<00:00,  1.44s/it]


Train loss: 0.12149307366863661


Eval: 100%|██████████| 39/39 [00:22<00:00,  1.77it/s]


Validation Acc: 0.6633986928104575
Validation report:
               precision    recall  f1-score   support

           0     0.6496    0.4419    0.5260       172
           1     0.6275    0.4848    0.5470       132
           2     0.6768    0.8636    0.7589       308

    accuracy                         0.6634       612
   macro avg     0.6513    0.5968    0.6106       612
weighted avg     0.6585    0.6634    0.6477       612

Saved best model.
Epoch 7/30


Train: 100%|██████████| 345/345 [08:15<00:00,  1.43s/it]


Train loss: 0.08107698259681745


Eval: 100%|██████████| 39/39 [00:22<00:00,  1.76it/s]


Validation Acc: 0.6764705882352942
Validation report:
               precision    recall  f1-score   support

           0     0.6667    0.4651    0.5479       172
           1     0.6182    0.5152    0.5620       132
           2     0.6963    0.8636    0.7710       308

    accuracy                         0.6765       612
   macro avg     0.6604    0.6146    0.6270       612
weighted avg     0.6711    0.6765    0.6632       612

Saved best model.
Epoch 8/30


Train: 100%|██████████| 345/345 [08:15<00:00,  1.44s/it]


Train loss: 0.07392950105354373


Eval: 100%|██████████| 39/39 [00:21<00:00,  1.78it/s]


Validation Acc: 0.6617647058823529
Validation report:
               precision    recall  f1-score   support

           0     0.6074    0.4767    0.5342       172
           1     0.6154    0.4848    0.5424       132
           2     0.6944    0.8409    0.7606       308

    accuracy                         0.6618       612
   macro avg     0.6391    0.6008    0.6124       612
weighted avg     0.6529    0.6618    0.6499       612

Epoch 9/30


Train: 100%|██████████| 345/345 [08:11<00:00,  1.43s/it]


Train loss: 0.05131201605791376


Eval: 100%|██████████| 39/39 [00:21<00:00,  1.79it/s]


Validation Acc: 0.6421568627450981
Validation report:
               precision    recall  f1-score   support

           0     0.5850    0.5000    0.5392       172
           1     0.5812    0.5152    0.5462       132
           2     0.6868    0.7760    0.7287       308

    accuracy                         0.6422       612
   macro avg     0.6177    0.5970    0.6047       612
weighted avg     0.6354    0.6422    0.6361       612

Epoch 10/30


Train: 100%|██████████| 345/345 [08:11<00:00,  1.42s/it]


Train loss: 0.05463695562696226


Eval: 100%|██████████| 39/39 [00:21<00:00,  1.79it/s]


Validation Acc: 0.6519607843137255
Validation report:
               precision    recall  f1-score   support

           0     0.6364    0.3663    0.4649       172
           1     0.6018    0.5152    0.5551       132
           2     0.6700    0.8701    0.7571       308

    accuracy                         0.6520       612
   macro avg     0.6360    0.5839    0.5924       612
weighted avg     0.6458    0.6520    0.6314       612

Epoch 11/30


Train: 100%|██████████| 345/345 [08:14<00:00,  1.43s/it]


Train loss: 0.04893709722528952


Eval: 100%|██████████| 39/39 [00:21<00:00,  1.77it/s]


Validation Acc: 0.6633986928104575
Validation report:
               precision    recall  f1-score   support

           0     0.6786    0.4419    0.5352       172
           1     0.6122    0.4545    0.5217       132
           2     0.6716    0.8766    0.7606       308

    accuracy                         0.6634       612
   macro avg     0.6542    0.5910    0.6058       612
weighted avg     0.6608    0.6634    0.6457       612

Epoch 12/30


Train:  50%|█████     | 174/345 [04:09<04:04,  1.43s/it]