In [10]:
!pip install transformers torch torchvision scikit-learn pandas pillow tqdm openpyxl


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [11]:
!pip install ftfy


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
import sys
sys.path.append("/kaggle/input/dataset6")

import os
import argparse
from pathlib import Path
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
from normalizer import normalize


class MemeDataset(Dataset):
    """Dataset that pairs each meme image with its tokenized text and label.

    Every item is a dict with the keys:
      - 'image': tensor produced by the resize / ToTensor / Normalize pipeline
      - 'input_ids', 'attention_mask': tokenizer outputs with the leading
        batch dimension squeezed away
      - 'label': scalar ``torch.long`` tensor

    Args:
        df: DataFrame providing the columns 'image_file_name', 'text' and
            'label'. Its index is reset so rows can be addressed by position.
        images_dir: Directory that the image file names are resolved against.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')``; the result must expose
            'input_ids' and 'attention_mask'.
        max_length: Sequence length every text is padded / truncated to.
        image_size: Side length (pixels) images are resized to; also the size
            of the blank placeholder used when an image cannot be loaded.
        use_normalizer: When True, non-empty text is passed through
            ``normalize`` before tokenization.
    """

    def __init__(self, df, images_dir, tokenizer, max_length=128, image_size=224, use_normalizer=True):
        self.df = df.reset_index(drop=True)
        self.images_dir = Path(images_dir)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_size = image_size
        self.use_normalizer = use_normalizer

        self.transform = T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),  # produces float in [0,1]
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Number of rows (samples) in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, preprocess and return the sample at position ``idx``."""
        # The index was reset in __init__, so positional access is what we want.
        row = self.df.iloc[idx]
        img_path = self.images_dir / str(row['image_file_name'])
        try:
            # The context manager releases the file handle as soon as the
            # pixel data has been copied into the converted image.
            with Image.open(img_path) as raw:
                img = raw.convert('RGB')
        except Exception as e:
            # Best effort: keep training going with a black placeholder, but
            # make the failure visible instead of hiding it.
            print(f"Warning: Could not load image {img_path}: {e}; using a blank image instead")
            img = Image.new('RGB', (self.image_size, self.image_size), color=(0, 0, 0))
        img = self.transform(img)  # tensor [C,H,W], normalized

        # Missing text (NaN/None) is treated as an empty string.
        text = str(row['text']) if pd.notna(row['text']) else ""

        # Normalize text using the normalizer; on failure fall back to the raw text.
        if self.use_normalizer and text:
            try:
                text = normalize(text)
            except Exception as e:
                print(f"Warning: Normalization failed for text at index {idx}: {e}")

        tok = self.tokenizer(text, truncation=True, padding='max_length',
                             max_length=self.max_length, return_tensors='pt')
        # return_tensors='pt' adds a batch dimension of size 1; drop it.
        input_ids = tok['input_ids'].squeeze(0)
        attention_mask = tok['attention_mask'].squeeze(0)

        label = int(row['label'])
        return {
            'image': img,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }


class MultimodalClassifier(nn.Module):
    """Late-fusion classifier over a text encoder and a ViT image encoder.

    The first-position hidden state of each encoder is linearly projected,
    the two projections are concatenated, and a small MLP maps the fused
    vector to ``num_labels`` logits.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: Number of output logits.
        text_feat_dim: Output width of the text projection layer.
        hidden_dim: Width of the hidden layer in the classification head.
        dropout: Dropout probability used inside the classification head.
        freeze_text: If True, the text encoder's parameters do not receive gradients.
        freeze_image: If True, the image encoder's parameters do not receive gradients.
    """

    def __init__(
        self,
        text_model_name='csebuetnlp/banglishbert',
        num_labels=3,
        text_feat_dim=768,
        hidden_dim=512,
        dropout=0.2,
        freeze_text=False,
        freeze_image=False
    ):
        super().__init__()

        # --- text branch: pretrained encoder + linear projection ---
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        if freeze_text:
            self.text_encoder.requires_grad_(False)
        self.text_proj = nn.Linear(self.text_encoder.config.hidden_size, text_feat_dim)

        # --- image branch: pretrained ViT + same-width linear projection ---
        from transformers import ViTModel
        self.image_encoder = ViTModel.from_pretrained("google/vit-base-patch16-224")
        if freeze_image:
            self.image_encoder.requires_grad_(False)
        vit_width = self.image_encoder.config.hidden_size
        self.image_proj = nn.Linear(vit_width, vit_width)

        # --- fusion head operating on the concatenated features ---
        self.classifier = nn.Sequential(
            nn.Linear(text_feat_dim + vit_width, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_labels),
        )

    def forward(self, images, input_ids, attention_mask):
        """Compute class logits for a batch.

        Args:
            images: Tensor passed to the image encoder as ``pixel_values``.
            input_ids: Token ids passed to the text encoder.
            attention_mask: Attention mask passed to the text encoder.

        All three inputs are moved to the device of the model's parameters
        before use.

        Returns:
            Logits tensor produced by the classification head.
        """
        target = next(self.parameters()).device
        images, input_ids, attention_mask = (
            t.to(target) for t in (images, input_ids, attention_mask)
        )

        # Image features: hidden state at position 0 (CLS), then projection.
        vit_out = self.image_encoder(pixel_values=images)
        image_feat = self.image_proj(vit_out.last_hidden_state[:, 0, :])

        # Text features: hidden state at position 0 (CLS), then projection.
        bert_out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_proj(bert_out.last_hidden_state[:, 0, :])

        # Concatenate image features first, then text, and classify.
        return self.classifier(torch.cat([image_feat, text_feat], dim=1))


def find_discrepancies(df, images_dir):
    """Compare file names listed in the metadata with files present on disk.

    Args:
        df: DataFrame with an 'image_file_name' column.
        images_dir: Directory whose direct children (regular files only) are
            compared against the metadata.

    Returns:
        (missing, orphan): two sorted lists of file names. ``missing`` are
        names referenced in ``df`` but absent from ``images_dir``; ``orphan``
        are files in ``images_dir`` that ``df`` never references.
    """
    in_metadata = set(df['image_file_name'].astype(str))
    on_disk = {entry.name for entry in Path(images_dir).glob('*') if entry.is_file()}
    return sorted(in_metadata - on_disk), sorted(on_disk - in_metadata)


def prepare_dataframe(path, images_dir, drop_label_value=2):
    """Load the metadata spreadsheet and reconcile it with the image folder.

    Steps: read the Excel file, check the required columns, drop rows whose
    label equals ``drop_label_value``, strip whitespace from file names,
    remove rows whose image is missing on disk, report orphan images, and
    remap the remaining labels to consecutive integers starting at 0.

    Args:
        path: Path of the Excel metadata file.
        images_dir: Directory containing the image files.
        drop_label_value: Label value whose rows are discarded.

    Returns:
        (df, orphan, label_map): the cleaned DataFrame with remapped labels,
        the sorted list of image files not referenced by the metadata, and
        the mapping from original label value to new index.
    """
    required_columns = ('image_file_name', 'text', 'label')
    df = pd.read_excel(path)
    assert all(col in df.columns for col in required_columns), \
        "metadata.xlsx must contain columns: image_file_name, text, label"

    keep_rows = df['label'] != drop_label_value
    df = df[keep_rows].copy()
    df['image_file_name'] = df['image_file_name'].astype(str).str.strip()

    missing, orphan = find_discrepancies(df, images_dir)

    if missing:
        print(f"Missing images for {len(missing)} metadata entries")
        has_image = ~df['image_file_name'].isin(missing)
        df = df[has_image].copy()

    if orphan:
        print(f"Found {len(orphan)} orphan image files not in metadata:")
        # Only preview the first 20 names to keep the log readable.
        preview_limit = 20
        for name in orphan[:preview_limit]:
            print("  -", name)
        if len(orphan) > preview_limit:
            print("  ... and more")

    # Map the surviving original label values onto 0..K-1 in sorted order.
    label_map = {
        original: new_index
        for new_index, original in enumerate(sorted(df['label'].unique().tolist()))
    }
    df['label'] = df['label'].map(label_map)
    print("Label mapping:", label_map)
    return df, orphan, label_map


def compute_class_weights(df):
    """Return one sampling weight per row: 1 / (number of rows sharing its label).

    The weight is looked up by label *value* rather than by array position,
    so the result is correct even when the labels present in ``df`` are not
    exactly 0..K-1 (for example when a class is absent from this subset).

    Args:
        df: DataFrame with a 'label' column.

    Returns:
        numpy array with one float weight per row of ``df``, in row order.
    """
    labels = df['label']
    # Series indexed by label value -> inverse frequency of that label.
    inverse_frequency = 1.0 / labels.value_counts()
    return labels.map(inverse_frequency).values


def collate_fn(batch):
    """Combine per-sample dicts into a single batch dict of stacked tensors.

    Args:
        batch: Sequence of dicts, each holding tensors under 'image',
            'input_ids', 'attention_mask' and 'label'. Tensors under the
            same key must share a shape so they can be stacked.

    Returns:
        Dict with keys 'image', 'input_ids', 'attention_mask' and 'labels'
        (note the plural), each stacked along a new leading dimension.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch])

    return {
        'image': _stacked('image'),
        'input_ids': _stacked('input_ids'),
        'attention_mask': _stacked('attention_mask'),
        'labels': _stacked('label'),
    }


def train_one_epoch(model, dataloader, optimizer, device):
    """Train ``model`` for one pass over ``dataloader`` with cross-entropy loss.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``
            and returning logits.
        dataloader: Iterable of batch dicts with keys 'image', 'input_ids',
            'attention_mask' and 'labels'.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Mean per-sample loss over the samples actually processed in this
        epoch, or NaN if the dataloader yielded no samples.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    loss_sum = 0.0
    samples_seen = 0
    for batch in tqdm(dataloader, desc="Train"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        logits = model(images, input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight by batch size so the final
        # average is per sample even when the last batch is smaller.
        batch_size = images.size(0)
        loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    # Divide by what was actually iterated: a sampler or drop_last can make
    # this differ from len(dataloader.dataset).
    if samples_seen == 0:
        return float('nan')
    return loss_sum / samples_seen


@torch.no_grad()
def evaluate(model, dataloader, device, label_map):
    """Run ``model`` over ``dataloader`` in eval mode and score its predictions.

    Args:
        model: Module called as ``model(images, input_ids, attention_mask)``
            and returning logits.
        dataloader: Iterable of batch dicts with keys 'image', 'input_ids',
            'attention_mask' and 'labels'.
        device: Device the batch tensors are moved to.
        label_map: Accepted for interface compatibility; not used here.

    Returns:
        (accuracy, report, trues, preds): accuracy score, the text
        classification report (4 digits), and the lists of true and
        predicted class indices in iteration order.
    """
    model.eval()
    trues, preds = [], []
    for batch in tqdm(dataloader, desc="Eval"):
        images = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        logits = model(images, input_ids, attention_mask)
        # Predicted class = index of the largest logit per sample.
        preds += logits.argmax(dim=1).tolist()
        trues += labels.tolist()

    accuracy = accuracy_score(trues, preds)
    summary = classification_report(trues, preds, digits=4)
    return accuracy, summary, trues, preds


def main(args):
    """Run the whole experiment: prepare data, train, select, test, export.

    The model with the best validation accuracy seen during this run is
    checkpointed to ``<out_dir>/best_model.pt`` and used for the final test
    evaluation. Test predictions are written to
    ``<out_dir>/test_predictions.csv``.

    Args:
        args: Namespace providing ``data``, ``images_dir``, ``out_dir``,
            ``epochs``, ``batch_size``, ``lr``, ``text_model``,
            ``max_length``, ``image_size``, ``val_size``, ``test_size``,
            ``text_feat_dim``, ``hidden_dim``, ``dropout``, ``freeze_text``,
            ``freeze_image``, ``delete_orphans`` and ``disable_normalizer``.
            An optional ``seed`` attribute (default 42) seeds torch's RNG.
    """
    # The data split is already seeded; seed torch too so that weight
    # initialisation, dropout and the weighted sampler are repeatable.
    torch.manual_seed(getattr(args, 'seed', 42))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    df, orphan_files, label_map = prepare_dataframe(args.data, args.images_dir, drop_label_value=2)

    if args.delete_orphans and orphan_files:
        for fname in orphan_files:
            p = Path(args.images_dir) / fname
            try:
                p.unlink()
            except Exception as e:
                print("Could not delete:", p, e)
        print("Deleted orphans.")

    # Stratified split: first carve out the test set, then a validation set
    # from what remains.
    train_df, test_df = train_test_split(df, test_size=args.test_size, stratify=df['label'], random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=args.val_size, stratify=train_df['label'], random_state=42)
    print("Train / Val / Test sizes:", len(train_df), len(val_df), len(test_df))

    tokenizer = AutoTokenizer.from_pretrained(args.text_model)

    use_normalizer = not args.disable_normalizer
    if use_normalizer:
        print("Text normalization enabled")
    else:
        print("Text normalization disabled")

    dataset_kwargs = dict(max_length=args.max_length, image_size=args.image_size,
                          use_normalizer=use_normalizer)
    train_dataset = MemeDataset(train_df, args.images_dir, tokenizer, **dataset_kwargs)
    val_dataset = MemeDataset(val_df, args.images_dir, tokenizer, **dataset_kwargs)
    test_dataset = MemeDataset(test_df, args.images_dir, tokenizer, **dataset_kwargs)

    # Oversample rare classes during training via inverse-frequency weights.
    sample_weights = compute_class_weights(train_df)
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(train_dataset), replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, collate_fn=collate_fn)

    num_labels = len(label_map)
    model = MultimodalClassifier(text_model_name=args.text_model,
                                 num_labels=num_labels,
                                 text_feat_dim=args.text_feat_dim,
                                 hidden_dim=args.hidden_dim,
                                 dropout=args.dropout,
                                 freeze_text=args.freeze_text,
                                 freeze_image=args.freeze_image)
    model.to(device)

    # Only optimise parameters that were not frozen.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr)

    os.makedirs(args.out_dir, exist_ok=True)
    ckpt_path = os.path.join(args.out_dir, "best_model.pt")

    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint, and remember whether THIS run wrote one so a stale file
    # from an earlier run is never loaded by mistake.
    best_val_acc = float('-inf')
    saved_checkpoint = False

    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}/{args.epochs}")
        train_loss = train_one_epoch(model, train_loader, optimizer, device)
        print("Train loss:", train_loss)
        val_acc, val_report, _, _ = evaluate(model, val_loader, device, label_map)
        print("Validation Acc:", val_acc)
        print("Validation report:\n", val_report)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({'model_state_dict': model.state_dict(), 'label_map': label_map},
                       ckpt_path)
            saved_checkpoint = True
            print("Saved best model.")

    if saved_checkpoint:
        print("Testing best model …")
        ckpt = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
    else:
        # e.g. epochs == 0: nothing was trained, so there is nothing to restore.
        print("No checkpoint was saved in this run; testing the current model weights.")

    test_acc, test_report, trues, preds = evaluate(model, test_loader, device, label_map)
    print("Test Acc:", test_acc)
    print("Test report:\n", test_report)

    # Export predictions alongside the test rows, with indices translated
    # back to the original label values.
    out = test_df.reset_index(drop=True).copy()
    out['pred_idx'] = preds
    inv_map = {v: k for k, v in label_map.items()}
    out['pred_orig'] = out['pred_idx'].map(inv_map)
    preds_path = os.path.join(args.out_dir, "test_predictions.csv")
    out.to_csv(preds_path, index=False)
    print("Saved test predictions to", preds_path)


if __name__ == "__main__":
    # Experiment configuration expressed as an argparse parser. The parser is
    # fed an empty argument list, so every option takes its default value and
    # the notebook kernel's own command-line arguments are never consulted.
    parser = argparse.ArgumentParser()

    # Options that carry a value: (flag, type, default).
    _valued_options = (
        ('--data', str, '/kaggle/input/dataset6/metadata.xlsx'),
        ('--images_dir', str, '/kaggle/input/dataset6/images'),
        ('--out_dir', str, '/kaggle/working/output'),
        ('--epochs', int, 10),
        ('--batch_size', int, 16),
        ('--lr', float, 2e-5),
        ('--text_model', str, 'csebuetnlp/banglishbert'),
        ('--max_length', int, 64),
        ('--image_size', int, 224),
        ('--val_size', float, 0.1),
        ('--test_size', float, 0.1),
        ('--text_feat_dim', int, 768),  # match BERT hidden
        ('--hidden_dim', int, 512),
        ('--dropout', float, 0.2),
    )
    for _flag, _type, _default in _valued_options:
        parser.add_argument(_flag, type=_type, default=_default)

    # Boolean switches, all off unless given.
    for _flag in ('--freeze_text', '--freeze_image', '--delete-orphans'):
        parser.add_argument(_flag, action='store_true')
    parser.add_argument('--disable-normalizer', action='store_true',
                        help='Disable text normalization (enabled by default)')

    args = parser.parse_args([])
    main(args)


Device: cuda
Found 4 orphan image files not in metadata:
  - FB_IMG_1751540473613.jpg
  - FB_IMG_1751739942837.jpg
  - FB_IMG_1754929300743.jpg
  - FB_IMG_1755921270397.jpg
Label mapping: {0: 0, 1: 1, 3: 2}
Train / Val / Test sizes: 5508 612 680
Text normalization enabled


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10


Train: 100%|██████████| 345/345 [06:38<00:00,  1.16s/it]


Train loss: 0.8855725892142446


Eval: 100%|██████████| 39/39 [00:25<00:00,  1.51it/s]


Validation Acc: 0.6258169934640523
Validation report:
               precision    recall  f1-score   support

           0     0.7833    0.2733    0.4052       172
           1     0.4434    0.7121    0.5465       132
           2     0.7118    0.7857    0.7469       308

    accuracy                         0.6258       612
   macro avg     0.6462    0.5904    0.5662       612
weighted avg     0.6740    0.6258    0.6076       612

Saved best model.
Epoch 2/10


Train: 100%|██████████| 345/345 [06:43<00:00,  1.17s/it]


Train loss: 0.48130913288721416


Eval: 100%|██████████| 39/39 [00:23<00:00,  1.65it/s]


Validation Acc: 0.5800653594771242
Validation report:
               precision    recall  f1-score   support

           0     0.7763    0.3430    0.4758       172
           1     0.3822    0.7500    0.5064       132
           2     0.7112    0.6396    0.6735       308

    accuracy                         0.5801       612
   macro avg     0.6232    0.5775    0.5519       612
weighted avg     0.6585    0.5801    0.5819       612

Epoch 3/10


Train: 100%|██████████| 345/345 [06:41<00:00,  1.16s/it]


Train loss: 0.24895207187297472


Eval: 100%|██████████| 39/39 [00:23<00:00,  1.65it/s]


Validation Acc: 0.6617647058823529
Validation report:
               precision    recall  f1-score   support

           0     0.5976    0.5698    0.5833       172
           1     0.6623    0.3864    0.4880       132
           2     0.6900    0.8312    0.7541       308

    accuracy                         0.6618       612
   macro avg     0.6500    0.5958    0.6085       612
weighted avg     0.6581    0.6618    0.6487       612

Saved best model.
Epoch 4/10


Train: 100%|██████████| 345/345 [06:40<00:00,  1.16s/it]


Train loss: 0.13539565882132204


Eval:  77%|███████▋  | 30/39 [00:18<00:05,  1.61it/s]