In [1]:
!pip install transformers torch torchvision scikit-learn pandas pillow tqdm openpyxl


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
!pip install ftfy


Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1


In [None]:


import sys
sys.path.append("/kaggle/input/dataset59")  

import os
import argparse
from pathlib import Path
import pandas as pd
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as T
from tqdm import tqdm


import timm
from sentence_transformers import SentenceTransformer


try:
    from normalizer import normalize
except Exception:
    def normalize(x): 
        return x  


def clean_text(text):
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""
    text = str(text).strip()
    return text


class MemeDataset(Dataset):
    def __init__(self, df, images_dir, tokenizer=None, max_length=128, image_size=224,
                 use_normalizer=True):
        self.df = df.reset_index(drop=True)
        self.images_dir = Path(images_dir)
        self.tokenizer = tokenizer  
        self.max_length = max_length
        self.image_size = image_size
        self.use_normalizer = use_normalizer

        self.transform = T.Compose([
            T.Resize((image_size, image_size)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        img_path = self.images_dir / row['image_file_name']
        try:
            img = Image.open(img_path).convert('RGB')
        except Exception:
            
            img = Image.new('RGB', (self.image_size, self.image_size), color=(0, 0, 0))
        img = self.transform(img)

        text = clean_text(row.get('text', ""))

        if self.use_normalizer and text:
            try:
                text = normalize(text)
            except Exception:
                pass

        
        input_ids = torch.zeros(self.max_length, dtype=torch.long)
        attention_mask = torch.zeros(self.max_length, dtype=torch.long)

        label = int(row['label'])
        return {
            'image': img,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long),
            'text': text
        }


def collate_fn(batch):
    images = torch.stack([b['image'] for b in batch])
    input_ids = torch.stack([b['input_ids'] for b in batch])
    attention_mask = torch.stack([b['attention_mask'] for b in batch])
    labels = torch.stack([b['label'] for b in batch])
    texts = [b['text'] for b in batch]
    return {
        'image': images,
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'texts': texts
    }


class MultimodalClassifier(nn.Module):
    """
    ConvNeXt base image encoder + SentenceTransformer text encoder fusion classifier.
    """
    def __init__(self,
                 sentence_model_name: str,
                 image_model_name: str = "convnext_base",
                 proj_dim: int = 512,
                 num_labels: int = 2,
                 freeze_image: bool = False,
                 freeze_text: bool = False):
        super().__init__()

        # SentenceTransformer text encoder
        self.text_encoder = SentenceTransformer(sentence_model_name)
        raw_text_dim = self.text_encoder.get_sentence_embedding_dimension()

        if freeze_text:
            for p in self.text_encoder.parameters():
                p.requires_grad = False

        self.text_proj = nn.Linear(raw_text_dim, proj_dim)

        # Image encoder (ConvNeXt from timm)
        self.image_encoder = timm.create_model(image_model_name, pretrained=True, num_classes=0)
        image_feat_dim = self.image_encoder.num_features

        if freeze_image:
            for p in self.image_encoder.parameters():
                p.requires_grad = False

        self.image_proj = nn.Linear(image_feat_dim, proj_dim)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(proj_dim * 2, proj_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(proj_dim, num_labels)
        )

    def forward(self, images, texts):
        """
        images: tensor [B, C, H, W]
        texts: list[str] of length B
        """
        # IMAGE: get features and project
        img_feat = self.image_encoder(images)           
        img_feat = self.image_proj(img_feat)            

        
        txt_feat = self.text_encoder.encode(texts, convert_to_tensor=True)  
        # move to same device as image features
        if txt_feat.device != img_feat.device:
            txt_feat = txt_feat.to(img_feat.device)
        txt_feat = self.text_proj(txt_feat)             

        # CONCAT + CLASSIFY
        fused = torch.cat([img_feat, txt_feat], dim=1)  
        logits = self.classifier(fused)
        return logits


def find_discrepancies(df, images_dir):
    images_dir = Path(images_dir)
    referenced = set(df['image_file_name'].astype(str).tolist())
    actual = set([p.name for p in images_dir.glob('*') if p.is_file()])
    missing = sorted(list(referenced - actual))
    orphan = sorted(list(actual - referenced))
    return missing, orphan


def prepare_dataframe(path, images_dir, drop_label_value=None):
    df = pd.read_excel(path)
    required_cols = {'image_file_name', 'text', 'label'}
    if not required_cols.issubset(set(df.columns)):
        raise AssertionError(f"metadata.xlsx must contain columns: {required_cols}")

    if drop_label_value is not None:
        df = df[df['label'] != drop_label_value].copy()

    df['image_file_name'] = df['image_file_name'].astype(str).str.strip()

    missing, orphan = find_discrepancies(df, images_dir)
    if missing:
        print(f"Missing images for {len(missing)} metadata entries; these rows will be dropped.")
        df = df[~df['image_file_name'].isin(missing)].copy()

    if orphan:
        print(f"Found {len(orphan)} orphan image files not in metadata (showing up to 20):")
        for o in orphan[:20]:
            print("  -", o)
        if len(orphan) > 20:
            print("  ... and more")

    unique_labels = sorted(df['label'].unique().tolist())
    label_map = {orig: idx for idx, orig in enumerate(unique_labels)}
    df['label'] = df['label'].map(label_map)
    print("Label mapping:", label_map)
    return df, orphan, label_map


def compute_sample_weights(df, power=0.5):
    counts = df['label'].value_counts().sort_index().values
    # avoid division by zero
    counts = np.maximum(counts, 1)
    weights = (1.0 / counts) ** power
    weights = weights / weights.sum() * len(weights)
    sample_weights = df['label'].map(lambda x: weights[x]).values
    return sample_weights


def train_one_epoch(model, dataloader, optimizer, criterion, device, scheduler=None):
    model.train()
    total_loss = 0.0
    for batch in tqdm(dataloader, desc="Train", leave=False):
        images = batch['image'].to(device)
        labels = batch['labels'].to(device)
        texts = batch['texts']

        optimizer.zero_grad()
        logits = model(images=images, texts=texts)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

        total_loss += loss.item() * images.size(0)
    return total_loss / len(dataloader.dataset)


@torch.no_grad()
def evaluate(model, dataloader, device, label_map):
    model.eval()
    preds = []
    trues = []
    for batch in tqdm(dataloader, desc="Eval", leave=False):
        images = batch['image'].to(device)
        labels = batch['labels'].to(device)
        texts = batch['texts']

        logits = model(images=images, texts=texts)
        batch_preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()
        batch_trues = labels.cpu().numpy().tolist()
        preds.extend(batch_preds)
        trues.extend(batch_trues)

    acc = accuracy_score(trues, preds)
    report = classification_report(trues, preds, digits=4)
    cm = confusion_matrix(trues, preds)
    return acc, report, trues, preds, cm


def main(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)

    df, orphan_files, label_map = prepare_dataframe(args.data, args.images_dir, drop_label_value=args.drop_label_value)

    if args.delete_orphans and orphan_files:
        for fname in orphan_files:
            p = Path(args.images_dir) / fname
            try:
                p.unlink()
            except Exception as e:
                print("Could not delete:", p, e)
        print("Deleted orphans.")

    # splits
    train_df, test_df = train_test_split(df, test_size=args.test_size, stratify=df['label'], random_state=42)
    train_df, val_df = train_test_split(train_df, test_size=args.val_size, stratify=train_df['label'], random_state=42)
    print(f"Train / Val / Test sizes: {len(train_df)} / {len(val_df)} / {len(test_df)}")

    # Datasets & Dataloaders
    tokenizer = None  # not needed for SentenceTransformer; kept for compatibility
    train_dataset = MemeDataset(train_df, args.images_dir, tokenizer, max_length=args.max_length,
                                image_size=args.image_size, use_normalizer=not args.disable_normalizer)
    val_dataset = MemeDataset(val_df, args.images_dir, tokenizer, max_length=args.max_length,
                              image_size=args.image_size, use_normalizer=not args.disable_normalizer)
    test_dataset = MemeDataset(test_df, args.images_dir, tokenizer, max_length=args.max_length,
                               image_size=args.image_size, use_normalizer=not args.disable_normalizer)

    sample_weights = compute_sample_weights(train_df, power=args.weight_power)
    sampler = WeightedRandomSampler(sample_weights, num_samples=len(train_dataset), replacement=True)

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, sampler=sampler,
                              collate_fn=collate_fn, num_workers=args.num_workers, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False,
                            collate_fn=collate_fn, num_workers=args.num_workers, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False,
                             collate_fn=collate_fn, num_workers=args.num_workers, pin_memory=True)

    num_labels = len(label_map)
    model = MultimodalClassifier(sentence_model_name=args.sentence_model,
                                 image_model_name=args.image_model,
                                 proj_dim=args.proj_dim,
                                 num_labels=num_labels,
                                 freeze_image=args.freeze_image,
                                 freeze_text=args.freeze_text)
    model.to(device)

    # Loss, optimizer, scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()),
                                  lr=args.lr, weight_decay=args.weight_decay)

    total_steps = len(train_loader) * args.epochs
    warmup_steps = int(0.1 * total_steps) if total_steps > 0 else 0

    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / float(max(1, warmup_steps))
        return max(0.0, float(total_steps - current_step) / float(max(1, total_steps - warmup_steps)))

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    best_val_acc = 0.0
    os.makedirs(args.out_dir, exist_ok=True)

    print("\nStarting training...")
    for epoch in range(1, args.epochs + 1):
        print(f"\nEpoch {epoch}/{args.epochs}")
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, scheduler)
        print(f"Train loss: {train_loss:.6f}")

        val_acc, val_report, _, _, val_cm = evaluate(model, val_loader, device, label_map)
        print(f"Validation Acc: {val_acc:.4f}")
        if args.verbose:
            print("Validation report:\n", val_report)
            print("Validation confusion matrix:\n", val_cm)

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save({
                'model_state_dict': model.state_dict(),
                'label_map': label_map,
                'epoch': epoch,
                'val_acc': val_acc
            }, os.path.join(args.out_dir, "best_model.pt"))
            print(f"✓ Saved best model (val_acc: {val_acc:.4f})")

    # final evaluation on test
    print("\n" + "="*60)
    print("FINAL EVALUATION ON TEST SET")
    print("="*60)
    ckpt_path = os.path.join(args.out_dir, "best_model.pt")
    if os.path.exists(ckpt_path):
        ckpt = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(ckpt['model_state_dict'])
        print(f"Loaded model from epoch {ckpt.get('epoch', '?')} with val_acc: {ckpt.get('val_acc', 0):.4f}")
    else:
        print("No checkpoint found, using current model weights.")

    test_acc, test_report, trues, preds, test_cm = evaluate(model, test_loader, device, label_map)
    print(f"\n{'='*60}")
    print(f"TEST ACCURACY: {test_acc:.4f} ({test_acc*100:.2f}%)")
    print(f"{'='*60}")
    print("\nTest Classification Report:")
    print(test_report)
    print("\nTest Confusion Matrix:")
    print(test_cm)

    # Save predictions CSV
    out = test_df.reset_index(drop=True).copy()
    out['pred_idx'] = preds
    inv_map = {v: k for k, v in label_map.items()}
    out['pred_orig'] = out['pred_idx'].map(inv_map)
    out['true_orig'] = out['label'].map(inv_map)
    out.to_csv(os.path.join(args.out_dir, "test_predictions.csv"), index=False)
    print(f"\nResults saved to {args.out_dir}/")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, default='/kaggle/input/dataset59/metadata.xlsx')
    parser.add_argument('--images_dir', type=str, default='/kaggle/input/dataset59/images')
    parser.add_argument('--out_dir', type=str, default='/kaggle/working/output')
    parser.add_argument('--epochs', type=int, default=10)    
    parser.add_argument('--batch_size', type=int, default=10)         
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--weight_decay', type=float, default=0.005)
    parser.add_argument('--sentence_model', type=str,
                        default='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    parser.add_argument('--image_model', type=str, default='convnext_base')
    parser.add_argument('--proj_dim', type=int, default=512)
    parser.add_argument('--max_length', type=int, default=128)
    parser.add_argument('--image_size', type=int, default=224)
    parser.add_argument('--val_size', type=float, default=0.1)
    parser.add_argument('--test_size', type=float, default=0.1)
    parser.add_argument('--drop_label_value', type=int, default=None,
                        help='If you want to drop a specific label value from metadata (optional)')
    parser.add_argument('--weight_power', type=float, default=0.5)
    parser.add_argument('--freeze_text', action='store_true')
    parser.add_argument('--freeze_image', action='store_true')
    parser.add_argument('--delete_orphans', action='store_true')
    parser.add_argument('--disable_normalizer', action='store_true', dest='disable_normalizer')
    parser.add_argument('--num_workers', type=int, default=0)
    parser.add_argument('--verbose', action='store_true')

    
    parser.add_argument('--delete-orphans', dest='delete_orphans', action='store_true')

    args = parser.parse_args([])

    
    
    args.delete_orphans = getattr(args, 'delete_orphans', False)
    args.disable_normalizer = getattr(args, 'disable_normalizer', False)
    args.freeze_text = getattr(args, 'freeze_text', False)
    args.freeze_image = getattr(args, 'freeze_image', False)

    main(args)


2025-11-24 10:36:41.322516: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763980601.500174      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763980601.553024      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Device: cuda
Label mapping: {0: 0, 1: 1, 2: 2, 3: 3}
Train / Val / Test sizes: 5510 / 613 / 681


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/354M [00:00<?, ?B/s]


Starting training...

Epoch 1/10


                                                        

Train loss: 1.132968


                                                     

Validation Acc: 0.5612
✓ Saved best model (val_acc: 0.5612)

Epoch 2/10


                                                        

Train loss: 0.830880


                                                     

Validation Acc: 0.5530

Epoch 3/10


                                                        

Train loss: 0.551097


                                                     

Validation Acc: 0.5465

Epoch 4/10


                                                        

Train loss: 0.312597


                                                     

Validation Acc: 0.5449

Epoch 5/10


                                                        

Train loss: 0.177263


                                                     

Validation Acc: 0.5285

Epoch 6/10


                                                        

Train loss: 0.125016


                                                     

Validation Acc: 0.5122

Epoch 7/10


                                                        

Train loss: 0.097969


                                                     

Validation Acc: 0.5498

Epoch 8/10


                                                        

Train loss: 0.069008


                                                     

Validation Acc: 0.5334

Epoch 9/10


                                                        

Train loss: 0.041195


                                                     

Validation Acc: 0.5188

Epoch 10/10


                                                        

Train loss: 0.040054


                                                     

Validation Acc: 0.5285

FINAL EVALUATION ON TEST SET


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL numpy.core.multiarray.scalar was not an allowed global by default. Please use `torch.serialization.add_safe_globals([scalar])` or the `torch.serialization.safe_globals([scalar])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.