In [1]:
!pip install -U gdown

import gdown

# 第一个文件
url1 = "https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf"
output1 = "memes.zip"  # 你可以改成实际文件名或路径
gdown.download(url1, output1, quiet=False)

# 第二个文件
url2 = "https://drive.google.com/uc?id=1WhUQxJ2b1SjY5geBmthr8CvnR2GTt5l4"
output2 = "processed_data_all_labels.csv"
gdown.download(url2, output2, quiet=False)



Downloading...
From (original): https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf
From (redirected): https://drive.google.com/uc?id=1PbdlIg0p8Lm8HO2Wv17b5vzelXONpbuf&confirm=t&uuid=f106769f-c812-4f4e-bdfe-03c92db3ceeb
To: /content/memes.zip
100%|██████████| 557M/557M [00:11<00:00, 49.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1WhUQxJ2b1SjY5geBmthr8CvnR2GTt5l4
To: /content/processed_data_all_labels.csv
100%|██████████| 1.04M/1.04M [00:00<00:00, 9.57MB/s]


'processed_data_all_labels.csv'

In [2]:
# Unzip the downloaded file file1.ext
import zipfile
import os

# Specify the zip file path
zip_path = '/content/memes.zip' # Path to file1.ext
extract_dir = '/content/'  # Extract to Colab local storage (assuming this contains the images)

# Create extraction directory
os.makedirs(extract_dir, exist_ok=True)

# Check if the zip file exists
if not os.path.exists(zip_path):
    print(f"❌ Error: Zip file not found at {zip_path}")
else:
    # Extract the zip file
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print(f"✅ Extraction complete for {os.path.basename(zip_path)}. Files are located at: {extract_dir}")
    except zipfile.BadZipFile:
        print(f"❌ Error: File at {zip_path} is not a valid zip file.")
    except Exception as e:
        print(f"❌ An unexpected error occurred during extraction of {os.path.basename(zip_path)}: {e}")

✅ Extraction complete for memes.zip. Files are located at: /content/


In [3]:
!pip install -U sentence-transformers

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
from PIL import Image
from tqdm import tqdm
import ast
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Dataset and collate_fn
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir):
        self.dataframe = pd.read_csv(csv_file)
        self.image_dir = image_dir

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = str(self.dataframe.iloc[idx]['id'])
        possible_extensions = ['.jpeg', '.jpg', '.png', '.JPEG', '.JPG', '.PNG']

        img_path = None
        for ext in possible_extensions:
            temp_path = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(temp_path):
                img_path = temp_path
                break

        if img_path is None:
            raise FileNotFoundError(f"Image not found: {img_id}")

        image = Image.open(img_path).convert("RGB")
        text = self.dataframe.iloc[idx]['text']

        label_str = self.dataframe.iloc[idx]['task4_hard']
        label_vec = ast.literal_eval(label_str)
        label_idx = label_vec.index(1.0)
        label = torch.tensor(label_idx, dtype=torch.float)

        return {'image': image, 'text': text, 'label': label}

def custom_collate_fn(batch):
    images = [item['image'] for item in batch]
    texts = [item['text'] for item in batch]
    labels = torch.stack([item['label'] for item in batch])

    return {
        'image': images,
        'text': texts,
        'label': labels
    }

Using device: cuda


# Task
Evaluate the trained multimodal classifier model on a subset of the memes dataset and report the accuracy.

## Load the trained model

### Subtask:
Load the model state dictionary from the saved file.


**Reasoning**:
Instantiate the model, load the state dictionary, and move the model to the correct device.



## Prepare evaluation data

### Subtask:
Prepare evaluation data by creating a new `MemeDataset` and `DataLoader` for a subset of the data.


**Reasoning**:
Prepare evaluation data by creating a new MemeDataset and DataLoader for a subset of the data.



In [4]:
from torch.utils.data import random_split

CSV_PATH = 'processed_data_all_labels.csv'
IMAGE_DIR = 'memes'

# Create a new instance of the MemeDataset class
full_dataset = MemeDataset(csv_file=CSV_PATH, image_dir=IMAGE_DIR)

# Determine the sizes for the training and evaluation sets
dataset_size = len(full_dataset)
eval_size = int(0.2 * dataset_size) # 20% for evaluation
train_size = dataset_size - eval_size # Remaining for training

# Split the dataset into training and evaluation sets
train_dataset, eval_dataset = random_split(full_dataset, [train_size, eval_size])

# Create a DataLoader for the evaluation dataset
EVAL_BATCH_SIZE = 16 # Or 32, depending on memory
eval_loader = DataLoader(eval_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

print(f"Full dataset size: {dataset_size}")
print(f"Training dataset size: {train_size}")
print(f"Evaluation dataset size: {eval_size}")
print("✅ Evaluation DataLoader created.")

Full dataset size: 4044
Training dataset size: 3236
Evaluation dataset size: 808
✅ Evaluation DataLoader created.


## Evaluate the model

### Subtask:
Iterate through the evaluation DataLoader, get predictions from the model, and calculate accuracy.


**Reasoning**:
Iterate through the evaluation DataLoader, get predictions from the model, and calculate accuracy.



## Load and split data

### Subtask:
Load the dataset and split it into training, validation, and test sets using an 8:1:1 ratio.


**Reasoning**:
Load the CSV, create the dataset, calculate split sizes, and perform the random split.



In [5]:
# Load the CSV file
dataframe = pd.read_csv(CSV_PATH)

# Create the full dataset
full_dataset = MemeDataset(csv_file=CSV_PATH, image_dir=IMAGE_DIR)

# Calculate dataset sizes for 8:1:1 split
dataset_size = len(full_dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size # Allocate remaining to test

# Split the dataset
train_dataset, val_dataset, test_dataset = random_split(full_dataset, [train_size, val_size, test_size])

print(f"Full dataset size: {dataset_size}")
print(f"Training dataset size: {train_size}")
print(f"Validation dataset size: {val_size}")
print(f"Test dataset size: {test_size}")

Full dataset size: 4044
Training dataset size: 3235
Validation dataset size: 404
Test dataset size: 405


## Create dataloaders

### Subtask:
Create DataLoaders for the training, validation, and test sets.


**Reasoning**:
Create DataLoaders for the training, validation, and test sets using the defined batch size and the custom collate function.



In [6]:
# Define batch size
BATCH_SIZE = 16 # You can adjust this if needed

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

print(f"Batch size for DataLoaders: {BATCH_SIZE}")
print("✅ DataLoaders for training, validation, and test sets created.")

Batch size for DataLoaders: 16
✅ DataLoaders for training, validation, and test sets created.


## Define and initialize model

### Subtask:
Define and initialize the multimodal classifier model.


**Reasoning**:
Instantiate the MultimodalClassifier model, move it to the device, and print a confirmation message.



## Train the model with validation

### Subtask:
Train the model on the training set and evaluate it on the validation set during training to monitor performance and detect overfitting.


**Reasoning**:
Set the model to training mode, define the loss function and optimizer, and then iterate through epochs to train the model on the training data and evaluate on the validation data.



**Reasoning**:
Load the dataset into a pandas DataFrame and display the first few rows and the column names and their data types to understand the data structure.



**Reasoning**:
The previous attempt to load the data failed because the file 'train.csv' was not found. The correct file path is '/data/train.csv'. I will try loading the data again using the correct path.



**Reasoning**:
The dataset was not found in the previous two attempts. I will try loading the dataset from the default path '/kaggle/input/train.csv' which is common in Kaggle environments.



**Reasoning**:
The previous attempts to load the data failed. Since the path to the dataset is still unknown, I will try listing the files in the current directory and in the '/kaggle/input/' directory to locate the correct file path for 'train.csv'.



**Reasoning**:
Based on the file listing, the 'train.csv' file is not in the current directory, '/kaggle/input/', or '/data/'. However, 'processed_data_all_labels.csv' is present in the current directory. It is likely that the dataset for this task is actually 'processed_data_all_labels.csv'. I will load this file into a DataFrame and display its head and info to confirm its structure.



In [7]:
# Load the dataset 'processed_data_all_labels.csv'
df = pd.read_csv('processed_data_all_labels.csv')

# Display the first few rows
display(df.head())

# Display column names and data types
display(df.info())

Unnamed: 0,id,lang,text,task4_hard,task4_soft,task5_hard,task5_soft,task6_hard,task6_soft
0,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,"[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.1667, 0.0, 0.0, 0.1667]"
1,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,"[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.8333, 0.1667]","[1.0, 1.0, 0.0, 0.0, 0.0]","[0.6667, 0.6667, 0.0, 0.0, 0.1667]"
2,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,"[0.0, 1.0]","[0.3333, 0.6667]","[1.0, 0.0]","[0.3333, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.1667, 0.1667, 0.0, 0.3333]"
3,110004,es,"Paises que ""apoyan"" los derechos de la mujer A...","[1.0, 0.0]","[0.5, 0.5]","[0.0, 1.0]","[0.0, 0.5]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.3333, 0.1667, 0.0, 0.0, 0.0]"
4,110005,es,Ya verás como este 8 de marzo hay uno que te s...,"[0.0, 1.0]","[0.3333, 0.6667]","[0.0, 1.0]","[0.1667, 0.1667]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.3333, 0.0, 0.0, 0.0, 0.0]"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4044 entries, 0 to 4043
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          4044 non-null   int64 
 1   lang        4044 non-null   object
 2   text        4044 non-null   object
 3   task4_hard  4044 non-null   object
 4   task4_soft  4044 non-null   object
 5   task5_hard  4044 non-null   object
 6   task5_soft  4044 non-null   object
 7   task6_hard  4044 non-null   object
 8   task6_soft  4044 non-null   object
dtypes: int64(1), object(8)
memory usage: 284.5+ KB


None

**Reasoning**:
The previous step successfully loaded the data into a DataFrame. The 'task4_hard' column, which is relevant to the subtask, is of type 'object' and contains list-like strings. To use this column for analysis, it needs to be converted to a more usable format. I will convert the 'task4_hard' column from string representation of a list to an actual list of floats, then extract the first element of each list, which represents the label for task4_hard, and convert it to an integer type for further analysis.



In [8]:
import ast

# Convert the 'task4_hard' column from string representation of list to actual list
df['task4_hard'] = df['task4_hard'].apply(ast.literal_eval)

# Extract the first element of the list and convert to integer
df['task4_hard_label'] = df['task4_hard'].apply(lambda x: int(x[0]))

# Display the first few rows with the new column
display(df[['task4_hard', 'task4_hard_label']].head())

# Display the data type of the new column
display(df['task4_hard_label'].info())

Unnamed: 0,task4_hard,task4_hard_label
0,"[1.0, 0.0]",1
1,"[1.0, 0.0]",1
2,"[0.0, 1.0]",0
3,"[1.0, 0.0]",1
4,"[0.0, 1.0]",0


<class 'pandas.core.series.Series'>
RangeIndex: 4044 entries, 0 to 4043
Series name: task4_hard_label
Non-Null Count  Dtype
--------------  -----
4044 non-null   int64
dtypes: int64(1)
memory usage: 31.7 KB


None

**Reasoning**:
The 'task4_hard_label' column has been successfully created and converted to an integer type. Now, to understand the distribution of labels in the 'task4_hard' column as required by the subtask, I will calculate and display the value counts for the 'task4_hard_label' column.



In [9]:
# Calculate and display the value counts for the 'task4_hard_label' column
label_counts = df['task4_hard_label'].value_counts()
print("Value counts for 'task4_hard_label':")
print(label_counts)

Value counts for 'task4_hard_label':
task4_hard_label
1    2324
0    1720
Name: count, dtype: int64


In [10]:
# Add more regularization and early stopping
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from sentence_transformers import SentenceTransformer
from PIL import Image
from tqdm import tqdm
import ast
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification # Import AutoTokenizer and AutoModelForSequenceClassification


class MultimodalClassifier(nn.Module):
    def __init__(self, num_classes=1):
        super(MultimodalClassifier, self).__init__()
        self.img_model = SentenceTransformer('clip-ViT-B-32')
        # Use the specified text model
        self.tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
        self.text_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")


        embedding_dim = 512  # CLIP default dimension
        # Get the correct text model embedding dimension.
        # For AutoModelForSequenceClassification, we'll use the hidden size of the base model.
        text_embedding_dim = self.text_model.base_model.config.hidden_size
        combined_embedding_dim = embedding_dim + text_embedding_dim


        # Enhanced classifier with more regularization
        self.classifier = nn.Sequential(
            nn.Linear(combined_embedding_dim, 512), # Adjust input dimension for combined embeddings
            nn.ReLU(),
            nn.BatchNorm1d(512),  # Add Batch Normalization
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.4),  # More Dropout
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, images, texts):
        with torch.no_grad():
            img_embeddings = self.img_model.encode(
                images,
                convert_to_tensor=True,
                device=self._get_device(),
                show_progress_bar=False,
                batch_size=len(images)
            )

            # Process text with the specified model
            encoded_texts = self.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(self._get_device())
            # Get the embeddings from the base model's last hidden state
            text_embeddings = self.text_model.base_model(**encoded_texts).last_hidden_state.mean(dim=1)


        combined_embeddings = torch.cat((img_embeddings, text_embeddings), dim=1)
        logits = self.classifier(combined_embeddings)
        return logits

    def _get_device(self):
        return next(self.parameters()).device

# Dataset and collate_fn remain unchanged
class MemeDataset(Dataset):
    def __init__(self, csv_file, image_dir):
        self.dataframe = pd.read_csv(csv_file)
        self.image_dir = image_dir

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_id = str(self.dataframe.iloc[idx]['id'])
        possible_extensions = ['.jpeg', '.jpg', '.png', '.JPEG', '.JPG', '.PNG']

        img_path = None
        for ext in possible_extensions:
            temp_path = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(temp_path):
                img_path = temp_path
                break

        if img_path is None:
            raise FileNotFoundError(f"Image not found: {img_id}")

        image = Image.open(img_path).convert("RGB")
        text = self.dataframe.iloc[idx]['text']

        label_str = self.dataframe.iloc[idx]['task4_hard']
        label_vec = ast.literal_eval(label_str)
        label_idx = label_vec.index(1.0)
        label = torch.tensor(label_idx, dtype=torch.float)

        return {'image': image, 'text': text, 'label': label}

def custom_collate_fn(batch):
    images = [item['image'] for item in batch]
    texts = [item['text'] for item in batch]
    labels = torch.stack([item['label'] for item in batch])

    return {
        'image': images,
        'text': texts,
        'label': labels
    }

# Training settings
EPOCHS = 10  # Increase epochs, use early stopping
LEARNING_RATE = 1e-4  # Reduce learning rate
BATCH_SIZE = 16
PATIENCE = 3  # Early stopping patience

CSV_PATH = 'processed_data_all_labels.csv'
IMAGE_DIR = '/content/memes/' # Updated IMAGE_DIR to the extraction location

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Data splitting 8:1:1
full_dataset = MemeDataset(csv_file=CSV_PATH, image_dir=IMAGE_DIR)
dataset_size = len(full_dataset)
train_size = int(0.8 * dataset_size)
val_size = int(0.1 * dataset_size)
test_size = dataset_size - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    full_dataset, [train_size, val_size, test_size]
)

print(f"Train set: {train_size}, Validation set: {val_size}, Test set: {test_size}")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=custom_collate_fn)

# Model and optimizer
model = MultimodalClassifier(num_classes=1).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)  # Add weight decay

# Early stopping mechanism
best_val_loss = float('inf')
patience_counter = 0

# Training loop
for epoch in range(EPOCHS):
    print(f"\n--- Epoch {epoch+1}/{EPOCHS} ---")

    # Training phase
    model.train()
    progress_bar = tqdm(train_loader, desc="Training")
    epoch_train_loss = 0

    for batch_idx, batch in enumerate(progress_bar):
        images = batch['image']
        texts = batch['text']
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(images, texts)
        loss = criterion(outputs.squeeze(1), labels)
        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()
        progress_bar.set_postfix({
            'loss': loss.item(),
            'avg_loss': epoch_train_loss / (batch_idx + 1)
        })

    # Validation phase
    model.eval()
    val_progress = tqdm(val_loader, desc="Validation")
    epoch_val_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_progress:
            images = batch['image']
            texts = batch['text']
            labels = batch['label'].to(device)

            outputs = model(images, texts)
            loss = criterion(outputs.squeeze(1), labels)
            epoch_val_loss += loss.item()

            probs = torch.sigmoid(outputs.squeeze(1))
            preds = (probs > 0.5).float()
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_train_loss = epoch_train_loss / len(train_loader)
    avg_val_loss = epoch_val_loss / len(val_loader)
    val_accuracy = correct / total

    print(f"Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), 'best_model.pth')
        print("✅ Saved best model")
    else:
        patience_counter += 1
        print(f"⚠️ Validation loss did not improve ({patience_counter}/{PATIENCE})")

        if patience_counter >= PATIENCE:
            print("🛑 Early stopping triggered, stopping training")
            break

# Load best model for testing
print("\n--- Testing Phase ---")
model.load_state_dict(torch.load('best_model.pth'))
model.eval()

test_correct = 0
test_total = 0

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        images = batch['image']
        texts = batch['text']
        labels = batch['label'].to(device)

        outputs = model(images, texts)
        probs = torch.sigmoid(outputs.squeeze(1))
        preds = (probs > 0.5).float()

        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

test_accuracy = test_correct / test_total
print(f"\n🎯 Final test accuracy: {test_accuracy:.4f}")

Using device: cuda
Train set: 3235, Validation set: 404, Test set: 405


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

0_CLIPModel/pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

0_CLIPModel/model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]


--- Epoch 1/10 ---



Training:   0%|          | 0/203 [00:00<?, ?it/s][AAsking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.

Training:   0%|          | 0/203 [00:05<?, ?it/s, loss=0.653, avg_loss=0.653][A
Training:   0%|          | 1/203 [00:05<17:04,  5.07s/it, loss=0.653, avg_loss=0.653][A
Training:   0%|          | 1/203 [00:05<17:04,  5.07s/it, loss=0.707, avg_loss=0.68] [A
Training:   1%|          | 2/203 [00:05<08:40,  2.59s/it, loss=0.707, avg_loss=0.68][A
Training:   1%|          | 2/203 [00:08<08:40,  2.59s/it, loss=0.67, avg_loss=0.677][A
Training:   1%|▏         | 3/203 [00:08<08:16,  2.48s/it, loss=0.67, avg_loss=0.677][A
Training:   1%|▏         | 3/203 [00:10<08:16,  2.48s/it, loss=0.593, avg_loss=0.656][A
Training:   2%|▏         | 4/203 [00:10<07:45,  2.34s/it, loss=0.593, avg_loss=0.656][A
Training:   2%|▏         | 4/203 [00:10<07:45,  2.34s/it, loss=0.723, avg_loss=0.669][A
Training:   2%|

Train Loss: 0.6759, Val Loss: 0.6561, Val Acc: 0.6337
✅ Saved best model

--- Epoch 2/10 ---


Training: 100%|██████████| 203/203 [01:46<00:00,  1.90it/s, loss=0.521, avg_loss=0.64]
Validation: 100%|██████████| 26/26 [00:10<00:00,  2.44it/s]


Train Loss: 0.6398, Val Loss: 0.6429, Val Acc: 0.6485
✅ Saved best model

--- Epoch 3/10 ---


Training: 100%|██████████| 203/203 [01:29<00:00,  2.26it/s, loss=0.804, avg_loss=0.628]
Validation: 100%|██████████| 26/26 [00:10<00:00,  2.57it/s]


Train Loss: 0.6276, Val Loss: 0.6410, Val Acc: 0.6485
✅ Saved best model

--- Epoch 4/10 ---


Training: 100%|██████████| 203/203 [01:27<00:00,  2.31it/s, loss=0.674, avg_loss=0.604]
Validation: 100%|██████████| 26/26 [00:10<00:00,  2.55it/s]


Train Loss: 0.6042, Val Loss: 0.6339, Val Acc: 0.6361
✅ Saved best model

--- Epoch 5/10 ---


Training: 100%|██████████| 203/203 [01:28<00:00,  2.30it/s, loss=0.879, avg_loss=0.596]
Validation: 100%|██████████| 26/26 [00:09<00:00,  2.67it/s]


Train Loss: 0.5957, Val Loss: 0.6451, Val Acc: 0.6460
⚠️ Validation loss did not improve (1/3)

--- Epoch 6/10 ---


Training: 100%|██████████| 203/203 [01:30<00:00,  2.25it/s, loss=0.903, avg_loss=0.584]
Validation: 100%|██████████| 26/26 [00:09<00:00,  2.67it/s]


Train Loss: 0.5838, Val Loss: 0.6523, Val Acc: 0.6287
⚠️ Validation loss did not improve (2/3)

--- Epoch 7/10 ---


Training: 100%|██████████| 203/203 [01:27<00:00,  2.31it/s, loss=0.145, avg_loss=0.561]
Validation: 100%|██████████| 26/26 [00:09<00:00,  2.64it/s]


Train Loss: 0.5607, Val Loss: 0.6630, Val Acc: 0.6238
⚠️ Validation loss did not improve (3/3)
🛑 Early stopping triggered, stopping training

--- Testing Phase ---


Testing: 100%|██████████| 26/26 [00:11<00:00,  2.25it/s]


🎯 Final test accuracy: 0.6420





# Task
Use the trained multimodal model to extract features, then train and evaluate a fully connected layer model and a LightGBM model on the extracted features.

## Extract features

### Subtask:
Use the trained multimodal model to extract features (combined image and text embeddings) for the training and test datasets.


**Reasoning**:
I will set the model to evaluation mode and iterate through the training and test DataLoaders to extract the combined image and text embeddings before the classifier layer, storing them and their corresponding labels in lists. I will then convert these lists to PyTorch tensors.



In [11]:
# Set the model to evaluation mode
model.eval()

# Initialize lists to store features and labels
train_features_list = []
train_labels_list = []
test_features_list = []
test_labels_list = []

# Extract features and labels from the training set
print("Extracting features from training set...")
with torch.no_grad():
    for batch in tqdm(train_loader, desc="Training Feature Extraction"):
        images = batch['image']
        texts = batch['text']
        labels = batch['label'].to(device)

        # Get image embeddings
        img_embeddings = model.img_model.encode(
            images,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False,
            batch_size=len(images)
        )

        # Get text embeddings from the base model's last hidden state
        encoded_texts = model.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        text_embeddings = model.text_model.base_model(**encoded_texts).last_hidden_state.mean(dim=1)

        # Combine embeddings
        combined_embeddings = torch.cat((img_embeddings, text_embeddings), dim=1)

        train_features_list.append(combined_embeddings.cpu())
        train_labels_list.append(labels.cpu())

# Extract features and labels from the test set
print("Extracting features from test set...")
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing Feature Extraction"):
        images = batch['image']
        texts = batch['text']
        labels = batch['label'].to(device)

        # Get image embeddings
        img_embeddings = model.img_model.encode(
            images,
            convert_to_tensor=True,
            device=device,
            show_progress_bar=False,
            batch_size=len(images)
        )

        # Get text embeddings from the base model's last hidden state
        encoded_texts = model.tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        text_embeddings = model.text_model.base_model(**encoded_texts).last_hidden_state.mean(dim=1)

        # Combine embeddings
        combined_embeddings = torch.cat((img_embeddings, text_embeddings), dim=1)


        test_features_list.append(combined_embeddings.cpu())
        test_labels_list.append(labels.cpu())

# Concatenate features and labels into tensors
train_features = torch.cat(train_features_list, dim=0)
train_labels = torch.cat(train_labels_list, dim=0)
test_features = torch.cat(test_features_list, dim=0)
test_labels = torch.cat(test_labels_list, dim=0)

print("\nShape of extracted training features:", train_features.shape)
print("Shape of extracted training labels:", train_labels.shape)
print("Shape of extracted test features:", test_features.shape)
print("Shape of extracted test labels:", test_labels.shape)

print("✅ Feature extraction complete.")

Extracting features from training set...


Training Feature Extraction: 100%|██████████| 203/203 [01:26<00:00,  2.36it/s]


Extracting features from test set...


Testing Feature Extraction: 100%|██████████| 26/26 [00:11<00:00,  2.32it/s]


Shape of extracted training features: torch.Size([3235, 1280])
Shape of extracted training labels: torch.Size([3235])
Shape of extracted test features: torch.Size([405, 1280])
Shape of extracted test labels: torch.Size([405])
✅ Feature extraction complete.





## Fully connected layer model

### Subtask:
Define and train a simple fully connected layer model on the extracted features from the training set.


**Reasoning**:
Define a simple fully connected layer model, instantiate it, move it to the correct device, define the loss function and optimizer, and then train it for a fixed number of epochs, monitoring progress with tqdm.



In [12]:
# 1. Define a simple neural network model
class SimpleFC(nn.Module):
    def __init__(self, input_dim, num_classes=1):
        super(SimpleFC, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Get the input dimension from the extracted training features
input_dim = train_features.shape[1]

# 2. Instantiate the defined model and move it to the appropriate device
simple_fc_model = SimpleFC(input_dim=input_dim).to(device)
print(f"✅ SimpleFC model instantiated and moved to {device}.")

# 3. Define a loss function
criterion_fc = nn.BCEWithLogitsLoss()
print("✅ Loss function (BCEWithLogitsLoss) defined.")

# 4. Define an optimizer
optimizer_fc = torch.optim.Adam(simple_fc_model.parameters(), lr=0.001)
print("✅ Optimizer (Adam) defined.")

# 5. Train the fully connected model
EPOCHS_FC = 20 # Define number of epochs for FC model

print(f"\n--- Training SimpleFC Model for {EPOCHS_FC} epochs ---")

# Create a DataLoader for the extracted features and labels for easier batching
train_data = torch.utils.data.TensorDataset(train_features.to(device), train_labels.to(device))
train_loader_fc = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS_FC):
    simple_fc_model.train()
    epoch_loss = 0
    progress_bar = tqdm(train_loader_fc, desc=f"Epoch {epoch+1}/{EPOCHS_FC} [Training FC]")

    for features, labels in progress_bar:
        optimizer_fc.zero_grad()
        outputs = simple_fc_model(features)
        loss = criterion_fc(outputs.squeeze(1), labels)
        loss.backward()
        optimizer_fc.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item(), 'avg_loss': epoch_loss / (progress_bar.n + 1)})

    avg_epoch_loss = epoch_loss / len(train_loader_fc)
    print(f"Epoch {epoch+1}/{EPOCHS_FC} - Average Training Loss: {avg_epoch_loss:.4f}")

print("\n✅ SimpleFC model training complete.")

✅ SimpleFC model instantiated and moved to cuda.
✅ Loss function (BCEWithLogitsLoss) defined.
✅ Optimizer (Adam) defined.

--- Training SimpleFC Model for 20 epochs ---


Epoch 1/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 328.47it/s, loss=1.26, avg_loss=0.66]


Epoch 1/20 - Average Training Loss: 0.6569


Epoch 2/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 346.35it/s, loss=0.809, avg_loss=0.699]


Epoch 2/20 - Average Training Loss: 0.6093


Epoch 3/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 332.32it/s, loss=0.411, avg_loss=0.68]


Epoch 3/20 - Average Training Loss: 0.5797


Epoch 4/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 326.12it/s, loss=0.293, avg_loss=0.657]


Epoch 4/20 - Average Training Loss: 0.5532


Epoch 5/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 320.01it/s, loss=0.359, avg_loss=0.546]


Epoch 5/20 - Average Training Loss: 0.5324


Epoch 6/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 318.39it/s, loss=0.579, avg_loss=0.513]


Epoch 6/20 - Average Training Loss: 0.4977


Epoch 7/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 325.38it/s, loss=0.426, avg_loss=0.47]


Epoch 7/20 - Average Training Loss: 0.4702


Epoch 8/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 318.67it/s, loss=1.31, avg_loss=0.45]


Epoch 8/20 - Average Training Loss: 0.4323


Epoch 9/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 315.67it/s, loss=0.253, avg_loss=0.477]


Epoch 9/20 - Average Training Loss: 0.4091


Epoch 10/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 323.47it/s, loss=0.258, avg_loss=0.417]


Epoch 10/20 - Average Training Loss: 0.3571


Epoch 11/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 315.61it/s, loss=0.0773, avg_loss=0.328]


Epoch 11/20 - Average Training Loss: 0.3199


Epoch 12/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 325.23it/s, loss=0.49, avg_loss=0.278]


Epoch 12/20 - Average Training Loss: 0.2780


Epoch 13/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 287.40it/s, loss=0.353, avg_loss=0.258]


Epoch 13/20 - Average Training Loss: 0.2391


Epoch 14/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 323.66it/s, loss=0.215, avg_loss=0.201]


Epoch 14/20 - Average Training Loss: 0.2010


Epoch 15/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 250.11it/s, loss=0.159, avg_loss=0.195]


Epoch 15/20 - Average Training Loss: 0.1702


Epoch 16/20 [Training FC]: 100%|██████████| 203/203 [00:01<00:00, 123.55it/s, loss=0.12, avg_loss=0.155]


Epoch 16/20 - Average Training Loss: 0.1487


Epoch 17/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 213.46it/s, loss=0.0307, avg_loss=0.138]


Epoch 17/20 - Average Training Loss: 0.1198


Epoch 18/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 250.30it/s, loss=0.148, avg_loss=0.0994]


Epoch 18/20 - Average Training Loss: 0.0954


Epoch 19/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 215.97it/s, loss=0.0667, avg_loss=0.0919]


Epoch 19/20 - Average Training Loss: 0.0815


Epoch 20/20 [Training FC]: 100%|██████████| 203/203 [00:00<00:00, 240.07it/s, loss=0.0444, avg_loss=0.0637]

Epoch 20/20 - Average Training Loss: 0.0621

✅ SimpleFC model training complete.





**Reasoning**:
The first step is to load the data from the CSV file into a pandas DataFrame and display the first few rows to understand its structure.



In [13]:
import pandas as pd

# Load the dataset 'processed_data_all_labels.csv' as it was found in the content directory
df = pd.read_csv('processed_data_all_labels.csv')
display(df.head())

Unnamed: 0,id,lang,text,task4_hard,task4_soft,task5_hard,task5_soft,task6_hard,task6_soft
0,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,"[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0, 0.0, 0.0, 0.0]","[1.0, 0.1667, 0.0, 0.0, 0.1667]"
1,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,"[1.0, 0.0]","[1.0, 0.0]","[1.0, 0.0]","[0.8333, 0.1667]","[1.0, 1.0, 0.0, 0.0, 0.0]","[0.6667, 0.6667, 0.0, 0.0, 0.1667]"
2,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,"[0.0, 1.0]","[0.3333, 0.6667]","[1.0, 0.0]","[0.3333, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.1667, 0.1667, 0.0, 0.3333]"
3,110004,es,"Paises que ""apoyan"" los derechos de la mujer A...","[1.0, 0.0]","[0.5, 0.5]","[0.0, 1.0]","[0.0, 0.5]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.3333, 0.1667, 0.0, 0.0, 0.0]"
4,110005,es,Ya verás como este 8 de marzo hay uno que te s...,"[0.0, 1.0]","[0.3333, 0.6667]","[0.0, 1.0]","[0.1667, 0.1667]","[0.0, 0.0, 0.0, 0.0, 0.0]","[0.3333, 0.0, 0.0, 0.0, 0.0]"


In [16]:
# Install LightGBM
!pip install lightgbm



In [17]:
import lightgbm as lgb

# Train LightGBM model
print("\n--- Training LightGBM Model ---")

# LightGBM requires data in numpy arrays
lgb_train_features = train_features.cpu().numpy()
lgb_train_labels = train_labels.cpu().numpy()

# Initialize and train the LightGBM classifier
lgb_clf = lgb.LGBMClassifier(objective='binary', metric='binary_logloss', random_state=42)
lgb_clf.fit(lgb_train_features, lgb_train_labels)

print("✅ LightGBM model training complete.")


--- Training LightGBM Model ---
[LightGBM] [Info] Number of positive: 1393, number of negative: 1842
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.129745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 3235, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.430603 -> initscore=-0.279392
[LightGBM] [Info] Start training from score -0.279392
✅ LightGBM model training complete.


In [18]:
from sklearn.metrics import accuracy_score

# Evaluate LightGBM model on the test set
print("\n--- Evaluating LightGBM Model on Test Set ---")

# LightGBM requires data in numpy arrays
lgb_test_features = test_features.cpu().numpy()
lgb_test_labels = test_labels.cpu().numpy()

# Make predictions on the test set
lgb_preds = lgb_clf.predict(lgb_test_features)

# Calculate accuracy
lgb_accuracy = accuracy_score(lgb_test_labels, lgb_preds)

print(f"\n🎯 Final test accuracy for LightGBM model: {lgb_accuracy:.4f}")


--- Evaluating LightGBM Model on Test Set ---

🎯 Final test accuracy for LightGBM model: 0.6469




In [15]:
# Evaluate the Fully Connected Layer model on the test set
print("\n--- Evaluating SimpleFC Model on Test Set ---")

simple_fc_model.eval() # Set the model to evaluation mode
test_correct_fc = 0
test_total_fc = 0

# Create a DataLoader for the extracted test features and labels
test_data_fc = torch.utils.data.TensorDataset(test_features.to(device), test_labels.to(device))
test_loader_fc = DataLoader(test_data_fc, batch_size=BATCH_SIZE, shuffle=False)

with torch.no_grad():
    for features, labels in tqdm(test_loader_fc, desc="Evaluating FC"):
        outputs = simple_fc_model(features)
        probs = torch.sigmoid(outputs.squeeze(1))
        preds = (probs > 0.5).float()

        test_correct_fc += (preds == labels).sum().item()
        test_total_fc += labels.size(0)

test_accuracy_fc = test_correct_fc / test_total_fc
print(f"\n🎯 Final test accuracy for SimpleFC model: {test_accuracy_fc:.4f}")


--- Evaluating SimpleFC Model on Test Set ---


Evaluating FC: 100%|██████████| 26/26 [00:00<00:00, 1612.12it/s]


🎯 Final test accuracy for SimpleFC model: 0.6444



