In [1]:
# Install all dependencies with specific versions for compatibility
!pip uninstall torch torchvision transformers numpy -y  # Clean uninstall first
!pip install numpy==1.26.4  # Install NumPy first to avoid conflicts
!pip install torch==2.1.2 torchvision==0.16.2
!pip install transformers==4.36.2 datasets scikit-learn pandas kaggle tqdm pillow




Found existing installation: torch 2.1.2
Uninstalling torch-2.1.2:
  Successfully uninstalled torch-2.1.2
Found existing installation: torchvision 0.16.2
Uninstalling torchvision-0.16.2:
  Successfully uninstalled torchvision-0.16.2
Found existing installation: transformers 4.36.2
Uninstalling transformers-4.36.2:
  Successfully uninstalled transformers-4.36.2
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run

In [2]:
train_annotations_path = "archive/annots_arrs/annot_arrs_train.csv"
val_annotations_path = "archive/annots_arrs/annot_arrs_val.csv"
img_dir = "archive/img_arrs/"
model_path = "best_vit_emotic.pth"



In [3]:
# @title
import os
import torch
import transformers
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoModelForImageClassification, AutoImageProcessor
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from transformers import ViTForImageClassification
from torch import nn, optim


# Set random seed for reproducibility
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        # Set seed for MPS (Apple Silicon)
        torch.mps.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 464
seed_everything(SEED)

# Set device
if torch.backends.mps.is_available():
        device = torch.device("mps")
        print("Using MPS (Metal Performance Shaders) for acceleration")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA for acceleration")
else:
    device = torch.device("cpu")
    print("Using CPU")

print(f"Using device: {device}")




Using MPS (Metal Performance Shaders) for acceleration
Using device: mps


In [4]:


def parse_annotations(csv_path):
    # Load the CSV file
    df = pd.read_csv(csv_path)

    # Select only numeric category columns (9:35 worked assumed)
    category_columns = df.columns[8:34]

    # Debugging: Check if the selected columns are numeric
    print("Selected category columns:", category_columns)
    print("Column types:", df[category_columns].dtypes)

    # Ensure all data in category columns is numeric
    for col in category_columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column {col} contains non-numeric data.")

    # Calculate class counts
    class_counts = df[category_columns].sum().to_numpy(dtype=np.float32)

    # Parse annotations
    annotations = []
    for _, row in df.iterrows():
        categories = [int(idx) for idx, val in enumerate(row[category_columns]) if val == 1]
        annotation = {"filename": row["Crop_name"], "categories": categories}
        annotations.append(annotation)

    return annotations, class_counts


class EMOTICDataset(torch.utils.data.Dataset):
    def __init__(self, annotations, img_dir, feature_extractor, num_categories=26):
        self.annotations = annotations
        self.img_dir = img_dir
        self.feature_extractor = feature_extractor
        self.num_categories = num_categories

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        entry = self.annotations[idx]
        img_path = os.path.join(self.img_dir, entry['filename'])

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        # Load the image and ensure it's RGB
        image = np.load(img_path)
        if len(image.shape) == 2:  # Grayscale image
            image = np.stack([image] * 3, axis=-1)

        # Preprocess the image
        inputs = self.feature_extractor(images=image, return_tensors="pt", antialias=True)
        inputs = {key: val.squeeze(0).to(device) for key, val in inputs.items()}  # Move to device

        # Multi-hot encoding for labels
        categories = torch.zeros(self.num_categories, dtype=torch.float32).to(device)
        for category in entry['categories']:
            if category < self.num_categories:
                categories[category] = 1.0

        # # Debug labels for this sample
        # print("Categories (multi-hot encoding):", categories.cpu().numpy())

        inputs["labels"] = categories
        return inputs

In [5]:


class CustomViTForImageClassification(ViTForImageClassification):
    def forward(
        self,
        pixel_values,
        head_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        interpolate_pos_encoding=None,
        return_dict=None
    ):
        # Outputs from the ViT model
        outputs = self.vit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        print(f"sequence_output device: {sequence_output.device}")
        print(f"classifier device: {next(self.classifier.parameters()).device}")

        # Ensure sequence_output[:, 0, :] is on the same device as the classifier
        sequence_output = sequence_output.to(next(self.classifier.parameters()).device)

        logits = self.classifier(sequence_output[:, 0, :])
        return logits


In [6]:

class FocalLoss(nn.Module):
    def __init__(self, gamma=2, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        ce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')

        # Modulation
        p_t = probs * targets + (1 - probs) * (1 - targets)
        loss = ce_loss * ((1 - p_t) ** self.gamma)

        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            loss = alpha_t * loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss


In [7]:
def calculate_dynamic_thresholds(model, val_loader, device):
    """
    Calculate optimal thresholds for each label based on F1 score.
    """
    model.eval()
    all_targets = []
    all_outputs = []

    with torch.no_grad():
        for batch in val_loader:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            logits = model(images).logits
            probabilities = torch.sigmoid(logits).cpu().numpy()  # Apply sigmoid to logits
            all_targets.extend(labels.cpu().numpy())
            all_outputs.extend(probabilities)

    all_targets = np.vstack(all_targets)
    all_outputs = np.vstack(all_outputs)

    # Adjust thresholds to consider the low logit values
    thresholds = []
    for i in range(all_targets.shape[1]):
        best_threshold = 0.1
        best_f1 = 0
        print(f"Processing Class {i}")  # Debugging class-level processing
        for threshold in np.arange(0.001, 0.2, 0.01):  # Reduced upper bound of thresholds
            preds = (all_outputs[:, i] > threshold).astype(int)
            precision, recall, f1, _ = precision_recall_fscore_support(
                all_targets[:, i], preds, average="binary", zero_division=0
            )
            print(f"Threshold: {threshold:.2f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        thresholds.append(best_threshold)
        print(f"Best Threshold for Class {i}: {best_threshold}, Best F1: {best_f1:.4f}")
    # Log the thresholds for debugging
    print(f"Dynamic Thresholds: {thresholds}")
    return np.array(thresholds)


In [8]:

def validate_model(model, val_loader, device, thresholds=None):
    """
    Validate the model on the validation dataset and compute metrics.
    """
    model.eval()
    all_targets = []
    all_predictions = []
    
    with torch.no_grad():
        for batch in val_loader:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            
            logits = model(images).logits
            probabilities = torch.sigmoid(logits).cpu().numpy()  # Apply sigmoid
            
            # Use dynamic thresholds if provided
            if thresholds is not None:
                predictions = (probabilities > thresholds).astype(int)
            else:
                predictions = (probabilities > 0.1).astype(int)
            
            all_targets.extend(labels.cpu().numpy())
            all_predictions.extend(predictions)
    
    all_targets = np.vstack(all_targets)
    all_predictions = np.vstack(all_predictions)
    
    # Calculate metrics
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_targets, all_predictions, average="macro", zero_division=0
    )
    accuracy = np.mean(np.equal(all_targets, all_predictions).all(axis=1))
    
    print(f"Validation Metrics - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
    
    return precision, recall, f1, accuracy


def train_model_with_dynamic_weights(
    model, train_loader, val_loader, optimizer, num_epochs, device, model_path, class_counts
):
    """
    Train model with Mac-compatible mixed precision training.
    Supports both MPS (Mac) and CPU fallback.
    """
    
    # Setup device-specific configurations
    use_mps = device.type == 'mps'
    use_cpu = device.type == 'cpu'
    
    # Prepare class weights
    class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float32, device=device)
    criterion_bce = nn.BCEWithLogitsLoss(pos_weight=class_weights)
    criterion_focal = FocalLoss(gamma=2, alpha=class_weights)
    
    # Mixed precision training setup
    scaler = None
    if not use_mps and not use_cpu and torch.cuda.is_available():
        # Only use GradScaler for CUDA
        scaler = torch.cuda.amp.GradScaler()
    
    scheduler = StepLR(optimizer, step_size=3, gamma=0.1)
    best_f1 = 0.0
    auxiliary_weight = 0.5
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_loader_iter = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        
        for batch in train_loader_iter:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            
            model.to(device)
            optimizer.zero_grad()
            
            # Mixed precision forward pass
            if use_mps:
                # For MPS: no autocast needed (MPS handles optimization internally)
                logits = model(images).logits
                loss_bce = criterion_bce(logits, labels)
                loss_focal = criterion_focal(logits, labels)
                total_loss_batch = loss_bce + auxiliary_weight * loss_focal
                
                # Standard backward pass for MPS
                total_loss_batch.backward()
                optimizer.step()
                
            elif use_cpu:
                # For CPU: use autocast if bfloat16 is available
                if torch.cuda.is_bf16_supported():
                    with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
                        logits = model(images).logits
                        loss_bce = criterion_bce(logits, labels)
                        loss_focal = criterion_focal(logits, labels)
                        total_loss_batch = loss_bce + auxiliary_weight * loss_focal
                else:
                    # No autocast for CPU without bfloat16
                    logits = model(images).logits
                    loss_bce = criterion_bce(logits, labels)
                    loss_focal = criterion_focal(logits, labels)
                    total_loss_batch = loss_bce + auxiliary_weight * loss_focal
                
                # Standard backward pass for CPU
                total_loss_batch.backward()
                optimizer.step()
                
            else:
                # CUDA path (if you ever need it)
                with torch.cuda.amp.autocast():
                    logits = model(images).logits
                    loss_bce = criterion_bce(logits, labels)
                    loss_focal = criterion_focal(logits, labels)
                    total_loss_batch = loss_bce + auxiliary_weight * loss_focal
                
                scaler.scale(total_loss_batch).backward()
                scaler.step(optimizer)
                scaler.update()
            
            train_loader_iter.set_postfix(loss=total_loss_batch.item())
            total_loss += total_loss_batch.item()
        
        scheduler.step()
        
        # Validation
        precision, recall, f1, accuracy = validate_model(model, val_loader, device)
        avg_loss = total_loss / len(train_loader)
        
        print(
            f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}"
        )
        
        # Debug auxiliary weight
        print(f"Auxiliary Weight Before Adjustment: {auxiliary_weight}")
        
        if f1 < 0.7:
            auxiliary_weight = 0.7
        elif f1 > 0.9:
            auxiliary_weight = 0.3
        else:
            auxiliary_weight = 0.5
        
        print(f"Auxiliary Weight After Adjustment: {auxiliary_weight}")
        
        # Save best model
        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), model_path)
            print(f"Model saved with F1 score: {best_f1:.4f}")



In [9]:


# Main Script
if __name__ == "__main__":
    # train_annotations_path = "/content/emotic_data/annots_arrs/annot_arrs_train.csv"
    # val_annotations_path = "/content/emotic_data/annots_arrs/annot_arrs_val.csv"
    # img_dir = "/content/emotic_data/img_arrs/"
    # model_path = "best_vit_emotic.pth"

    batch_size = 16
    num_epochs = 10
    learning_rate = 1e-4
    num_classes = 26

    feature_extractor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=True)

    train_annotations, class_counts = parse_annotations(train_annotations_path)
    val_annotations, _ = parse_annotations(val_annotations_path)

    train_dataset = EMOTICDataset(train_annotations, img_dir, feature_extractor, num_categories=num_classes)
    val_dataset = EMOTICDataset(val_annotations, img_dir, feature_extractor, num_categories=num_classes)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = AutoModelForImageClassification.from_pretrained(
        "google/vit-base-patch16-224",
        ignore_mismatched_sizes=True
    ).to(device)

    model.classifier = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(model.config.hidden_size, num_classes)
    )

    for param in model.vit.parameters():
        param.requires_grad = False

    for param in model.classifier.parameters():
        param.requires_grad = True

    optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)

    print("Training the model...")
    train_model_with_dynamic_weights(
        model, train_loader, val_loader, optimizer, num_epochs, device, model_path, class_counts
    )

    thresholds = calculate_dynamic_thresholds(model, val_loader, device)
    validate_model(model, val_loader, device, thresholds=thresholds)




Selected category columns: Index(['Peace', 'Affection', 'Esteem', 'Anticipation', 'Engagement',
       'Confidence', 'Happiness', 'Pleasure', 'Excitement', 'Surprise',
       'Sympathy', 'Doubt/Confusion', 'Disconnection', 'Fatigue',
       'Embarrassment', 'Yearning', 'Disapproval', 'Aversion', 'Annoyance',
       'Anger', 'Sensitivity', 'Sadness', 'Disquietment', 'Fear', 'Pain',
       'Suffering'],
      dtype='object')
Column types: Peace              float64
Affection          float64
Esteem             float64
Anticipation       float64
Engagement         float64
Confidence         float64
Happiness          float64
Pleasure           float64
Excitement         float64
Surprise           float64
Sympathy           float64
Doubt/Confusion    float64
Disconnection      float64
Fatigue            float64
Embarrassment      float64
Yearning           float64
Disapproval        float64
Aversion           float64
Annoyance          float64
Anger              float64
Sensitivity        



Training the model...


Epoch 1/10: 100%|██████████| 1540/1540 [14:42<00:00,  1.75it/s, loss=0.0151] 


Validation Metrics - Precision: 0.2470, Recall: 0.0108, F1: 0.0194, Accuracy: 0.0000
Epoch 1/10, Loss: 0.0963, F1: 0.0194, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.5
Auxiliary Weight After Adjustment: 0.7
Model saved with F1 score: 0.0194


Epoch 2/10: 100%|██████████| 1540/1540 [16:00<00:00,  1.60it/s, loss=0.00817]


Validation Metrics - Precision: 0.2179, Recall: 0.0005, F1: 0.0009, Accuracy: 0.0000
Epoch 2/10, Loss: 0.0094, F1: 0.0009, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 3/10: 100%|██████████| 1540/1540 [16:18<00:00,  1.57it/s, loss=0.00297] 


Validation Metrics - Precision: 0.0385, Recall: 0.0000, F1: 0.0001, Accuracy: 0.0000
Epoch 3/10, Loss: 0.0039, F1: 0.0001, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 4/10: 100%|██████████| 1540/1540 [17:12<00:00,  1.49it/s, loss=0.00161] 


Validation Metrics - Precision: 0.0385, Recall: 0.0000, F1: 0.0001, Accuracy: 0.0000
Epoch 4/10, Loss: 0.0027, F1: 0.0001, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 5/10: 100%|██████████| 1540/1540 [13:31<00:00,  1.90it/s, loss=0.00214] 


Validation Metrics - Precision: 0.0385, Recall: 0.0000, F1: 0.0001, Accuracy: 0.0000
Epoch 5/10, Loss: 0.0025, F1: 0.0001, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 6/10: 100%|██████████| 1540/1540 [12:33<00:00,  2.04it/s, loss=0.00134] 


Validation Metrics - Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Accuracy: 0.0000
Epoch 6/10, Loss: 0.0022, F1: 0.0000, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 7/10: 100%|██████████| 1540/1540 [12:32<00:00,  2.05it/s, loss=0.00123] 


Validation Metrics - Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Accuracy: 0.0000
Epoch 7/10, Loss: 0.0020, F1: 0.0000, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 8/10: 100%|██████████| 1540/1540 [12:31<00:00,  2.05it/s, loss=0.00118] 


Validation Metrics - Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Accuracy: 0.0000
Epoch 8/10, Loss: 0.0020, F1: 0.0000, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 9/10: 100%|██████████| 1540/1540 [12:46<00:00,  2.01it/s, loss=0.00114] 


Validation Metrics - Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Accuracy: 0.0000
Epoch 9/10, Loss: 0.0020, F1: 0.0000, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7


Epoch 10/10: 100%|██████████| 1540/1540 [13:13<00:00,  1.94it/s, loss=0.00195] 


Validation Metrics - Precision: 0.0000, Recall: 0.0000, F1: 0.0000, Accuracy: 0.0000
Epoch 10/10, Loss: 0.0020, F1: 0.0000, Accuracy: 0.0000
Auxiliary Weight Before Adjustment: 0.7
Auxiliary Weight After Adjustment: 0.7
Processing Class 0
Threshold: 0.00, Precision: 0.2703, Recall: 0.2526, F1: 0.2612
Threshold: 0.01, Precision: 0.3684, Recall: 0.0147, F1: 0.0283
Threshold: 0.02, Precision: 0.5000, Recall: 0.0084, F1: 0.0166
Threshold: 0.03, Precision: 0.5000, Recall: 0.0063, F1: 0.0125
Threshold: 0.04, Precision: 0.5000, Recall: 0.0021, F1: 0.0042
Threshold: 0.05, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.06, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.07, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.08, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.09, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.10, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Threshold: 0.11, Precision: 0.0000, Recall: 0.0000, F1: 0.0000
Thres