In [None]:
# Install all dependencies with specific versions for compatibility
!pip uninstall torch torchvision transformers numpy -y  # Clean uninstall first
!pip install numpy==1.26.4  # Install NumPy first to avoid conflicts
!pip install torch==2.1.2 torchvision==0.16.2
!pip install transformers==4.36.2 datasets scikit-learn pandas kaggle tqdm pillow

In [2]:
import os
import torch
import transformers
from torch.utils.data import DataLoader, Dataset
import random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics import precision_recall_fscore_support
from transformers import AutoModelForImageClassification, AutoImageProcessor
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from transformers import ViTForImageClassification
from torch import nn, optim


Disabling PyTorch because PyTorch >= 2.1 is required but found 2.0.1
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/pyt

In [3]:
# Set random seed for reproducibility
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.backends.mps.is_available():
        torch.mps.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 464
seed_everything(SEED)

In [4]:
# Set device - MacBook compatible
def get_device():
    if torch.backends.mps.is_available():
        return torch.device("mps")  # Apple Silicon GPU
    else:
        return torch.device("cpu")  # Fallback to CPU

device = get_device()
print(f"Using device: {device}")

Using device: mps


In [5]:
def parse_annotations(csv_path):
    # Load the CSV file
    df = pd.read_csv(csv_path)

    # Select only numeric category columns (9:35 worked assumed)
    category_columns = df.columns[8:34]

    # Debugging: Check if the selected columns are numeric
    print("Selected category columns:", category_columns)
    print("Column types:", df[category_columns].dtypes)

    # Ensure all data in category columns is numeric
    for col in category_columns:
        if not pd.api.types.is_numeric_dtype(df[col]):
            raise ValueError(f"Column {col} contains non-numeric data.")

    # Calculate class counts
    class_counts = df[category_columns].sum().to_numpy(dtype=np.float32)

    # Parse annotations
    annotations = []
    for _, row in df.iterrows():
        categories = [int(idx) for idx, val in enumerate(row[category_columns]) if val == 1]
        annotation = {"filename": row["Crop_name"], "categories": categories}
        annotations.append(annotation)

    return annotations, class_counts



In [6]:
from sklearn.metrics import average_precision_score

def calculate_map_and_f1(all_targets, all_predictions, all_probabilities):
    """
    Calculate mAP and F1 scores for multi-label classification.
    """
    # Calculate mAP
    map_scores = []
    for i in range(all_targets.shape[1]):
        if np.sum(all_targets[:, i]) > 0:  # Only calculate if positive samples exist
            ap = average_precision_score(all_targets[:, i], all_probabilities[:, i])
            map_scores.append(ap)
    
    mean_ap = np.mean(map_scores) if map_scores else 0.0
    
    # Calculate F1 score
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_targets, all_predictions, average="macro", zero_division=0
    )
    
    return mean_ap, f1, precision, recall

In [7]:
class EMOTICDataset(torch.utils.data.Dataset):
    def __init__(self, annotations, img_dir, feature_extractor, num_categories=26):
        self.annotations = annotations
        self.img_dir = img_dir
        self.feature_extractor = feature_extractor
        self.num_categories = num_categories

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        entry = self.annotations[idx]
        img_path = os.path.join(self.img_dir, entry['filename'])

        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")

        # Load the image and ensure it's RGB
        image = np.load(img_path)
        if len(image.shape) == 2:  # Grayscale image
            image = np.stack([image] * 3, axis=-1)

        # Preprocess the image
        inputs = self.feature_extractor(images=image, return_tensors="pt")
        inputs = {key: val.squeeze(0).to(device) for key, val in inputs.items()}

        # Multi-hot encoding for labels
        categories = torch.zeros(self.num_categories, dtype=torch.float32).to(device)
        for category in entry['categories']:
            if category < self.num_categories:
                categories[category] = 1.0

        inputs["labels"] = categories
        return inputs



In [8]:
class CustomViTForImageClassification(ViTForImageClassification):
    def forward(
        self,
        pixel_values,
        head_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        interpolate_pos_encoding=None,
        return_dict=None
    ):
        # Outputs from the ViT model
        outputs = self.vit(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        
        # Ensure sequence_output is on the same device as the classifier
        sequence_output = sequence_output.to(next(self.classifier.parameters()).device)
        logits = self.classifier(sequence_output[:, 0, :])
        return type('ModelOutput', (), {'logits': logits})()


In [9]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=2, alpha=None, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, logits, targets):
        probs = torch.sigmoid(logits)
        ce_loss = nn.functional.binary_cross_entropy_with_logits(logits, targets, reduction='none')

        # Modulation
        p_t = probs * targets + (1 - probs) * (1 - targets)
        loss = ce_loss * ((1 - p_t) ** self.gamma)

        if self.alpha is not None:
            alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
            loss = alpha_t * loss

        if self.reduction == 'mean':
            return loss.mean()
        elif self.reduction == 'sum':
            return loss.sum()
        return loss


In [10]:
def calculate_dynamic_thresholds(model, val_loader, device):
    """
    Calculate optimal thresholds for each label based on F1 score.
    """
    model.eval()
    all_targets = []
    all_outputs = []

    with torch.no_grad():
        for batch in val_loader:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            logits = model(images).logits
            probabilities = torch.sigmoid(logits).cpu().numpy()
            all_targets.extend(labels.cpu().numpy())
            all_outputs.extend(probabilities)

    all_targets = np.vstack(all_targets)
    all_outputs = np.vstack(all_outputs)

    # Adjust thresholds to consider the low logit values
    thresholds = []
    for i in range(all_targets.shape[1]):
        best_threshold = 0.1
        best_f1 = 0
        print(f"Processing Class {i}")
        for threshold in np.arange(0.001, 0.2, 0.01):
            preds = (all_outputs[:, i] > threshold).astype(int)
            precision, recall, f1, _ = precision_recall_fscore_support(
                all_targets[:, i], preds, average="binary", zero_division=0
            )
            print(f"Threshold: {threshold:.2f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        thresholds.append(best_threshold)
        print(f"Best Threshold for Class {i}: {best_threshold}, Best F1: {best_f1:.4f}")
    
    print(f"Dynamic Thresholds: {thresholds}")
    return np.array(thresholds)


In [11]:
def validate_model(model, val_loader, device, thresholds=None):
    """
    Validate the model and compute mAP and F1 metrics only.
    """
    model.eval()
    all_targets = []
    all_predictions = []
    all_probabilities = []

    with torch.no_grad():
        for batch in val_loader:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            logits = model(images).logits
            probabilities = torch.sigmoid(logits).cpu().numpy()

            # Use dynamic thresholds if provided
            if thresholds is not None:
                predictions = (probabilities > thresholds).astype(int)
            else:
                predictions = (probabilities > 0.1).astype(int)

            all_targets.extend(labels.cpu().numpy())
            all_predictions.extend(predictions)
            all_probabilities.extend(probabilities)

    all_targets = np.vstack(all_targets)
    all_predictions = np.vstack(all_predictions)
    all_probabilities = np.vstack(all_probabilities)

    # Calculate mAP and F1 scores
    mean_ap, f1, precision, recall = calculate_map_and_f1(all_targets, all_predictions, all_probabilities)

    print(f"Validation Metrics - mAP: {mean_ap:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")
    return mean_ap, f1, precision, recall

In [12]:
def train_model_with_dynamic_weights(
    model, train_loader, val_loader, optimizer, num_epochs, device, model_path, class_counts
):
    class_weights = 1.0 / torch.tensor(class_counts, dtype=torch.float32, device=device)
    criterion_bce = nn.BCEWithLogitsLoss(pos_weight=class_weights)
    criterion_focal = FocalLoss(gamma=2, alpha=class_weights)

    # Remove CUDA-specific scaler for Mac compatibility
    scheduler = StepLR(optimizer, step_size=3, gamma=0.1)

    best_f1 = 0.0
    auxiliary_weight = 0.5

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        train_loader_iter = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")

        for batch in train_loader_iter:
            images = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            model.to(device)

            optimizer.zero_grad()

            # Remove autocast for Mac compatibility
            logits = model(images).logits

            loss_bce = criterion_bce(logits, labels)
            loss_focal = criterion_focal(logits, labels)
            total_loss_batch = loss_bce + auxiliary_weight * loss_focal

            total_loss_batch.backward()
            optimizer.step()

            train_loader_iter.set_postfix(loss=total_loss_batch.item())
            total_loss += total_loss_batch.item()

        scheduler.step()
        mean_ap, f1, precision, recall = validate_model(model, val_loader, device)
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}, F1: {f1:.4f}, mAP: {mean_ap:.4f}")
        
        print(f"Auxiliary Weight Before Adjustment: {auxiliary_weight}")

        if f1 < 0.7:
            auxiliary_weight = 0.7
        elif f1 > 0.9:
            auxiliary_weight = 0.3
        else:
            auxiliary_weight = 0.5

        print(f"Auxiliary Weight After Adjustment: {auxiliary_weight}")

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), model_path)
            print(f"Model saved with F1 score: {best_f1:.4f}")



In [14]:
# Main Script
if __name__ == "__main__":
    # Updated paths for local execution
    train_annotations_path = "archive/annots_arrs/annot_arrs_train.csv"
    val_annotations_path = "archive/annots_arrs/annot_arrs_val.csv"
    img_dir = "archive/img_arrs/"
    model_path = "best_vit_emotic.pth"
    
    batch_size = 8  # Reduced batch size for Mac compatibility
    num_epochs = 10
    learning_rate = 1e-4
    num_classes = 26
    
    # Use slower image processor to avoid PyTorch version conflicts
    feature_extractor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224", use_fast=False)
    
    print("Parsing annotations...")
    train_annotations, class_counts = parse_annotations(train_annotations_path)
    val_annotations, _ = parse_annotations(val_annotations_path)
    
    print("Creating datasets...")
    train_dataset = EMOTICDataset(train_annotations, img_dir, feature_extractor, num_categories=num_classes)
    val_dataset = EMOTICDataset(val_annotations, img_dir, feature_extractor, num_categories=num_classes)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    print("Loading model...")
    
    # Alternative loading method to bypass AutoModel issues
    try:
        # First attempt: Direct ViT import (bypasses Auto class checks)
        from transformers import ViTForImageClassification
        
        model = ViTForImageClassification.from_pretrained(
            "google/vit-base-patch16-224",
            num_labels=num_classes,
            ignore_mismatched_sizes=True
        ).to(device)
        print("Model loaded successfully using direct ViT import!")
        
    except Exception as e:
        print(f"Direct import failed: {e}")
        print("Trying alternative method...")
        
        try:
            # Second attempt: Import from models module
            from transformers.models.vit import ViTForImageClassification
            
            model = ViTForImageClassification.from_pretrained(
                "google/vit-base-patch16-224",
                num_labels=num_classes,
                ignore_mismatched_sizes=True
            ).to(device)
            print("Model loaded successfully using models module!")
            
        except Exception as e:
            print(f"Alternative import also failed: {e}")
            print("Falling back to original AutoModel (may fail)...")
            
            # Fallback to original method
            model = AutoModelForImageClassification.from_pretrained(
                "google/vit-base-patch16-224",
                num_labels=num_classes,
                ignore_mismatched_sizes=True
            ).to(device)
    
    # Replace classifier
    model.classifier = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(model.config.hidden_size, num_classes)
    ).to(device)
    
    # Freeze ViT parameters, only train classifier
    for param in model.vit.parameters():
        param.requires_grad = False
    
    for param in model.classifier.parameters():
        param.requires_grad = True
    
    optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)
    
    print("Training the model...")
    train_model_with_dynamic_weights(
        model, train_loader, val_loader, optimizer, num_epochs, device, model_path, class_counts
    )
    
    print("Calculating dynamic thresholds...")
    thresholds = calculate_dynamic_thresholds(model, val_loader, device)
    
    print("Final validation with dynamic thresholds...")
    final_map, final_f1, final_precision, final_recall = validate_model(model, val_loader, device, thresholds=thresholds)
    print(f"Final Results - mAP: {final_map:.4f}, F1: {final_f1:.4f}")

Fast image processor class <class 'transformers.image_processing_vit_fast._LazyModule.__getattr__.<locals>.Placeholder'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Parsing annotations...
Selected category columns: Index(['Peace', 'Affection', 'Esteem', 'Anticipation', 'Engagement',
       'Confidence', 'Happiness', 'Pleasure', 'Excitement', 'Surprise',
       'Sympathy', 'Doubt/Confusion', 'Disconnection', 'Fatigue',
       'Embarrassment', 'Yearning', 'Disapproval', 'Aversion', 'Annoyance',
       'Anger', 'Sensitivity', 'Sadness', 'Disquietment', 'Fear', 'Pain',
       'Suffering'],
      dtype='object')
Column types: Peace              float64
Affection          float64
Esteem             float64
Anticipation       float64
Engagement         float64
Confidence         float64
Happiness          float64
Pleasure           float64
Excitement         float64
Surprise           float64
Sympathy           float64
Doubt/Confusion    float64
Disconnection      float64
Fatigue            float64
Embarrassment      float64
Yearning           float64
Disapproval        float64
Aversion           float64
Annoyance          float64
Anger              floa

ImportError: 
AutoModelForImageClassification requires the PyTorch library but it was not found in your environment. Check out the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
