In [34]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

MAPPER = {
  'species': {
    0: 'hyena',
    1: 'zebra',
    2: 'giraffe',
    3: 'buffalo',
    4: 'gazelle',
    5: 'wildebeest',
    6: 'elephant',
    7: 'lion',
    8: 'bird'
  },
  'behavior': {
    0: 'moving',
    1: 'eating',
    2: 'resting'
  },
  'animal': {
    0: 'empty',
    1: 'animal'
  }
}

RESULTS ANALYSIS

In [16]:
# Function to determine the task based on the file path
def get_task_from_path(path):
    if "species-classifier" in path:
        return "species"
    elif "animal-classifier" in path:
        return "animal"
    elif "behaviour-classifier" in path or "behavior-classifier" in path:
        return "behavior"
    else:
        return "unknown"

# Load CSV data and add task and model information
def load_model_data(model_list):
    model_data = []
    for entry in model_list:
        results_path = entry["results_path"]
        model = entry["model"]
        task = get_task_from_path(results_path)
        
        if os.path.exists(results_path):
            df = pd.read_csv(results_path)
            
            # Ensure 'pred' and 'real' are of consistent types
            df['pred'] = df['pred'].astype(str)  # Convert predictions to strings
            df['real'] = df['real'].astype(str)  # Convert ground truth to strings
            
            df['model'] = model
            df['task'] = task
            model_data.append(df)
        else:
            print(f"Warning: File not found - {results_path}")
    return pd.concat(model_data, ignore_index=True) if model_data else pd.DataFrame()

# Calculate metrics for a group (by task and model)
def calculate_metrics_for_group(df):
    pred = df['pred']
    real = df['real']
    metrics = {
        'accuracy': accuracy_score(real, pred),
        'precision': precision_score(real, pred, average='weighted', zero_division=0),
        'recall': recall_score(real, pred, average='weighted', zero_division=0),
        'f1_score': f1_score(real, pred, average='weighted', zero_division=0)
    }
    return metrics

# Aggregate metrics by task and model
def evaluate_models(data):
    grouped_metrics = []
    for (task, model), group in data.groupby(['task', 'model']):
        metrics = calculate_metrics_for_group(group)
        metrics.update({'task': task, 'model': model})
        grouped_metrics.append(metrics)
    return pd.DataFrame(grouped_metrics)

# Visualize metrics with vertical model names in data labels
def visualize_model_performance(metrics_df, metric_name="accuracy"):
    # Pivot the metrics dataframe for easier plotting
    pivot_table = metrics_df.pivot(index='task', columns='model', values=metric_name)

    # Define parameters for bar placement and width
    bar_width = 0.15  # Width of each individual bar
    group_width = bar_width * len(pivot_table.columns) + 0.2  # Space for each group
    x = np.arange(len(pivot_table)) * group_width  # Adjust group spacing

    # Create the figure and axis
    fig, ax = plt.subplots(figsize=(14, 8))

    # Plot each model's bar with adjusted positions
    for i, model in enumerate(pivot_table.columns):
        positions = x + i * bar_width  # Offset for each model within the group
        ax.bar(
            positions,
            pivot_table[model],
            bar_width,
            label=model,
            edgecolor='black',
            alpha=0.85
        )

        # Add data labels
        for pos, value in zip(positions, pivot_table[model]):
            ax.text(
                pos, value + 0.01, 
                f"{value:.2f}", 
                fontsize=10, ha='center', va='bottom'
            )

    # Customize the chart aesthetics
    ax.set_title(f"Model Performance by Task ({metric_name.capitalize()})", fontsize=16, fontweight='bold')
    ax.set_xlabel("Task", fontsize=14)
    ax.set_ylabel(metric_name.capitalize(), fontsize=14)
    ax.set_xticks(x + (len(pivot_table.columns) - 1) * bar_width / 2)  # Center tick under group
    ax.set_xticklabels(pivot_table.index, rotation=45, ha='right', fontsize=12)
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))  # Convert to percentage if applicable
    ax.legend(title="Model", fontsize=12, title_fontsize=14, loc="upper left", bbox_to_anchor=(1.05, 1))
    ax.grid(axis='y', linestyle='--', alpha=0.6)

    # Adjust layout
    plt.tight_layout()
    plt.show()

def replace_text(df, column_name):
    def replace_value(value):
        try:
            float(value)
            return value 
        except (ValueError, TypeError):
            return "-1"
    df[column_name] = df[column_name].apply(replace_value)
    return df

In [17]:
# List of models and results
model_list = [
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/blip_serengeti_-.csv", "model": "BLIP"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/blip_serengeti_pretrained.csv", "model": "BLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/clip_serengeti_-.csv", "model": "CLIP"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/clip_serengeti_pretrained.csv", "model": "CLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/gpt_serengeti_-.csv", "model": "GPT"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/gemini_serengeti_-.csv", "model": "GEMINI"},

    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/blip_serengeti_seq.csv", "model": "BLIP-Seq"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/blip_serengeti_pretrained-seq.csv", "model": "BLIP-FewShot-Seq"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/clip_serengeti_seq.csv", "model": "CLIP-Seq"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/clip_serengeti_pretrained-seq.csv", "model": "CLIP-FewShot-Seq"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/gpt_serengeti_seq.csv", "model": "GPT-Seq"},
    {"results_path": "/data/luiz/dataset/results/behaviour-classifier/gemini_serengeti_seq.csv", "model": "GEMINI-Seq"},

    {"results_path": "/data/luiz/dataset/results/animal-classifier/blip_serengeti_-.csv", "model": "BLIP"},
    {"results_path": "/data/luiz/dataset/results/animal-classifier/blip_serengeti_pretrained.csv", "model": "BLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/animal-classifier/clip_serengeti_-.csv", "model": "CLIP"},
    {"results_path": "/data/luiz/dataset/results/animal-classifier/clip_serengeti_pretrained.csv", "model": "CLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/animal-classifier/gpt_serengeti_-.csv", "model": "GPT"},
    {"results_path": "/data/luiz/dataset/results/animal-classifier/gemini_serengeti_-.csv", "model": "GEMINI"},


    {"results_path": "/data/luiz/dataset/results/species-classifier/blip_serengeti_-.csv", "model": "BLIP"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/blip_serengeti_pretrained.csv", "model": "BLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/clip_serengeti_-.csv", "model": "CLIP"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/clip_serengeti_pretrained.csv", "model": "CLIP-FewShot"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/gpt_serengeti_-.csv", "model": "GPT"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/gemini_serengeti_-.csv", "model": "GEMINI"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/gemini_serengeti_-.csv", "model": "GEMINI"},
    {"results_path": "/data/luiz/dataset/results/species-classifier/gemini_serengeti_-.csv", "model": "GEMINI"},
]

# Load data
data = load_model_data(model_list)
data = replace_text(data, "pred")

In [37]:
# Function to compute classification metrics for each model and task
def compute_metrics_by_class(df):
    grouped_metrics = []

    for (model, task), group in df.groupby(['model', 'task']):
        # Get unique labels for this task only
        labels = list(set(group['real']))
        
        precision, recall, f1, _ = precision_recall_fscore_support(
            group['real'], group['pred'], labels=labels, zero_division=0
        )
        for label, p, r, f in zip(labels, precision, recall, f1):
            grouped_metrics.append({
                'model': model,
                'task': task,
                'class': label,
                'precision': round(p, 4),
                'recall': round(r, 4),
                'f1_score': round(f, 4)
            })
    return pd.DataFrame(grouped_metrics)

def compute_metrics(df):
    # Group by model and task
    grouped = df.groupby(['model', 'task'])
    results = []

    # Iterate through each group
    for (model, task), group in grouped:
        y_true = group['real']
        y_pred = group['pred']
        
        # Compute metrics (assuming classification and labels are binary or multiclass)
        precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
        
        results.append({
            'model': model,
            'task': task,
            'precision': round(precision, 4),
            'recall': round(recall, 4),
            'f1_score': round(f1, 4)
        })

    # Convert results to a DataFrame
    metrics_df = pd.DataFrame(results)
    return metrics_df

metrics_df = compute_metrics(data)
metrics_df_by_class = compute_metrics_by_class(data)


In [38]:
metrics_df

Unnamed: 0,model,task,precision,recall,f1_score
0,BLIP,animal,0.0464,0.0118,0.0188
1,BLIP,behavior,0.125,0.0008,0.0016
2,BLIP,species,0.1861,0.0005,0.0011
3,BLIP-FewShot,animal,0.91,0.91,0.91
4,BLIP-FewShot,behavior,0.7041,0.7011,0.7011
5,BLIP-FewShot,species,0.7684,0.7532,0.7556
6,BLIP-FewShot-Seq,behavior,0.764,0.7555,0.7567
7,BLIP-Seq,behavior,0.1073,0.0653,0.0812
8,CLIP,animal,0.815,0.7983,0.7959
9,CLIP,behavior,0.3348,0.343,0.3321


In [None]:
metrics_df_by_class

Unnamed: 0,model,task,class,precision,recall,f1_score
0,BLIP,animal,empty,0.0000,0.0000,0.0000
1,BLIP,animal,animal,0.6030,0.1528,0.2438
2,BLIP,behavior,moving,0.5000,0.0033,0.0066
3,BLIP,behavior,resting,0.0000,0.0000,0.0000
4,BLIP,behavior,eating,0.0000,0.0000,0.0000
...,...,...,...,...,...,...
95,GPT,species,gazelle,0.9205,0.7535,0.8286
96,GPT,species,giraffe,0.9716,0.8368,0.8992
97,GPT,species,zebra,0.7458,0.7525,0.7491
98,GPT,species,wildebeest,0.7943,0.4991,0.6130


In [19]:
# Mapping function
def map_class_labels(row, mapper):
    task = row['task']
    class_value = row['class']
    return mapper.get(task, {}).get(int(class_value), class_value)  # Keep original if not found

# Apply the mapping
metrics_df_by_class['class'] = metrics_df_by_class.apply(lambda row: map_class_labels(row, MAPPER), axis=1)

In [40]:
import json

def convert_to_json(df):
    return json.dumps(df.to_dict(orient="records"))

def filter_results(task, metrics):
    response = metrics_df_by_class[metrics_df_by_class["task"] == task][['model', 'class', metrics]]
    return convert_to_json(response)

print(filter_results('behavior', 'recall'))

[{"model": "BLIP", "class": "0", "recall": 0.0033}, {"model": "BLIP", "class": "2", "recall": 0.0}, {"model": "BLIP", "class": "1", "recall": 0.0}, {"model": "BLIP-FewShot", "class": "0", "recall": 0.662}, {"model": "BLIP-FewShot", "class": "2", "recall": 0.6706}, {"model": "BLIP-FewShot", "class": "1", "recall": 0.7707}, {"model": "BLIP-FewShot-Seq", "class": "0", "recall": 0.7392}, {"model": "BLIP-FewShot-Seq", "class": "2", "recall": 0.7232}, {"model": "BLIP-FewShot-Seq", "class": "1", "recall": 0.804}, {"model": "BLIP-Seq", "class": "0", "recall": 0.0}, {"model": "BLIP-Seq", "class": "2", "recall": 0.0}, {"model": "BLIP-Seq", "class": "1", "recall": 0.2614}, {"model": "CLIP", "class": "0", "recall": 0.4886}, {"model": "CLIP", "class": "2", "recall": 0.3497}, {"model": "CLIP", "class": "1", "recall": 0.1909}, {"model": "CLIP-FewShot", "class": "0", "recall": 0.5436}, {"model": "CLIP-FewShot", "class": "2", "recall": 0.5748}, {"model": "CLIP-FewShot", "class": "1", "recall": 0.3289},

In [41]:
print(filter_results('species', 'recall'))

[{"model": "BLIP", "class": "0", "recall": 0.0}, {"model": "BLIP", "class": "7", "recall": 0.0027}, {"model": "BLIP", "class": "8", "recall": 0.0}, {"model": "BLIP", "class": "3", "recall": 0.0}, {"model": "BLIP", "class": "6", "recall": 0.0}, {"model": "BLIP", "class": "4", "recall": 0.0}, {"model": "BLIP", "class": "2", "recall": 0.0018}, {"model": "BLIP", "class": "1", "recall": 0.0009}, {"model": "BLIP", "class": "5", "recall": 0.0}, {"model": "BLIP-FewShot", "class": "0", "recall": 0.8575}, {"model": "BLIP-FewShot", "class": "7", "recall": 0.8}, {"model": "BLIP-FewShot", "class": "8", "recall": 0.669}, {"model": "BLIP-FewShot", "class": "3", "recall": 0.5669}, {"model": "BLIP-FewShot", "class": "6", "recall": 0.8058}, {"model": "BLIP-FewShot", "class": "4", "recall": 0.7991}, {"model": "BLIP-FewShot", "class": "2", "recall": 0.8812}, {"model": "BLIP-FewShot", "class": "1", "recall": 0.6535}, {"model": "BLIP-FewShot", "class": "5", "recall": 0.746}, {"model": "CLIP", "class": "0", 

In [42]:
print(filter_results('animal', 'recall'))

[{"model": "BLIP", "class": "0", "recall": 0.0}, {"model": "BLIP", "class": "1", "recall": 0.1528}, {"model": "BLIP-FewShot", "class": "0", "recall": 0.9081}, {"model": "BLIP-FewShot", "class": "1", "recall": 0.9119}, {"model": "CLIP", "class": "0", "recall": 0.9125}, {"model": "CLIP", "class": "1", "recall": 0.684}, {"model": "CLIP-FewShot", "class": "0", "recall": 0.735}, {"model": "CLIP-FewShot", "class": "1", "recall": 0.7828}, {"model": "GEMINI", "class": "0", "recall": 0.7115}, {"model": "GEMINI", "class": "1", "recall": 0.9538}, {"model": "GPT", "class": "0", "recall": 0.949}, {"model": "GPT", "class": "1", "recall": 0.7635}]


DATASET ANALYSIS

In [23]:
# Define tasks and subsets
tasks = ['animal-classifier', 'behaviour-classifier', 'species-classifier']
subsets = ['val', 'test', 'train']

# Base directory
base_dir = '/data/luiz/dataset/partitions'

# Function to count categories
def count_categories(task, subset):
    file_path = os.path.join(base_dir, task, 'serengeti', f'{subset}.csv')
    
    if not os.path.exists(file_path):
        print(f'File not found: {file_path}')
        return
    
    # Read CSV
    df = pd.read_csv(file_path)
    
    # Check if 'category' column exists
    if 'category' not in df.columns:
        print(f'No "category" column in {file_path}')
        return

    # Count the occurrences of each category
    category_counts = df['category'].value_counts()

    print(f'\nTask: {task}, Subset: {subset}')
    print(category_counts)

# Iterate through all tasks and subsets
for task in tasks:
    for subset in subsets:
        count_categories(task, subset)


Task: animal-classifier, Subset: val
yes    1538
no     1462
Name: category, dtype: int64

Task: animal-classifier, Subset: test
no     5019
yes    4981
Name: category, dtype: int64

Task: animal-classifier, Subset: train
yes    10114
no      9886
Name: category, dtype: int64

Task: behaviour-classifier, Subset: val
moving     1033
resting     988
eating      979
Name: category, dtype: int64

Task: behaviour-classifier, Subset: test
eating     3363
moving     3328
resting    3309
Name: category, dtype: int64

Task: behaviour-classifier, Subset: train
resting    6732
eating     6682
moving     6586
Name: category, dtype: int64

Task: species-classifier, Subset: val
gazelle       350
hyena         347
elephant      341
lion          334
bird          333
buffalo       329
zebra         323
wildebeest    322
giraffe       321
Name: category, dtype: int64

Task: species-classifier, Subset: test
buffalo       1136
hyena         1130
bird          1127
lion          1125
wildebeest    1122


VIEWING PREDINGS

In [24]:
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import os

# Function to check if an image meets the rendering criteria
def should_render_image(group):
    correct_predictions = (group['pred'] == group['real']).sum()
    return correct_predictions == 1

# Main code to process the DataFrame and render images
def render_images(df):
    # Group by 'path' to process predictions per image
    grouped = df.groupby('path')
    
    # Track models for which an image has already been rendered
    rendered_models = set()
    
    for path, group in grouped:
        if should_render_image(group):
            idxs = group.loc[group['pred'] == group['real']].index
            for idx in idxs:
                correct_row = df.iloc[idx]
                correct_model = correct_row['model']
                task_name = correct_row['task']
                model_prediction = correct_row['pred']
                
                if (correct_model, task_name) not in rendered_models and task_name== "animal" and correct_model=="CLIP-FewShot":
                    try:
                        # Load the image
                        img = Image.open(path)
                        
                        # Display the image
                        plt.figure(figsize=(10, 10))
                        plt.imshow(img)
                        plt.axis('off')
                        plt.title(f"Task: {task_name}\nModel: {correct_model}\nPrediction: {model_prediction}")
                        plt.show()
                        
                        # Mark the model as rendered
                        # rendered_models.add((correct_model, task_name))
                        
                    except FileNotFoundError:
                        print(f"File not found: {path}")
                    except Exception as e:
                        print(f"Error loading image {path}: {e}")

# render_images(data)
