In [3]:
import torch
from PIL import Image
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
import numpy as np
import re

In [4]:
def evaluate(y_true, y_pred):
    labels = ["0", "1"]
    mapping = {label: idx for idx, label in enumerate(labels)}
    
    def map_func(x):
        return mapping.get(x, -1)  # Map to -1 if not found, but should not occur with correct data
    
    y_true_mapped = np.vectorize(map_func)(y_true)
    y_pred_mapped = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true_mapped, y_pred=y_pred_mapped)
    print(f'Accuracy: {accuracy:.4f}')
    
    # Generate accuracy report
    unique_labels = set(y_true_mapped)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true_mapped)) if y_true_mapped[i] == label]
        label_y_true = [y_true_mapped[i] for i in label_indices]
        label_y_pred = [y_pred_mapped[i] for i in label_indices]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {labels[label]}: {label_accuracy:.4f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true_mapped, y_pred=y_pred_mapped, target_names=labels, labels=list(range(len(labels))), digits=4)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true_mapped, y_pred=y_pred_mapped, labels=list(range(len(labels))))
    print('\nConfusion Matrix:')
    print(conf_matrix)

### Performance of Idefics2 9B

In [9]:
idefics_df = pd.read_json('/home/gpuuser3/sinngam_albert/work/mllm_finetune/idefics2_9B_test_predictions.json', dtype='str')

In [10]:
idefics_df.columns

Index(['image_id', 'text', 'label', 'image_path', 'question',
       'idefics2_response', 'idefics2_label'],
      dtype='object')

In [7]:
idefics_df['idefics2_label'].value_counts()

idefics2_label
0    1501
1     908
Name: count, dtype: int64

In [11]:
evaluate(idefics_df['label'].values, idefics_df['idefics2_label'].values)

Accuracy: 0.7090
Accuracy for label 0: 0.7915
Accuracy for label 1: 0.5998

Classification Report:
              precision    recall  f1-score   support

           0     0.7235    0.7915    0.7560      1372
           1     0.6850    0.5998    0.6396      1037

    accuracy                         0.7090      2409
   macro avg     0.7043    0.6957    0.6978      2409
weighted avg     0.7069    0.7090    0.7059      2409


Confusion Matrix:
[[1086  286]
 [ 415  622]]


### Performance test of Llava 1.6 Vicuna 7B

In [12]:
vic_df = pd.read_json('/home/gpuuser3/sinngam_albert/work/mllm_finetune/llava16_vicuna_7B_mmsd2_test_predictions.json', dtype='str')

In [13]:
vic_df['llava16_label'].value_counts()

llava16_label
0    1442
1     967
Name: count, dtype: int64

In [14]:
evaluate(vic_df['label'].tolist(), vic_df['llava16_label'].tolist())

Accuracy: 0.7186
Accuracy for label 0: 0.7784
Accuracy for label 1: 0.6393

Classification Report:
              precision    recall  f1-score   support

           0     0.7406    0.7784    0.7591      1372
           1     0.6856    0.6393    0.6617      1037

    accuracy                         0.7186      2409
   macro avg     0.7131    0.7089    0.7104      2409
weighted avg     0.7170    0.7186    0.7171      2409


Confusion Matrix:
[[1068  304]
 [ 374  663]]


### Performance test on LLama 3.2 11B

### Performance test on LLava 1.5 7B

In [15]:
import pandas as pd
llava15_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/mllm_finetune/llava15_7B_mmsd2_test_predictions.json", dtype='str')

In [16]:
llava15_df.head()

Unnamed: 0,image_id,text,label,image_path,question,llava15_response,llava15_label
0,862902619928506372,i am guessing # netflix no longer lets you gra...,1,/home/gpuuser3/sinngam_albert/datasets/mmsd2/d...,Classify the text <i am guessing # netflix no ...,SARCASTIC,1
1,892551658487631873,it 's the insensitive strikeouts at suntrust p...,1,/home/gpuuser3/sinngam_albert/datasets/mmsd2/d...,Classify the text <it 's the insensitive strik...,SARCASTIC,1
2,853143461360480256,"following the path of the river calder , so .....",1,/home/gpuuser3/sinngam_albert/datasets/mmsd2/d...,Classify the text <following the path of the r...,NOT SARCASTIC,0
3,918423568823840768,# westernsahara # authority has no lessons 2ge...,1,/home/gpuuser3/sinngam_albert/datasets/mmsd2/d...,Classify the text <# westernsahara # authority...,SARCASTIC,1
4,731617467718610944,hey <user> great sale !,1,/home/gpuuser3/sinngam_albert/datasets/mmsd2/d...,Classify the text <hey <user> great sale !> an...,NOT SARCASTIC,0


In [17]:
evaluate(llava15_df['label'], llava15_df['llava15_label'])

Accuracy: 0.6513
Accuracy for label 0: 0.5466
Accuracy for label 1: 0.7898

Classification Report:
              precision    recall  f1-score   support

           0     0.7748    0.5466    0.6410      1372
           1     0.5684    0.7898    0.6610      1037

    accuracy                         0.6513      2409
   macro avg     0.6716    0.6682    0.6510      2409
weighted avg     0.6859    0.6513    0.6496      2409


Confusion Matrix:
[[750 622]
 [218 819]]


### Performance test on LLava 1.6 Mistral 7B

In [18]:
import pandas as pd
llava16_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/mllm_finetune/llava16_mistral_7B_mmsd2_test_predictions.json", dtype='str')

In [19]:
llava16_df['llava16_label'].value_counts()

llava16_label
1    1497
0     912
Name: count, dtype: int64

In [20]:
evaluate(
    y_true=llava16_df['label'].tolist(),
    y_pred=llava16_df['llava16_label'].tolist()
)

Accuracy: 0.7484
Accuracy for label 0: 0.6115
Accuracy for label 1: 0.9296

Classification Report:
              precision    recall  f1-score   support

           0     0.9200    0.6115    0.7347      1372
           1     0.6440    0.9296    0.7609      1037

    accuracy                         0.7484      2409
   macro avg     0.7820    0.7706    0.7478      2409
weighted avg     0.8011    0.7484    0.7459      2409


Confusion Matrix:
[[839 533]
 [ 73 964]]


### Consistency Analysis

In [21]:
ivl_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/internvl_ft/inference_test/internvl25_8B_mmsd2_test_predictions.json", dtype="str")
llama_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/mllm_finetune/llama32_11b_mmsd2_test_predictions.json", dtype="str")
mistral_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/mllm_finetune/llava16_mistral_7B_mmsd2_test_predictions.json", dtype="str")
vic_df = pd.read_json("/home/gpuuser3/sinngam_albert/work/mllm_finetune/llava16_vicuna_7B_mmsd2_test_predictions.json", dtype="str")

In [32]:
def check_consistencyN_with_truth(model_preds, tL):
    """
    model_preds: List of lists, where each inner list is the predictions from one model.
    tL: Ground truth labels (list of strings, '0' or '1')
    """
    from collections import Counter

    total = len(tL)
    match_all = 0
    match_all_with_truth = 0

    match_0 = match_1 = 0
    total_0 = total_1 = 0

    num_models = len(model_preds)

    for preds in zip(*model_preds, tL):
        *model_labels, truth = preds

        if truth == '0':
            total_0 += 1
        else:
            total_1 += 1

        # Check if all model predictions agree
        if all(label == model_labels[0] for label in model_labels):
            match_all += 1
            if truth == '0':
                match_0 += 1
            else:
                match_1 += 1
                
            if model_labels[0] == truth:
                match_all_with_truth += 1


    overall_consistency = (match_all / total) * 100 if total > 0 else 0
    overall_truth_consistency = (match_all_with_truth / total) * 100 if total > 0 else 0
    consistency_0 = (match_0 / total_0) * 100 if total_0 > 0 else 0
    consistency_1 = (match_1 / total_1) * 100 if total_1 > 0 else 0

    return {
        "all_model_consistency": overall_consistency,
        "all_model_consistency_against_truth": overall_truth_consistency,
        "conistency_of_label_0_samples": consistency_0,
        "conistency_of_label_1_samples": consistency_1
    }

print("### Exact consistency (2 Models)")
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], mistral_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], mistral_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [vic_df['llava16_label'], mistral_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], idefics_df['idefics2_label']], 
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [mistral_df['llava16_label'], idefics_df['idefics2_label']], 
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [vic_df['llava16_label'], idefics_df['idefics2_label']], 
    ivl_df['label']
))

print("### Exact consistency (3 Models)")
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], mistral_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], mistral_df['llava16_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], mistral_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], mistral_df['llava16_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], mistral_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [llama_df['llama_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [mistral_df['llava16_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

print("### Exact consistency (4 Models)")
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], mistral_df['llava16_label'], vic_df['llava16_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], mistral_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], mistral_df['llava16_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

print(check_consistencyN_with_truth(
    [llama_df['llama_label'], mistral_df['llava16_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

print("### Exact consistency (5 Models)")
print(check_consistencyN_with_truth(
    [ivl_df['internvl_label'], llama_df['llama_label'], mistral_df['llava16_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

### Exact consistency (2 Models)
{'all_model_consistency': 78.74636778746368, 'all_model_consistency_against_truth': 64.00996264009963, 'conistency_of_label_0_samples': 81.04956268221575, 'conistency_of_label_1_samples': 75.69913211186113}
{'all_model_consistency': 75.79908675799086, 'all_model_consistency_against_truth': 60.68908260689082, 'conistency_of_label_0_samples': 79.1545189504373, 'conistency_of_label_1_samples': 71.35969141755064}
{'all_model_consistency': 81.7766708177667, 'all_model_consistency_against_truth': 65.17227065172271, 'conistency_of_label_0_samples': 81.26822157434403, 'conistency_of_label_1_samples': 82.44937319189971}
{'all_model_consistency': 77.1274387712744, 'all_model_consistency_against_truth': 62.26650062266501, 'conistency_of_label_0_samples': 81.92419825072886, 'conistency_of_label_1_samples': 70.7810993249759}
{'all_model_consistency': 77.04441677044417, 'all_model_consistency_against_truth': 63.71938563719386, 'conistency_of_label_0_samples': 77.7696

In [34]:
def majority_vote_accuracyN(model_preds, tL):
    """
    model_preds: List of lists, where each inner list is the predictions from one model (strings or ints '0'/'1')
    tL: Ground truth labels (list of strings or ints: '0' or '1')
    """
    correct = 0
    correct_0 = 0
    correct_1 = 0
    total_0 = 0
    total_1 = 0

    num_models = len(model_preds)
    
    for preds in zip(*model_preds, tL):
        *model_labels, truth = preds
        model_labels = list(map(int, model_labels))
        truth = int(truth)

        vote_sum = sum(model_labels)
        vote = 1 if vote_sum >= (num_models / 2) else 0

        if truth == 0:
            total_0 += 1
            if vote == truth:
                correct_0 += 1
        else:
            total_1 += 1
            if vote == truth:
                correct_1 += 1

        if vote == truth:
            correct += 1

    total = len(tL)
    overall_accuracy = (correct / total) * 100 if total > 0 else 0
    class_0_accuracy = (correct_0 / total_0) * 100 if total_0 > 0 else 0
    class_1_accuracy = (correct_1 / total_1) * 100 if total_1 > 0 else 0

    return {
        "overall_accuracy": overall_accuracy,
        "class_0_accuracy": class_0_accuracy,
        "class_1_accuracy": class_1_accuracy
    }

print("### Majority Vote Accuracy (5 Models)")
print(majority_vote_accuracyN(
    [ivl_df['internvl_label'], llama_df['llama_label'], mistral_df['llava16_label'], vic_df['llava16_label'], idefics_df['idefics2_label']],
    ivl_df['label']
))

### Majority Vote Accuracy (5 Models)
{'overall_accuracy': 77.91614777916148, 'class_0_accuracy': 75.29154518950438, 'class_1_accuracy': 81.38862102217936}
