# AAAI EDGeS: Accuracy of automated annotations

Notebook to assess the accuracy of automated annotation routines. Automated annotations were made with algorithms as available in v0.0.5.

In [1]:
import pandas as pd
import os
import random
import sklearn as sk
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, f1_score
import itertools

## Import data

In [2]:
# Tasks for which an automated annotation routine exists; used to filter the
# manual annotations. Each task is listed exactly once.
automated_tasks = [
    'Single object',
    'Multiple objects',
    'Conditional generation',
    'Simple arithmetic',
    'Negation',
    'Counting',
]

In [3]:
# Manual annotation data
annotations_df = pd.read_csv('annotations_df_230311.csv')
## Subselect relevant prompts: rows whose Task is in automated_tasks and whose
## Auto_assessment flag is True, restricted to the columns needed later.
# The row mask and the column list go into a single .loc call and the result
# is copied, so the column assignment below writes to an independent frame
# rather than to a chained-indexing slice (which can raise
# SettingWithCopyWarning).
sub_annotations_df = annotations_df.loc[
    (annotations_df['Task'].isin(automated_tasks))&
    (annotations_df['Auto_assessment']==True),
    ['Task','Algorithm','Prompt_no','annotation']].copy()
# Integer-typed prompt number
sub_annotations_df["Prompt_int"]=sub_annotations_df.Prompt_no.astype('int')

In [4]:
# Automated annotation data, with an integer-typed copy of the prompt number
# stored as "Prompt_int"
imported_automated_results = pd.read_csv('automated_annotations.csv').assign(
    Prompt_int=lambda results: results.Prompt_no.astype('int')
)

In [5]:
# Distinct tasks present in the automated annotation results
imported_automated_results['Task'].unique()

array(['Single object', 'Negation', 'Simple arithmetic',
       'Conditional generation', 'Counting', 'Multiple objects'],
      dtype=object)

In [6]:
# Pair every automated score with the manual annotation that shares the same
# prompt, algorithm and task; the inner join keeps only rows found in both tables.
merged_annotations = pd.merge(
    imported_automated_results,
    sub_annotations_df,
    how='inner',
    on=['Prompt_int','Algorithm','Task'],
)
merged_annotations

Unnamed: 0,File_name,Prompt_no_x,Task,Score,Dashboard_version,Algorithm,Prompt_int,Prompt_no_y,annotation
0,p2047_1_v0.0.5_pd230208.png,2047,Single object,True,v0.0.5_pd230228,Dalle,2047,2047,True
1,p1917_1_v0.0.5_pd230208.png,1917,Single object,True,v0.0.5_pd230228,Dalle,1917,1917,True
2,p2029_1_v0.0.5_pd230208.png,2029,Single object,True,v0.0.5_pd230228,Dalle,2029,2029,True
3,p1979_1_v0.0.5_pd230208.png,1979,Single object,True,v0.0.5_pd230228,Dalle,1979,1979,True
4,p2216_1_v0.0.5_pd230208.png,2216,Single object,True,v0.0.5_pd230228,Dalle,2216,2216,True
...,...,...,...,...,...,...,...,...,...
3855,p1296_1_v0.0.5_pd230208.png,1296,Multiple objects,False,v0.0.5_pd230228,SD1.5,1296,1296,False
3856,p1297_1_v0.0.5_pd230208.png,1297,Multiple objects,True,v0.0.5_pd230228,SD1.5,1297,1297,False
3857,p1298_1_v0.0.5_pd230208.png,1298,Multiple objects,False,v0.0.5_pd230228,SD1.5,1298,1298,False
3858,p1299_1_v0.0.5_pd230208.png,1299,Multiple objects,True,v0.0.5_pd230228,SD1.5,1299,1299,False


## Evaluate accuracy

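We evaluate the accuracy with F1 scores (https://en.wikipedia.org/wiki/F-score), treating the human annotation as ground truth and the automated score as the prediction. "F1: True" is the F1 score with True (image judged correct) as the positive class, and "F1: False" is the F1 score with False as the positive class. Note that chance level is 0.5 for all tasks, but that none of the generated images for the simple arithmetic task were annotated as correct, so "F1: True" is necessarily 0 for that task.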

We see that, overall, the automated algorithms do a mediocre job when compared to the judgements of human annotators. They seem most reliable for "Conditional generation". Many of these numbers could be analysed in more detail. One example is the difference between F1 True and F1 False for "Single object". The CLIP-based algorithm currently used judges many images to show the correct object even when they do not, and as such agrees with the generative algorithms but not with the human annotators. This is interesting because many of these generative algorithms are evaluated using CLIP scores, so the biases and mistakes of CLIP probably transfer over to the generative algorithms.

In [7]:
# Compute, for every (task, algorithm) pair, the F1 score of the automated
# Score against the manual annotation, once with True and once with False
# treated as the positive class.
res_df_list = []
# Named so that the builtin `iter` is not shadowed for the rest of the notebook.
task_algorithm_pairs = itertools.product(
    merged_annotations.Task.unique(),
    merged_annotations.Algorithm.unique(),
)
for task, algorithm in task_algorithm_pairs:
    pair_df = merged_annotations.loc[
        (merged_annotations['Task']==task)&
        (merged_annotations['Algorithm']==algorithm)
    ]
    if pair_df.empty:
        # The product enumerates every combination, including ones for which
        # no merged rows exist; F1 is undefined there, so no row is recorded.
        continue

    res_row = {'Task':task,'Algorithm':algorithm,
               'F1: True':f1_score(pair_df.annotation,pair_df.Score,pos_label=1),
               'F1: False':f1_score(pair_df.annotation,pair_df.Score,pos_label=0)}
    res_df_list.append(pd.Series(res_row))

In [8]:
# One row per (task, algorithm) result. Building the frame directly from the
# list of records lets pandas infer numeric dtypes for the F1 columns;
# concatenating the Series as columns and transposing left every column,
# including the scores, as generic object dtype.
res_df = pd.DataFrame(res_df_list)

In [9]:
# Show the F1 scores per task and algorithm
res_df

Unnamed: 0,Task,Algorithm,F1: True,F1: False
0,Single object,Dalle,0.984833,0.0
1,Single object,SD2.1,0.807487,0.4375
2,Single object,MJ,0.964876,0.055556
3,Single object,SD1.5,0.869364,0.18705
4,Negation,Dalle,0.337349,0.529915
5,Negation,SD2.1,0.512821,0.688525
6,Negation,MJ,0.162162,0.809816
7,Negation,SD1.5,0.441176,0.712121
8,Simple arithmetic,Dalle,0.0,0.979592
9,Simple arithmetic,SD2.1,0.0,0.979592


In [10]:
# Mean of both F1 variants per task, i.e. averaged over the result rows
# (one per algorithm) that belong to that task
(
    res_df
    .groupby('Task')[['F1: True','F1: False']]
    .mean()
)

Unnamed: 0_level_0,F1: True,F1: False
Task,Unnamed: 1_level_1,Unnamed: 2_level_1
Conditional generation,0.678328,0.800183
Counting,0.495452,0.8802
Multiple objects,0.431065,0.638886
Negation,0.363377,0.685094
Simple arithmetic,0.0,0.97272
Single object,0.90664,0.170026
