### Prepare data for error analysis

In [1]:
import torchvision
from copy import copy
import numpy as np
from collections import defaultdict
import pickle
import os
import os.path as osp
import pickle

In [2]:
# cifar10_data = torchvision.datasets.Food101('/export/share/projects/mcai/COCO-Counterfactuals/datasets/food101/', download=True)

In [3]:
# For Caltech101
caltech101_data = torchvision.datasets.Caltech101('/export/share/projects/mcai/COCO-Counterfactuals/datasets/caltech101/', download=True)
caltech101_labels = copy(caltech101_data.categories)
caltech101_labels.remove('Faces_easy')

Files already downloaded and verified


In [40]:
# For Caltech256
caltech256_data = torchvision.datasets.Caltech256('/export/share/projects/mcai/COCO-Counterfactuals/datasets/caltech256/', download=True)
caltech256_labels = copy(caltech256_data.categories)

sents_256 = []
specials = {'billiards':'billiard',
            'binoculars' : 'binocular',
            'bonsai-101' : 'bonsai',
            'brain-101' : 'brain',
            'buddha-101' : 'buddha',
            'chandelier-101' : 'chandelier',
            'chopsticks' : 'chopstick',
            'crab-101' : 'crab',
            'dolphin-101' : 'dolphin',
            'electric-guitar-101' : 'electric-guitar',
            'elephant-101' : 'elephant',
            'ewer-101' : 'ewer',
            'eyeglasses' : 'eyeglass',
            'fireworks' : 'firework',
            'grand-piano-101' : 'grand-piano',
            'hawksbill-101' : 'hawksbill',
            'helicopter-101' : 'helicopter',
            'ibis-101' : 'ibis',
            'kangaroo-101' : 'kangaroo',
            'ketch-101' : 'ketch',
            'laptop-101' : 'laptop',
            'leopards-101' : 'leopard',
            'llama-101' : 'llama',
            'menorah-101' : 'menorah',
            'motorbikes-101' : 'motorbike',
            'mussels' : 'mussel',
            'revolver-101' : 'revolver',
            'scorpion-101' : 'scorpion',
            'socks' : 'sock',
            'starfish-101' : 'starfish',
            'sunflower-101' : 'sunflower',
            'triceratops' : 'triceratop',
            'trilobite-101' : 'trilobite',
            'umbrella-101' : 'umbrella',
            'watch-101' : 'watch',
            'airplanes-101' : 'airplane',
            'car-side-101' : 'car-side',
            'faces-easy-101' : 'face',
           }
for i in range(len(caltech256_labels)):
    tmp = caltech256_labels[i].lower()
    #remove xxx from xxx.label
    tmp = tmp.split('.')[-1]
    #remove plural if needed
    if tmp in specials.keys():
        tmp = specials[tmp]
    #replace '-' with space
    if tmp.lower() != 't-shirt':
        tmp = tmp.replace('-', ' ')
    
    sents_256.append(tmp)
caltech256_labels = sents_256

Files already downloaded and verified


In [5]:
# For food101
food101_data = torchvision.datasets.Food101('/export/share/projects/mcai/COCO-Counterfactuals/datasets/food101/', split='test', download=True)
food101_labels = copy(food101_data.classes)

In [6]:
# For imagenet
imagenet_data = torchvision.datasets.ImageNet(root='/export/share/projects/mcai/COCO-Counterfactuals/datasets/imagenet/', split='val')
imagenet_labels = copy(imagenet_data.classes)
sents = []

for i in range(len(imagenet_labels)):
    tmp = imagenet_labels[i][0].lower()   
    sents.append(tmp)
imagenet_labels = sents

In [8]:
# imagenet_labels

In [9]:
# For Cifar10
cifar10_labels = {0 : "airplane", 1: "automobile", 2: "bird", 3 : "cat", 4: "deer", 5: "dog", 6 : "frog", 7 : "horse", 8 : "ship", 9 : "truck"}
# For Cifar100
with open('/export/share/projects/mcai/COCO-Counterfactuals/datasets/cifar100/cifar-100-python/meta', 'rb') as handle:
    cifar100_labels = handle.read()
cifar100_labels = pickle.loads(cifar100_labels)
cifar100_labels = cifar100_labels['fine_label_names']

In [45]:
def error_analyze_per_class(ground_truth_path, prediction_path, labels):
    ground_truth = np.load(ground_truth_path)
    prediction = np.load(prediction_path)
    counter_class = defaultdict(int)
    counter_correct_predict = defaultdict(int)
    for i, truth in enumerate(ground_truth):
        counter_class[truth] += 1
        if truth == prediction[i]:
            counter_correct_predict[truth] += 1
    accuracy_rate_per_class = {}
    for k in counter_class.keys():
        accuracy_rate_per_class[labels[k]] = {'accuracy' : counter_correct_predict[k] / counter_class[k], 'total_number_tests' : counter_class[k]}
    return accuracy_rate_per_class

In [11]:
# ground_truth_path = 'base-clip/groundtruth_cifar10.npy'
# prediction_path = 'base-clip/predictions_cifar10.npy'
# accuracy_rate_per_class = error_analyze_per_class(ground_truth_path, prediction_path, cifar10_labels)

In [12]:
# accuracy_rate_per_class

In [13]:
# with open(osp.join('.', 'test1.pkl'), 'wb') as f:
#     pickle.dump(accuracy_rate_per_class, f)

In [14]:
# with open('base-clip/accuracy_cifar10.pkl', 'rb') as f:
#     accuracy_dict = pickle.load(f)
# accuracy_dict

In [43]:
model_paths = [ 'base-clip', 
               'CLIP-finetune-on-coco-and-coco_cfs-all-1',
               'CLIP-finetune-on-coco-and-coco_cfs-base-1',
               'CLIP-finetune-on-coco-and-coco_cfs-medium-1',
               'CLIP-finetune-on-mscoco-1']
benchmarks = [
    # 'cifar10',
    #           'cifar100',
    #           'food101',
    #           'caltech101',
              'caltech256',
              # 'imagenet',
]
labels = [
    # cifar10_labels,
    #       cifar100_labels,
    #       food101_labels,
    #       caltech101_labels,
          caltech256_labels,
          # imagenet_labels,
]


In [46]:
for m in model_paths:
    for i, d in enumerate(benchmarks):
        ground_truth_path = osp.join(m, 'groundtruth_' + d + '.npy')
        prediction_path = osp.join(m, 'predictions_' + d + '.npy')
        label = labels[i]
        accuracy_rate_per_class = error_analyze_per_class(ground_truth_path, prediction_path, label)
        with open(osp.join(m, 'accuracy_' + d + '.pkl'), 'wb') as f:
            pickle.dump(accuracy_rate_per_class, f)

In [21]:
#for loading accuracy
all_accuracy = {}
for m in model_paths:
    m_accuracy = {}
    for i, d in enumerate(benchmarks):
        with open(osp.join(m, 'accuracy_' + d + '.pkl'), 'rb') as f:
            accuracy_rate_per_class = pickle.load(f)
        m_accuracy.update({d : accuracy_rate_per_class})
    all_accuracy[m] = m_accuracy

In [24]:
all_accuracy.keys()

dict_keys(['base-clip', 'CLIP-finetune-on-coco-and-coco_cfs-all-1', 'CLIP-finetune-on-coco-and-coco_cfs-base-1', 'CLIP-finetune-on-coco-and-coco_cfs-medium-1', 'CLIP-finetune-on-mscoco-1'])

In [23]:
all_accuracy['base-clip'].keys()

dict_keys(['cifar10', 'cifar100', 'food101', 'caltech101', 'caltech256', 'imagenet'])

In [None]:
# {'train_runtime' : 147.74,
# 'train_samples_per_second' : 23.915,
# 'train_steps_per_second' : 0.188,
# 'train_loss' : 1.4744, 
# 'memory_allocated (GB)' : 31.54,
# 'max_memory_allocated (GB)' : 78.2 }

### Error Ananlysis

In [1]:
import pandas as pd
import os
import os.path as osp
import numpy
import pickle

In [2]:
# Load frequency_replaced_words.csv
# df_freq = pd.read_csv('../../CounterfactualCaptionGeneration/frequency_replaced_words.csv')
df_freq = pd.read_csv('../../CounterfactualCaptionGeneration/frequency_used_counterfactual_words.csv')

In [3]:
df_freq

Unnamed: 0,counterfactual_stem_word,count
0,boy,777
1,girl,568
2,man,471
3,car,358
4,men,324
...,...,...
1592,corona,1
1593,rusti,1
1594,saddl,1
1595,core,1


In [4]:
#load all accuracy
# at COCO-Counterfactuals/coco-counterfactuals-src/Evaluation/CLIP

model_paths = [ 'base-clip', 
               'CLIP-finetune-on-coco-and-coco_cfs-all-1',
               'CLIP-finetune-on-coco-and-coco_cfs-base-1',
               'CLIP-finetune-on-coco-and-coco_cfs-medium-1',
               'CLIP-finetune-on-mscoco-1']

benchmarks = [
    'cifar10',
              'cifar100',
              'food101',
              'caltech101',
              'caltech256',
              'imagenet']

#for loading accuracy
all_accuracy = {}
for m in model_paths:
    m_accuracy = {}
    for i, d in enumerate(benchmarks):
        with open(osp.join(m, 'accuracy_' + d + '.pkl'), 'rb') as f:
            accuracy_rate_per_class = pickle.load(f)
        m_accuracy.update({d : accuracy_rate_per_class})
    all_accuracy[m] = m_accuracy

In [5]:
all_accuracy.keys()

dict_keys(['base-clip', 'CLIP-finetune-on-coco-and-coco_cfs-all-1', 'CLIP-finetune-on-coco-and-coco_cfs-base-1', 'CLIP-finetune-on-coco-and-coco_cfs-medium-1', 'CLIP-finetune-on-mscoco-1'])

In [6]:
import nltk
p = nltk.PorterStemmer()

In [7]:
datasets = ['cifar10', 'cifar100', 'caltech101', 'caltech256', 'food101', 'imagenet']
models = ['CLIP-finetune-on-coco-and-coco_cfs-base-1', 'CLIP-finetune-on-coco-and-coco_cfs-medium-1', 'CLIP-finetune-on-coco-and-coco_cfs-all-1']

In [8]:
def error_analysis(dataset, model, df_freq_per_counter_word):
    all_accuracy_baseline = all_accuracy['base-clip']
    all_accuracy_baseline_dataset = all_accuracy_baseline[dataset]
    all_accuracy_cfs_model = all_accuracy[model]
    all_accuracy_cfs_model_dataset = all_accuracy_cfs_model[dataset]
    changes = [all_accuracy_cfs_model_dataset[k]['accuracy'] - all_accuracy_baseline_dataset[k]['accuracy'] for k in all_accuracy_baseline_dataset.keys()]
    changes_df = pd.DataFrame({'category':all_accuracy_baseline_dataset.keys(), 'changed_from_baseline' : changes})
    changes_df = changes_df.sort_values(by=['changed_from_baseline'], ascending=[False])
    category_list = changes_df['category'].to_list()
    category_list = [p.stem(i) for i in category_list]
    frequency_per_category = df_freq_per_counter_word[df_freq_per_counter_word['counterfactual_stem_word'].isin(category_list)]
    frequency = [ frequency_per_category.loc[frequency_per_category['counterfactual_stem_word'] == i]['count'].iloc[0] if 
                 len(frequency_per_category.loc[frequency_per_category['counterfactual_stem_word'] == i]) > 0 else 0
                 for i in category_list]
    changes_df['frequency'] = frequency
    return changes_df

In [25]:
# changes_df_cifar10_base = error_analysis(dataset, model, df_freq_per_counter_word)
# changes_df_cifar10_base.to_csv(f"ErrorAnalysisResults/accuracy_changes_with_frequency_{dataset}_{model}.csv", header=True, index=False)

In [10]:
for dataset in datasets:
    for m in models:
        changes_df_model = error_analysis(dataset, m, df_freq)
        changes_df_model.to_csv(f"ErrorAnalysisResults/accuracy_changes_with_frequency_{dataset}_{m}.csv", header=True, index=False)

In [11]:
changes_df_model.frequency.sum()

887

In [14]:
dss = []
sum = []
total_appearances = []
appearance_percentages = []
average = []
for d in datasets:
    df = pd.read_csv(F"ErrorAnalysisResults/accuracy_changes_with_frequency_{d}_{m}.csv")
    dss.append(d)
    sum.append(df.frequency.sum())
    df['is_pos'] = [1 if i > 0 else 0 for i in df['frequency'].to_list()]
    total_appearances.append(df.is_pos.sum())
    average.append(df.frequency.sum() / len(df['frequency'].to_list()))
    appearance_percentages.append(100.0 * df.is_pos.sum() / len(df['frequency'].to_list()))
sum_freq = pd.DataFrame({'benchmark' : dss, 'sum_frequency' : sum, 'sum_frequency/len(categories)' : average, 'total_appearance_in_coco_cfs':total_appearances, 'appearance_percentages (%)' :appearance_percentages})

In [15]:
sum_freq

Unnamed: 0,benchmark,sum_frequency,sum_frequency/len(categories),total_appearance_in_coco_cfs,appearance_percentages (%)
0,cifar10,398,39.8,9,90.0
1,cifar100,3446,34.46,55,55.0
2,caltech101,354,3.54,24,24.0
3,caltech256,744,2.894942,48,18.677043
4,food101,28,0.277228,3,2.970297
5,imagenet,887,0.889669,105,10.531595


In [16]:
sum_freq.to_csv(f"ErrorAnalysisResults/error-analysis-all-benchmarks.csv", header=True, index=False)

#### Error Analysis on Human Annotation

In [1]:
import re
import nltk
import pandas as pd
p = nltk.PorterStemmer()
regex = re.compile('[^a-zA-Z ]')

In [2]:
human_annotation_df = pd.read_csv('./ErrorAnalysisResults/captions_with_human_labels.csv')

In [3]:
human_annotation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34820 entries, 0 to 34819
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        34820 non-null  int64 
 1   index             34820 non-null  int64 
 2   caption_1         34820 non-null  object
 3   caption_2         34820 non-null  object
 4   filename          34820 non-null  object
 5   type              34820 non-null  object
 6   image_id          34820 non-null  int64 
 7   source_file       34820 non-null  object
 8   destination_file  34820 non-null  object
 9   label_1           34820 non-null  object
 10  label_2           3482 non-null   object
 11  label_3           3482 non-null   object
 12  label             34820 non-null  object
 13  correct           34820 non-null  object
dtypes: int64(3), object(11)
memory usage: 3.7+ MB


In [4]:
incorrect = human_annotation_df[(human_annotation_df['correct'] != 'correct')]

In [5]:
incorrect.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9338 entries, 1 to 34819
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        9338 non-null   int64 
 1   index             9338 non-null   int64 
 2   caption_1         9338 non-null   object
 3   caption_2         9338 non-null   object
 4   filename          9338 non-null   object
 5   type              9338 non-null   object
 6   image_id          9338 non-null   int64 
 7   source_file       9338 non-null   object
 8   destination_file  9338 non-null   object
 9   label_1           9338 non-null   object
 10  label_2           747 non-null    object
 11  label_3           747 non-null    object
 12  label             9338 non-null   object
 13  correct           9338 non-null   object
dtypes: int64(3), object(11)
memory usage: 1.1+ MB


In [6]:
cap1 = incorrect['caption_1'].to_list()
cap2 = incorrect['caption_2'].to_list()

In [7]:
def extract_replaced_words(original, counter):
    original = regex.sub('', original)
    counter = regex.sub('', counter)
    
    original=original.split()
    counter=counter.split()
    replaced_word = None
    being_replaced_word = None
    i, j = -1, -1
    for i, w in enumerate(original):
        if w not in counter:
            being_replaced_word = w
            break
    for j, w in enumerate(counter):
        if w not in original:
            replaced_word = w
            break
    if replaced_word == None and being_replaced_word != None:
        replaced_word = original[j]
    if being_replaced_word == None and replaced_word != None:
        being_replaced_word = counter[i]
    if being_replaced_word == None and replaced_word == None:
        print('Error', original, counter)
    # if being_replaced_word != None and being_replaced_word[-1] == '.':
    #     being_replaced_word = being_replaced_word[:-1]
    # if replaced_word != None and replaced_word[-1] == '.':
    #     replaced_word = replaced_word[:-1]
    return p.stem(being_replaced_word), p.stem(replaced_word)

In [8]:
being_replaced_words = []
replaced_words = []
wrong = []
for i, origin in enumerate(cap1):
    counter = cap2[i]
    being_replaced_word, replaced_word = extract_replaced_words(origin, counter)
    if being_replaced_word != None and replaced_word != None:
        being_replaced_words.append(being_replaced_word)
        replaced_words.append(replaced_word)
    else:
        wrong.append(i)

In [9]:
len(wrong)
# replaced_words

0

In [10]:
incorrect['original_stem_word'] = being_replaced_words
incorrect['counterfactual_stem_word']=replaced_words

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect['original_stem_word'] = being_replaced_words
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  incorrect['counterfactual_stem_word']=replaced_words


In [12]:
incorrect_stats = incorrect.groupby(by=['original_stem_word','counterfactual_stem_word'])['correct'].count().reset_index(name='count')

In [13]:
incorrect_stats = incorrect_stats.sort_values(by=['count'], ascending=[False])

In [14]:
incorrect_stats.head(40)['count'].sum()

1223

In [15]:
incorrect_stats.to_csv('./ErrorAnalysisResults/frequencies_of_human_annotated_non_correct.csv', header=True, index=False)

In [53]:
incorrect_stats_counterfactual_word = incorrect_stats.groupby(by=['counterfactual_word'])['count'].sum().reset_index()

In [56]:
incorrect_stats_counterfactual_word = incorrect_stats_counterfactual_word.sort_values(by=['count'],ascending=[False])

In [61]:
incorrect_stats_counterfactual_word.head(60)['count'].sum()

3999

In [62]:
incorrect_stats_counterfactual_word.head(60)

Unnamed: 0,counterfactual_word,count
131,boy,258
475,girl,255
653,man,241
672,men,162
200,car,161
1242,woman,138
1161,tree,130
1101,tabl,126
546,hous,111
791,peopl,103


In [43]:
import nltk
from nltk.corpus import wordnet

In [66]:
w1 = wordnet.synset('baby.n.01')
w2 = wordnet.synset('child.n.01') # n denotes noun
print(w1.wup_similarity(w2))

0.6


In [28]:
incorrect_stats.iloc[:30]

Unnamed: 0,original_stem_word,counterfactual_stem_word,count
5281,woman,girl,126
2739,man,boy,125
3289,peopl,men,116
3330,person,man,93
3339,person,woman,42
3313,person,boy,37
1374,coupl,group,36
3281,peopl,guy,35
3286,peopl,kid,33
3325,person,girl,33


In [31]:
incorrect_stats['count'].sum()

9338

In [16]:
p.stem('children')

'children'

In [32]:
human_related = ['girl', 'boy', 'man', 'men', 'woman', 'guy', 'kid', 'person', 'peopl', 'child', 'childen', 'coupl', 'group', 'ladi']


In [33]:
human_incorrect_stats = incorrect_stats[(incorrect_stats['original_stem_word'].isin(human_related) )| (incorrect_stats['counterfactual_stem_word'].isin(human_related))]

In [34]:
human_incorrect_stats['count'].sum()

1864

In [35]:
non_human_incorrect_stats = incorrect_stats[(~incorrect_stats['original_stem_word'].isin(human_related) ) & (~incorrect_stats['counterfactual_stem_word'].isin(human_related))]

In [43]:
non_human_incorrect_stats[:30]

Unnamed: 0,original_stem_word,counterfactual_stem_word,count
754,build,hous,23
4929,truck,car,19
4134,sink,stall,16
1510,desk,tabl,15
706,bu,car,15
4092,sign,light,15
2930,motorcycl,car,15
4440,street,highway,15
2518,kitchen,room,14
4838,train,car,14


In [38]:
df1 = pd.read_csv('../../CounterfactualCaptionGeneration/frequency_replaced_words.csv')

In [40]:
df2 = df1[(df1['origin_stem_word'].isin(human_related) )| (df1['counterfactual_stem_word'].isin(human_related))]

In [42]:
df2['count'].sum()

4117