In [3]:
import os, sys
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
import csv

warnings.filterwarnings('ignore')

In [4]:
# Step up one directory so that the relative data paths used below resolve.
# NOTE: this is a relative move - re-running this cell climbs one more level each time.
os.chdir(os.pardir)

In [5]:
# Folder of the experiment to summarize, given relative to the current working directory.
experiment_folder = 'data/results/coco_small_similarity_termination'

In [6]:
# Collect the per-prompt result folders found directly inside `experiment_folder`.
#
# os.listdir() returns names in arbitrary order, so the entries are sorted first:
# purely numeric names by their integer value, everything else alphabetically
# after them. This keeps the row order of anything built from `dirs` stable
# across runs and machines.
def _entry_sort_key(name):
    """Sort key: numeric names first (by integer value), then other names alphabetically."""
    if name.isdecimal():
        return (0, int(name), name)
    return (1, 0, name)


dirs = []
for entry_name in tqdm(sorted(os.listdir(experiment_folder), key=_entry_sort_key)):
    # Not a prompt folder (presumably the run configuration); excluded explicitly.
    if entry_name == 'hyperparameters.json':
        continue
    prompt_folder = os.path.join(experiment_folder, entry_name)
    # Ignore any other stray file (e.g. OS metadata files): only directories can
    # hold the per-prompt CSV files that are read later.
    if not os.path.isdir(prompt_folder):
        continue
    dirs.append(prompt_folder)

100%|██████████| 51/51 [00:00<00:00, 413751.46it/s]


In [21]:
# Build one summary row per prompt folder listed in `dirs`.
#
# Files read from every folder:
#   terminated.csv - ':'-separated rows; the second field of the last well-formed
#                    row is compared with the text 'True' to obtain the flag.
#   prompts.csv    - tab-separated (name, prompt); the first row is taken as the
#                    original prompt and the last row as the optimized prompt.
#   captions.csv   - tab-separated (id, caption); the last row is the caption kept,
#                    and the number of rows is reported as the termination step.
#
# Rows are gathered in a list and turned into a DataFrame once, instead of
# growing the frame cell by cell with `.loc`, which is slow and leaves every
# column with dtype `object`.
result_columns = [
    'prompt_id', 'terminated', 'termination_step',
    'original_prompt', 'optimized_prompt', 'optimized_caption',
]

records = []
for prompt_dir in dirs:
    # Reset the flag for every folder. Without this an empty terminated.csv would
    # silently reuse the value read for the previous prompt.
    terminated = False
    with open(os.path.join(prompt_dir, 'terminated.csv'), newline='') as csv_file:
        for row in csv.reader(csv_file, delimiter=':'):
            # Blank or malformed lines carry no flag; skip them instead of
            # failing with an IndexError.
            if len(row) < 2:
                continue
            terminated = row[1].strip() == 'True'

    prompts = pd.read_csv(
        os.path.join(prompt_dir, 'prompts.csv'),
        header=None, sep='\t', names=['name', 'prompt'],
    )
    captions = pd.read_csv(
        os.path.join(prompt_dir, 'captions.csv'),
        header=None, sep='\t', names=['id', 'caption'],
    )

    records.append({
        # The folder name doubles as the prompt id; basename() works with any
        # path separator, unlike splitting on '/'.
        'prompt_id': os.path.basename(os.path.normpath(prompt_dir)),
        'terminated': terminated,
        # -1 marks runs that never met the termination criterion.
        'termination_step': len(captions) if terminated else -1,
        'original_prompt': prompts.iloc[0]['prompt'],
        'optimized_prompt': prompts.iloc[-1]['prompt'],
        'optimized_caption': captions.iloc[-1]['caption'],
    })

# Passing `columns` keeps the expected layout even when `dirs` is empty.
results_df = pd.DataFrame(records, columns=result_columns)

In [22]:
# Display the per-prompt summary table assembled in the previous cell.
results_df

Unnamed: 0,prompt_id,terminated,termination_step,original_prompt,optimized_prompt,optimized_caption
0,0,True,2,A woman walking across a street holding a pink...,A woman strolling on a rainy street under her ...,woman walking down a wet sidewalk with a pink ...
1,1,False,-1,Cat sitting next to remote control on small co...,A black and white cat sitting next to a silver...,there is a cat that is sitting next to a mouse
2,2,True,2,A couple of children sitting down on a white w...,Two kids sitting on a white wall and smiling.,two young children sitting on the ground next ...
3,3,True,3,A white refrigerator and a counter in a room.,A spacious kitchen with a white refrigerator a...,"there is a kitchen with a stove, refrigerator,..."
4,4,True,2,a close up of a cat on a rug on the ground,A detailed shot of a cat lounging on a cozy ru...,there is a cat that is laying down on a rug
5,5,True,1,A herd of elephants walking down a dirt road.,A herd of elephants walking down a dirt road.,there are many elephants crossing the road in ...
6,6,True,1,A man holding a slice of pizza while wearing g...,A man holding a slice of pizza while wearing g...,arafed man with glasses holding a slice of piz...
7,7,True,2,A large sleigh bed in a hotel room.,"A luxurious, wooden sleigh bed in a spacious h...",there is a bed with a white and brown comforte...
8,8,False,-1,a person holding an open umbrella with words w...,A close-up of a person's hand holding an umbre...,there is a blue umbrella with a white handle o...
9,9,False,-1,An orange lying next to a green utensil.,An orange resting beside a green spoon.,there is a plate with a leaf and an orange on it


In [23]:
# Persist the summary table as CSV in the current working directory.
# The DataFrame index is written as the first (unnamed) column.
summary_csv_path = 'coco_small_similarity_termination.csv'
results_df.to_csv(summary_csv_path)

## Average CLIP Score

In [45]:
# Average every column of `result_df` over all rows that share the same value in
# the second index level (level=1).
# NOTE(review): `result_df` is not defined in any visible cell (the table built
# earlier in this notebook is `results_df`) - presumably it comes from a cell that
# is no longer shown; confirm this cell still runs after Restart & Run All.
avg_clipscore_results = result_df.groupby(level=1).mean()
avg_clipscore_results

Unnamed: 0_level_0,score
image_id,Unnamed: 1_level_1
0,33.474657
1,30.980562
2,30.850779
3,30.695939
4,30.562657
5,30.710908


## Optimized Image vs. Original Image Score
Count the number of occurrences, per optimization step, where the similarity score between the image generated from the optimized prompt and the user prompt is higher than the score of the image generated from the original user prompt.
This metric measures whether optimizing the prompt using our model yields better results.

In [46]:
# Scores at step 0 (second index level == 0) serve as the baseline for every comparison below.
original_scores = result_df.loc(axis=0)[:, 0]["score"].values

for optimization_step in range(1, optimization_step_num + 1):
    optimization_scores = result_df.loc(axis=0)[:, optimization_step]["score"].values
    # Position-wise comparison: both selections must list the prompts in the same order.
    beats_original = optimization_scores > original_scores
    improvement_count = np.count_nonzero(beats_original)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts improved")


Optimization step 1: 30.9% of prompts improved
Optimization step 2: 29.5% of prompts improved
Optimization step 3: 30.1% of prompts improved
Optimization step 4: 30.6% of prompts improved
Optimization step 5: 30.7% of prompts improved


## Current Optimized Image vs. Previous Generations Score
Count the number of occurrences where the image generated at the current optimization step achieves a higher similarity with the original user prompt than all previous generations.
This metric measures whether running the optimization loop several times improves results and if/when convergence is reached.

In [47]:
# For every optimization step, report the share of prompts whose score at that
# step is strictly higher than the best score reached at any earlier step
# (step 0 included).
for optimization_step in range(1, optimization_step_num+1):
    # Best score per prompt (first index level) over steps 0 .. optimization_step-1.
    # `Series.max(level=0)` was removed in pandas 2.0; an explicit groupby is the
    # supported spelling. `sort=False` keeps the groups in their order of
    # appearance, which is what the removed `level=` reduction did.
    previous_scores = (
        result_df.loc(axis=0)[:, :optimization_step-1]["score"]
        .groupby(level=0, sort=False)
        .max()
        .values
    )
    optimization_scores = result_df.loc(axis=0)[:,optimization_step]["score"].values
    # Position-wise comparison: both arrays must list the prompts in the same order.
    improvement_count = np.sum(optimization_scores > previous_scores)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts improved compared to previous generation steps")

Optimization step 1: 30.9% of prompts improved compared to previous generation steps
Optimization step 2: 20.6% of prompts improved compared to previous generation steps
Optimization step 3: 17.6% of prompts improved compared to previous generation steps
Optimization step 4: 14.6% of prompts improved compared to previous generation steps
Optimization step 5: 11.9% of prompts improved compared to previous generation steps


## Optimized Image vs. All Other Generations Score
Count the number of occurrences where the image generated at a given optimization step achieves the highest similarity with the original user prompt compared to all other generated images.
This metric measures which optimization step tends to yield the most fitting images.

In [51]:
# For every optimization step, report the share of prompts whose score at that
# step is strictly higher than the score at every other step (earlier and later).
for optimization_step in range(1, optimization_step_num+1):
    # Best score per prompt (first index level) over all earlier steps.
    # `Series.max(level=0)` was removed in pandas 2.0; an explicit groupby is the
    # supported spelling. `sort=False` keeps the groups in their order of
    # appearance, which is what the removed `level=` reduction did.
    previous_scores = (
        result_df.loc(axis=0)[:, :optimization_step-1]["score"]
        .groupby(level=0, sort=False)
        .max()
        .values
    )
    if optimization_step == optimization_step_num:
        # No later steps exist. -inf is neutral for the maximum below, whereas a
        # zero placeholder would only be correct for non-negative scores.
        next_scores = np.full(prompt_num, -np.inf)
    else:
        # Best score per prompt over all later steps.
        next_scores = (
            result_df.loc(axis=0)[:, optimization_step+1:]["score"]
            .groupby(level=0, sort=False)
            .max()
            .values
        )
    # Best competing score per prompt, i.e. the maximum over every other step.
    combined_max = np.maximum(previous_scores, next_scores)
    optimization_scores = result_df.loc(axis=0)[:,optimization_step]["score"].values
    # Strict comparison: a tie with another step does not count as "best".
    improvement_count = np.sum(optimization_scores > combined_max)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts achieved best overall performance")

Optimization step 1: 13.9% of prompts achieved best overall performance
Optimization step 2: 12.1% of prompts achieved best overall performance
Optimization step 3: 11.9% of prompts achieved best overall performance
Optimization step 4: 12.5% of prompts achieved best overall performance
Optimization step 5: 11.9% of prompts achieved best overall performance
