In [43]:
import os, sys
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# CLIPScore

In [44]:
# Location of the tab-separated CLIPScore results table, relative to the
# directory the notebook is executed from.
result_path = "../data/results/default-experiment/results_clipscore.tsv"

# The first two columns become the row MultiIndex: level 0 is used below as the
# prompt identifier and level 1 as the image / optimization-step id.
# Sorting the index right after loading guarantees it is lexsorted, which the
# MultiIndex label-slice lookups (`.loc(axis=0)[:, a:b]`) in later cells need,
# and keeps the per-step score arrays in one consistent prompt order so they
# can be compared position by position.
result_df = pd.read_csv(
    os.path.join(os.getcwd(), result_path), sep="\t", index_col=[0, 1]
).sort_index()

# Number of distinct prompts (unique values of index level 0).
prompt_num = result_df.index.get_level_values(0).unique().shape[0]
# Number of optimization steps: unique level-1 ids minus one, because id 0 is
# treated as the un-optimized baseline by the comparisons further down.
optimization_step_num = result_df.index.get_level_values(1).unique().shape[0] - 1

## Average CLIPScore

In [45]:
# Mean score per optimization step: rows sharing the same level-1 index value
# are pooled across all prompts and averaged.
scores_by_step = result_df.groupby(level=1)
avg_clipscore_results = scores_by_step.mean()
avg_clipscore_results

Unnamed: 0_level_0,score
image_id,Unnamed: 1_level_1
0,33.474657
1,30.980562
2,30.850779
3,30.695939
4,30.562657
5,30.710908


## Optimized Image vs. Original Image Score
Count the number of occurrences per optimization step where the similarity score between the optimized generated image and the user prompt is higher than that of the image generated from the original user prompt.
This metric measures whether optimizing the prompt using our model yields better results.

In [46]:
# Baseline: the score of every row whose level-1 index is 0, i.e. the image
# generated before any optimization step was applied.
original_scores = result_df.loc[pd.IndexSlice[:, 0], "score"].to_numpy()

for optimization_step in range(1, optimization_step_num + 1):
    # Scores at this step; compared with the baseline element by element, so
    # both arrays are expected to list the prompts in the same order.
    step_rows = pd.IndexSlice[:, optimization_step]
    optimization_scores = result_df.loc[step_rows, "score"].to_numpy()

    # Share of prompts whose optimized image strictly beats the baseline image.
    improvement_count = np.count_nonzero(optimization_scores > original_scores)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts improved")


Optimization step 1: 30.9% of prompts improved
Optimization step 2: 29.5% of prompts improved
Optimization step 3: 30.1% of prompts improved
Optimization step 4: 30.6% of prompts improved
Optimization step 5: 30.7% of prompts improved


## Current Optimized Image vs. Previous Generations Score
Count the number of occurrences where the image generated at the current optimization step achieves the highest similarity with the original user prompt compared to all previous generations.
This metric measures whether running the optimization loop several times improves results and if/when convergence is reached.

In [47]:
for optimization_step in range(1, optimization_step_num+1):
    # Best score each prompt reached in any earlier generation. The label slice
    # `:optimization_step-1` is inclusive, so it covers steps 0..step-1.
    # `Series.max(level=0)` was removed in pandas 2.0; grouping on level 0 is the
    # supported equivalent. `sort=False` keeps the prompts in row order so the
    # result lines up position by position with `optimization_scores`.
    previous_scores = (
        result_df.loc(axis=0)[:, :optimization_step-1]["score"]
        .groupby(level=0, sort=False)
        .max()
        .values
    )
    # Scores of the images generated at the current step.
    optimization_scores = result_df.loc(axis=0)[:,optimization_step]["score"].values
    # A prompt counts only when the current step sets a strictly new best.
    improvement_count = np.sum(optimization_scores > previous_scores)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts improved compared to previous generation steps")

Optimization step 1: 30.9% of prompts improved compared to previous generation steps
Optimization step 2: 20.6% of prompts improved compared to previous generation steps
Optimization step 3: 17.6% of prompts improved compared to previous generation steps
Optimization step 4: 14.6% of prompts improved compared to previous generation steps
Optimization step 5: 11.9% of prompts improved compared to previous generation steps


## Best Overall Optimization Step
Count the number of occurrences where the image generated at a given optimization step achieves the highest similarity with the original user prompt compared to all other generated images.
This metric measures which optimization step tends to yield the most fitting images.

In [51]:
for optimization_step in range(1, optimization_step_num+1):
    # Best score per prompt over all earlier generations (steps 0..step-1; the
    # label slice is inclusive). `Series.max(level=0)` was removed in pandas 2.0,
    # so the per-prompt maximum is taken with a level-0 groupby; `sort=False`
    # keeps prompts in row order for the positional comparisons below.
    previous_scores = (
        result_df.loc(axis=0)[:, :optimization_step-1]["score"]
        .groupby(level=0, sort=False)
        .max()
        .values
    )
    if optimization_step == optimization_step_num:
        # No later steps exist. Use -inf as the neutral element of max so this
        # placeholder can never win the comparison (a 0 placeholder would
        # wrongly cap the result whenever a prompt's scores are all negative).
        next_scores = np.full(prompt_num, -np.inf)
    else:
        # Best score per prompt over all later generations (steps step+1..last).
        next_scores = (
            result_df.loc(axis=0)[:, optimization_step+1:]["score"]
            .groupby(level=0, sort=False)
            .max()
            .values
        )
    # Best score per prompt among every generation except the current step.
    combined_max = np.max([previous_scores, next_scores], axis=0)
    optimization_scores = result_df.loc(axis=0)[:,optimization_step]["score"].values
    # Strict comparison: a prompt counts only if this step alone holds the top
    # score; ties with another step are not counted for either step.
    improvement_count = np.sum(optimization_scores > combined_max)
    improvement_percentage = improvement_count / prompt_num
    print(f"Optimization step {optimization_step}: {improvement_percentage*100:.1f}% of prompts achieved best overall performance")

Optimization step 1: 13.9% of prompts achieved best overall performance
Optimization step 2: 12.1% of prompts achieved best overall performance
Optimization step 3: 11.9% of prompts achieved best overall performance
Optimization step 4: 12.5% of prompts achieved best overall performance
Optimization step 5: 11.9% of prompts achieved best overall performance
