# Analyse the Classification performance under Frequency Manipulations. 


In [None]:
# Root folder of the thesis project; the result folders loaded further down
# are located below this path.
WORKDIR = "/home/wustl/Dummy/Wustl/Deepfake/MasterThesis"  # specify WORKDIR here

### Imports

In [None]:

import matplotlib.pyplot as plt
import numpy as np 
import json
import os 
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import seaborn as sns


from paper_utils import get_figsize, configure_matplotlib, create_subplot_mosaic
from calculate_MMD import get_files_from_directory,get_files_from_sub_directory


# Labels passed as the heatmap columns ("Fine-tuned on") in the cells below:
# the ten generator names followed by "coco", "all", "gan" and "diff".
Generators = ["ProGAN", "StyleGAN", "ProjectedGAN", "Diff-StyleGAN2", "Diff-ProjectedGAN", "DDPM", "IDDPM", "ADM", "PNDM", "LDM","coco", "all","gan","diff"]#

# First five entries of the list above; used below as the GAN group.
Gans = ["ProGAN", "StyleGAN", "ProjectedGAN", "Diff-StyleGAN2", "Diff-ProjectedGAN"]
# Next five entries of the list above; used below as the diffusion group.
Diffs = ["DDPM", "IDDPM", "ADM", "PNDM", "LDM"]



Base functions

In [None]:
# Configure matplotlib through the paper_utils helper (presumably it forwards
# `rc` to matplotlib's rcParams — confirm in paper_utils). The overrides put
# the x tick labels at the top instead of the bottom, keep tick marks only at
# the bottom/left, disable constrained layout and use a small save padding.
configure_matplotlib(
    rc={
        "xtick.labelbottom": False,
        "xtick.bottom": True,
        "xtick.labeltop": True,
        "ytick.left": True,
        "ytick.right": False,
        "figure.constrained_layout.use": False,
        "savefig.pad_inches": 0.01,
    }
)


def create_dataframe_ce(files, csv_path="classification_results.csv"):
    """Load step-wise classification results and flatten them into a DataFrame.

    Args:
        files: Mapping ``experiment name -> iterable of JSON file paths``.
            The part of the file name before the first underscore becomes
            ``Encoder``; the part after the last underscore (up to the first
            dot) becomes ``Generator``. Each file must contain a list of
            dicts, each mapping a step key (stored in ``Number``) to a list
            of ``{metric: value}`` dicts.
        csv_path: Destination of the CSV dump of the result. Defaults to the
            previously hard-coded ``classification_results.csv`` in the
            working directory; pass ``None`` to skip writing.

    Returns:
        A DataFrame with one row per (Encoder, Experiment, Generator, Number),
        one column per metric, plus ``tendency_to_overclassify_positive``
        (``1 - test_precision``) and ``tendency_to_overclassify_negative``
        (``1 - test_recall``). If no file could be loaded, an empty frame
        with the identifying and derived columns is returned and nothing is
        written.

    Raises:
        KeyError: If results were loaded but contain no ``test_precision``
            or ``test_recall`` metric.
    """
    experiments = {}
    # Distinct loop name: the original re-used ``files`` and shadowed the
    # parameter while iterating over it.
    for experiment_name, experiment_files in files.items():
        for file in experiment_files:
            name_parts = os.path.basename(file).split("_")
            generator_type = name_parts[-1].split(".")[0]
            model_name = name_parts[0]

            try:
                with open(file, "r") as input_file:  # Assuming JSON file
                    results = json.load(input_file)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and decoding errors.
                print(f"Error reading {file}: {e}")
                continue

            step_metrics = {
                key: {
                    metric: metric_value
                    for metrics in value
                    for metric, metric_value in metrics.items()
                }
                for step in results
                for key, value in step.items()
            }
            # setdefault keeps the first file seen for a given
            # (encoder, experiment, generator) combination.
            experiments.setdefault(model_name, {}).setdefault(
                experiment_name, {}
            ).setdefault(generator_type, step_metrics)

    data = []
    for encoder_name, experiment_dict in experiments.items():
        for experiment_name, generator_dict in experiment_dict.items():
            for generator_type, metrics_dict in generator_dict.items():
                for number, metrics in metrics_dict.items():
                    data.append({
                        'Encoder': encoder_name,
                        'Experiment': experiment_name,
                        'Generator': generator_type,
                        'Number': number,
                        **metrics,
                    })

    if not data:
        # Nothing was loaded: return a well-formed empty table instead of
        # failing with KeyError('test_precision') further down.
        return pd.DataFrame(columns=[
            'Encoder', 'Experiment', 'Generator', 'Number',
            'test_precision', 'test_recall',
            'tendency_to_overclassify_positive',
            'tendency_to_overclassify_negative',
        ])

    # Converting to a DataFrame
    df = pd.DataFrame(data)
    df['tendency_to_overclassify_positive'] = 1 - df['test_precision']
    df['tendency_to_overclassify_negative'] = 1 - df['test_recall']
    if csv_path is not None:
        df.to_csv(csv_path)
    return df

def create_df_test(files, csv_path="classification_results.csv"):
    """Load keyed classification results and flatten them into a DataFrame.

    Unlike ``create_dataframe_ce`` this expects each JSON file to contain a
    single dict mapping a key (stored in ``Number``) to a list of
    ``{metric: value}`` dicts, rather than a list of such dicts.

    Args:
        files: Mapping ``experiment name -> iterable of JSON file paths``.
            The part of the file name before the first underscore becomes
            ``Encoder``; the part after the last underscore (up to the first
            dot) becomes ``Generator``.
        csv_path: Destination of the CSV dump of the result. Defaults to the
            previously hard-coded ``classification_results.csv`` in the
            working directory; pass ``None`` to skip writing.

    Returns:
        A DataFrame with one row per (Encoder, Experiment, Generator, Number),
        one column per metric, plus ``tendency_to_overclassify_positive``
        (``1 - test_precision``) and ``tendency_to_overclassify_negative``
        (``1 - test_recall``). If no file could be loaded, an empty frame
        with the identifying and derived columns is returned and nothing is
        written.

    Raises:
        KeyError: If results were loaded but contain no ``test_precision``
            or ``test_recall`` metric.
    """
    experiments = {}
    # Distinct loop name: the original re-used ``files`` and shadowed the
    # parameter while iterating over it.
    for experiment_name, experiment_files in files.items():
        for file in experiment_files:
            name_parts = os.path.basename(file).split("_")
            generator_type = name_parts[-1].split(".")[0]
            model_name = name_parts[0]

            try:
                with open(file, "r") as input_file:  # Assuming JSON file
                    results = json.load(input_file)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and decoding errors.
                print(f"Error reading {file}: {e}")
                continue

            keyed_metrics = {
                key: {
                    metric: metric_value
                    for metrics in values
                    for metric, metric_value in metrics.items()
                }
                for key, values in results.items()
            }
            # setdefault keeps the first file seen for a given
            # (encoder, experiment, generator) combination.
            experiments.setdefault(model_name, {}).setdefault(
                experiment_name, {}
            ).setdefault(generator_type, keyed_metrics)

    data = []
    for encoder_name, experiment_dict in experiments.items():
        for experiment_name, generator_dict in experiment_dict.items():
            for generator_type, metrics_dict in generator_dict.items():
                for number, metrics in metrics_dict.items():
                    data.append({
                        'Encoder': encoder_name,
                        'Experiment': experiment_name,
                        'Generator': generator_type,
                        'Number': number,
                        **metrics,
                    })

    if not data:
        # Nothing was loaded: return a well-formed empty table instead of
        # failing with KeyError('test_precision') further down.
        return pd.DataFrame(columns=[
            'Encoder', 'Experiment', 'Generator', 'Number',
            'test_precision', 'test_recall',
            'tendency_to_overclassify_positive',
            'tendency_to_overclassify_negative',
        ])

    # Converting to a DataFrame
    df = pd.DataFrame(data)
    df['tendency_to_overclassify_positive'] = 1 - df['test_precision']
    df['tendency_to_overclassify_negative'] = 1 - df['test_recall']
    if csv_path is not None:
        df.to_csv(csv_path)
    return df



In [None]:
# Kept as notebook globals; neither is read in this cell.
experiments = {}
path_names = []

# Build the input folders from WORKDIR so that changing WORKDIR at the top of
# the notebook is sufficient (the paths used to be hard-coded copies of it).
results_root = os.path.join(WORKDIR, "results", "classification")
base_input_folder = os.path.join(results_root, "butt_low_5_ce")
all_files_butt_low_ce = get_files_from_directory(base_input_folder)
base_input_folder = os.path.join(results_root, "butt_low_middle_freq_ce")
all_files_but_low_per = get_files_from_directory(base_input_folder)

# NOTE(review): both loaders dump their result to "classification_results.csv"
# in the working directory by default, so the second call overwrites the
# file written by the first.
df_ce = create_dataframe_ce(all_files_butt_low_ce)
df_per = create_df_test(all_files_but_low_per)


### Compare the two generators.

First we want to understand how the two generators trained on the specific data behave when no filters are applied. We want to see which model handles previously unseen types of generators better under normal conditions, and whether there is any tendency to over-classify images as either fake or real.

In [None]:


import numpy as np
import matplotlib.pyplot as plt

# Keep only the rows whose 'Number' is "no" (presumably the runs without a
# frequency filter — confirm against the result files).
df_filtered = df_ce[df_ce['Number'] == "no"]

# Row labels ("Tested on") of the heatmap, in display order.
Test_Columns = ["ProGAN", "StyleGAN", "ProjectedGAN", "Diff-StyleGAN2", "Diff-ProjectedGAN", "DDPM", "IDDPM", "ADM", "PNDM", "LDM", "coco"]
#Generators = Test_Columns.copy()  # Assuming Generators should be the same as Test_Columns initially

# Encoder keys as they appear in the 'Encoder' column. Only model_a is passed
# to create_heatmap in this cell; model_b is not used here.
model_a = "OpenClip"  # Replace with the actual key for Model A
model_b = "DinoV2"  # Replace with the actual key for Model B

# NOTE(review): not read in this cell — the heatmap call below passes
# "test_pd_01" directly.
metrics = ['test_roc_auc']  # List of metrics

# A single figure with a single axes for the heatmap.
fig, ax = plt.subplots(figsize=(12, 6)) # Adjust the figure size as needed
# Iterate over each metric

def create_heatmap(df, metric, Generators, Test_Columns, model_a, ax):
    """Draw an annotated "tested on" x "fine-tuned on" heatmap for one encoder.

    Args:
        df: Results table with at least the columns ``Encoder``,
            ``Generator``, ``Experiment`` and ``metric``.
        metric: Name of the value column; it is multiplied by 100 on a copy
            of ``df`` before plotting.
        Generators: Experiment names, in display order, used as columns.
        Test_Columns: Generator names, in display order, used as rows.
        model_a: Value of ``Encoder`` whose rows are plotted.
        ax: Matplotlib axes to draw into.

    An ``AVG`` column (row means) and an ``AVG`` row (column means) are
    appended. Both are computed before the table is reindexed to the
    requested labels, so they also cover entries the reindex drops.
    Label combinations without a value are shown as 0. The colour scale is
    fixed to the range 0..100.
    """
    scaled = df.copy()
    scaled[metric] *= 100

    encoder_rows = scaled[scaled['Encoder'] == model_a]
    table = encoder_rows.pivot(index='Generator', columns='Experiment', values=metric)
    table['AVG'] = table.mean(axis=1)
    table.loc['AVG'] = table.mean(axis=0)

    column_labels = Generators + ['AVG']
    row_labels = Test_Columns + ['AVG']

    # Bring the table into display order; missing combinations become 0.
    table = table.reindex(index=row_labels, columns=column_labels).fillna(0)

    heatmap_options = dict(
        xticklabels=[name.split("_")[-1] for name in table.columns],
        annot=True,
        annot_kws={"fontsize": 10.5},
        fmt=".1f",
        cbar=False,
        square=True,
        linewidths=1,
        ax=ax,
        vmin=0,
        vmax=100,
    )
    sns.heatmap(table, **heatmap_options)

    ax.set_xticklabels(column_labels, rotation=45, fontsize=11)
    ax.set_yticklabels(row_labels, fontsize=11)
    ax.set_xlabel("Fine-tuned on")
    ax.xaxis.set_label_position("top")
    ax.set_ylabel("Tested on")
    ax.grid(False, which='minor')

    # Thin separators after the 5th and 10th column and after the 5th row.
    separator_style = dict(colors=["k"], linestyles=["solid"], linewidths=[0.5])
    ax.vlines([5, 10], ymin=0, ymax=len(row_labels), **separator_style)
    ax.hlines([5], xmin=0, xmax=len(column_labels), **separator_style)

# df_filtered = df_ce[df_ce['Number'] == "185"].reset_index(drop=True)
# df_no_filtered = df_ce[df_ce['Number'] == "no"].reset_index(drop=True)
# df_filtered['test_acc'] = df_filtered.groupby(['Encoder', 'Generator', 'Experiment'])['test_acc'].transform('max')
# df_filtered["test_acc"] = pd.to_numeric(df_filtered["test_acc"]) - pd.to_numeric(df_no_filtered["test_acc"])
# print(df_filtered["test_acc"])
# Heatmap of the "test_pd_01" metric for model_a on the rows with
# Number == "no", drawn into the axes created above.
create_heatmap(df_filtered,"test_pd_01",Generators,Test_Columns,model_a,ax)
plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Filter DataFrame for rows where 'Number' is "no"
# Rows whose 'Number' is "no". NOTE(review): df_filtered is reassigned further
# down in this cell before it is plotted, so this value is not used.
df_filtered = df_ce[df_ce['Number'] == "no"]

# Row labels ("Tested on") of the heatmap, in display order.
Test_Columns = ["ProGAN", "StyleGAN", "ProjectedGAN", "Diff-StyleGAN2", "Diff-ProjectedGAN", "DDPM", "IDDPM", "ADM", "PNDM", "LDM", "coco"]
#Generators = Test_Columns.copy()  # Assuming Generators should be the same as Test_Columns initially

# Encoder keys as they appear in the 'Encoder' column. Only model_a is passed
# to create_heatmap in this cell; model_b is not used here.
model_a = "OpenClip"  # Replace with the actual key for Model A
model_b = "DinoV2"  # Replace with the actual key for Model B

# NOTE(review): not read in this cell — the heatmap call below passes
# "test_acc" directly.
metrics = ['test_acc']  # List of metrics

# A single figure with a single axes for the heatmap.
fig, ax = plt.subplots(figsize=(12, 6)) # Adjust the figure size as needed
# Iterate over each metric

def create_heatmap(df, metric, Generators, Test_Columns, model_a, ax):
    """Draw an annotated "tested on" x "fine-tuned on" heatmap for one encoder.

    Same layout as the fixed-range variant defined earlier in the notebook,
    except that the colour scale spans the minimum and maximum of the scaled
    ``metric`` column over *all* rows of ``df`` (not only those of
    ``model_a``).

    Args:
        df: Results table with at least the columns ``Encoder``,
            ``Generator``, ``Experiment`` and ``metric``.
        metric: Name of the value column; it is multiplied by 100 on a copy
            of ``df`` before plotting.
        Generators: Experiment names, in display order, used as columns.
        Test_Columns: Generator names, in display order, used as rows.
        model_a: Value of ``Encoder`` whose rows are plotted.
        ax: Matplotlib axes to draw into.

    An ``AVG`` column (row means) and an ``AVG`` row (column means) are
    appended before the table is reindexed to the requested labels; label
    combinations without a value are shown as 0.
    """
    percent_df = df.copy()
    percent_df[metric] *= 100

    is_selected_encoder = percent_df['Encoder'] == model_a
    grid_values = percent_df[is_selected_encoder].pivot(
        index='Generator', columns='Experiment', values=metric
    )
    grid_values['AVG'] = grid_values.mean(axis=1)
    grid_values.loc['AVG'] = grid_values.mean(axis=0)

    x_labels = Generators + ['AVG']
    y_labels = Test_Columns + ['AVG']

    # Bring the table into display order; missing combinations become 0.
    grid_values = grid_values.reindex(index=y_labels, columns=x_labels).fillna(0)

    sns.heatmap(
        grid_values,
        xticklabels=[column.split("_")[-1] for column in grid_values.columns],
        annot=True,
        annot_kws={"fontsize": 10.5},
        fmt=".1f",
        cbar=False,
        square=True,
        linewidths=1,
        ax=ax,
        # Data-driven colour range, taken over every encoder in ``df``.
        vmin=percent_df[metric].min(),
        vmax=percent_df[metric].max(),
    )

    ax.set_xticklabels(x_labels, rotation=45, fontsize=11)
    ax.set_yticklabels(y_labels, fontsize=11)
    ax.set_xlabel("Fine-tuned on")
    ax.xaxis.set_label_position("top")
    ax.set_ylabel("Tested on")
    ax.grid(False, which='minor')

    # Thin separators after the 5th and 10th column and after the 5th row.
    ax.vlines(
        [5, 10], ymin=0, ymax=len(y_labels),
        colors=["k"], linestyles=["solid"], linewidths=[0.5],
    )
    ax.hlines(
        [5], xmin=0, xmax=len(x_labels),
        colors=["k"], linestyles=["solid"], linewidths=[0.5],
    )

# Accuracy change between the runs with Number == "185" and the runs with
# Number == "no", per (encoder, experiment, generator).
key_columns = ["Encoder", "Experiment", "Generator"]
df_filtered = df_ce[df_ce['Number'] == "185"].reset_index(drop=True)
df_no_filtered = df_ce[df_ce['Number'] == "no"].reset_index(drop=True)

# Pair every row with its baseline by key instead of by row position: the
# positional subtraction silently mis-pairs rows as soon as one combination
# is missing from either subset. Unmatched rows end up as NaN.
baseline_acc = df_no_filtered[key_columns + ["test_acc"]].rename(
    columns={"test_acc": "baseline_test_acc"}
)
df_filtered = df_filtered.merge(
    baseline_acc, on=key_columns, how="left", validate="many_to_one"
)
df_filtered["test_acc"] = (
    pd.to_numeric(df_filtered["test_acc"])
    - pd.to_numeric(df_filtered.pop("baseline_test_acc"))
)
print(df_filtered["test_acc"])
create_heatmap(df_filtered,"test_acc",Generators,Test_Columns,model_a,ax)
plt.tight_layout()
plt.show()

In [None]:

# NOTE(review): `df` is not assigned in any cell visible above (only `df_ce`
# and `df_per` are) — presumably one of those is meant; confirm before running.
df_filtered = df[df["Generator"] == "ADM"]

# Highest test accuracy per (Encoder, Number) pair. NOTE(review): the variable
# names and the printed messages say "average", but the aggregation used here
# is max(), not mean().
average_acc_by_model_filter = df_filtered.groupby(['Encoder', 'Number'])['test_acc'].max()

# Per encoder: the (Encoder, Number) index label with the highest value, and
# that highest value itself.
max_average_acc_by_model = average_acc_by_model_filter.groupby(level=0).idxmax()
max_average_values_by_model = average_acc_by_model_filter.groupby(level=0).max()

# Display the results
print(f"The filters with the maximum average accuracy for each model are:\n{max_average_acc_by_model}")
print(f"The maximum average accuracy values for each model are:\n{max_average_values_by_model}")

We can see that CLIP has bigger problems handling diffusion models when trained on GAN-generated data. Yet CLIP handles coco better when trained on GANs and on diffusion models, respectively. We can observe that CLIP tends to classify images as real when faced with a new type of generator.

So in the following section, we should separate between GAN-trained and diffusion-trained models.

Now we want to see how the models behave under domain shift and generator shift.

### DIFF Generated Gan Tested

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd


# Results of the classifier fine-tuned on ADM, evaluated on every other
# generator listed in Test_Columns (coco and ADM itself are excluded).
filtered_data = df_ce[(df_ce['Experiment'] == 'ADM') & (df_ce['Generator'] != 'coco')]
filtered_data = filtered_data[filtered_data['Generator'].isin(Test_Columns)]
# .copy(): the 'Number' column is overwritten below; writing into a filtered
# slice of df_ce would raise pandas' SettingWithCopyWarning.
filtered_data = filtered_data[filtered_data['Generator'] != filtered_data['Experiment']].copy()
# "no" is mapped to 200 so that it can be plotted on the numeric x-axis.
filtered_data['Number'] = pd.to_numeric(filtered_data['Number'].replace('no', 200))
Diffs = ["DDPM", "IDDPM", "ADM", "PNDM", "LDM"]

dino_data_corrected = filtered_data[filtered_data['Encoder'] == 'DinoV2']
clip_data = filtered_data[filtered_data['Encoder'] == 'OpenClip']

# NOTE(review): `metrics`, `blue_palette` and `red_palette` are assigned here
# but not read anywhere in the visible part of this cell.
metrics = ['test_pd_01', 'test_recall', 'test_precision']
#data = data[data['Generator'] != "coco"]
#data["Number"].replace("no", 200, inplace=True)
blue_palette = sns.color_palette("Blues", len(Gans))
red_palette = sns.color_palette("Reds", len(Diffs))
#sns.set_palette('deep')
# Column that distinguishes the experiments, and the column plotted on the y-axis.
head_key = "Experiment"
y = "test_acc"
#plt.figure(figsize=(18, len(filtered_data[head_key].unique()) * 6))
# Assuming 'data' is a DataFrame that contains all experiments data
# 'y' is the column for the y-axis and 'head_key' is the column that distinguishes different experiments
# 'Number' is assumed to be the x-axis
data = filtered_data
# y-axis limits taken over both encoders so the plots share one scale.
global_min_y = data[y].min()
global_max_y = data[y].max()

# One husl colour per generator present in the CLIP rows. This palette is
# replaced further down by the result of create_custom_palette(Gans, Diffs).
palette = sns.color_palette("husl", n_colors=len(clip_data['Generator'].unique()))

# Plot for CLIP encoder
# g_clip = sns.FacetGrid(clip_data, col=head_key, hue='Generator', col_wrap=4, sharey=False, palette=palette)
# g_clip.map(sns.lineplot, 'Number', y, linewidth=1.5,  dashes=False).set(ylim=(global_min_y, global_max_y))
# g_clip.set_titles("{col_name}",fontsize='x-large')
# #g_clip.fig.suptitle('Accuarcy Score vs Frequency Information for CLIP Encoder', y=1.05)
# g_clip.despine(left=True)

# # Update the x-axis to show only every other tick for better readability
# for ax in g_clip.axes.flat:
#     ax.set_xticks(ax.get_xticks()[::2])
#     ax.set_title(ax.get_title(), fontsize=15)
# Plot for DINO (DinoV2) encoder
# NOTE(review): the four size values below are computed but not passed to any
# plotting call in the visible part of this cell.
subplot_size_width = 7   # Width of each subplot
subplot_size_height = 25  # Height of each subplot

# Calculate the width of the entire FacetGrid
grid_width = subplot_size_width * 3  # Since you want 3 in a row
grid_height = subplot_size_height * (len(clip_data[head_key].unique()) // 3 + 1)
def create_facetGrid(data, head_key, y, global_min_y, global_max_y, palette):
    """Build a grid of line plots, one panel per distinct ``head_key`` value.

    Args:
        data: Table with the columns ``head_key``, ``Generator``, ``Number``
            and ``y``.
        head_key: Column whose values define the panels (four per row).
        y: Column plotted on the y-axis against ``Number``.
        global_min_y: Lower y-limit applied to every panel.
        global_max_y: Upper y-limit applied to every panel.
        palette: Colours for the ``Generator`` hue.

    Returns:
        The configured ``seaborn.FacetGrid``.
    """
    grid = sns.FacetGrid(
        data,
        col=head_key,
        hue='Generator',
        col_wrap=4,
        sharey=False,
        palette=palette,
        height=2.5,
        aspect=.65,
    )
    drawn = grid.map(sns.lineplot, 'Number', y, linewidth=1.5, dashes=False)
    drawn.set(ylim=(global_min_y, global_max_y))
    grid.set_titles("{col_name}", fontsize='x-large')

    panels = grid.axes.flat

    # First pass: dashed reference line at y=50 in every panel.
    # NOTE(review): the y-limits come from the data; confirm that the plotted
    # metric is on a 0-100 scale, otherwise this line is outside the view.
    for panel in panels:
        panel.axhline(50, color='gray', linestyle='--', linewidth=0.5)

    # Second pass: fixed x ticks, labels at the bottom, larger panel titles.
    for panel in grid.axes.flat:
        panel.set_xticks([1, 100, 200])
        panel.set_xticklabels(['1', '100', '200'])
        panel.tick_params(axis='x', labelbottom=True)
        panel.set_title(panel.get_title(), fontsize=15)

    grid.despine(left=True)
    return grid

def create_custom_palette(gans_list, diffs_list):
    """Return a ``{generator name: colour}`` mapping for both generator groups.

    Entries of ``gans_list`` get successive shades of seaborn's "Reds"
    palette, entries of ``diffs_list`` successive shades of "Blues". A name
    that occurs in both lists ends up with its "Blues" colour.
    """
    gan_shades = sns.color_palette("Reds", len(gans_list))
    diff_shades = sns.color_palette("Blues", len(diffs_list))

    colour_by_generator = dict(zip(gans_list, gan_shades))
    colour_by_generator.update(zip(diffs_list, diff_shades))
    return colour_by_generator


def create_single_view(data, head_key, y, global_min_y, global_max_y, palette):
    """Plot ``y`` against ``Number`` for every generator in one axes and show it.

    Args:
        data: Table with the columns ``Number``, ``Generator`` and ``y``.
        head_key: Text used verbatim as the plot title.
        y: Column plotted on the y-axis; also used as the y-axis label.
        global_min_y: Lower y-limit.
        global_max_y: Upper y-limit.
        palette: Colours for the ``Generator`` hue.
    """
    axes = sns.lineplot(
        data=data,
        x="Number",
        y=y,
        hue='Generator',
        palette=palette,
        linewidth=1.5,
        dashes=False,
    )
    axes.set_ylim(global_min_y, global_max_y)
    axes.set_title(head_key, fontsize='x-large')
    # Dashed reference line at y=50.
    axes.axhline(50, color='gray', linestyle='--', linewidth=0.5)

    tick_positions = [1, 25, 50, 75, 100, 125, 150, 175, 200]
    axes.set_xticks(tick_positions)
    axes.set_xticklabels([str(position) for position in tick_positions])

    axes.set_xlabel('D0')
    axes.set_ylabel(y)
    plt.show()
    


# Colour the GAN and the diffusion generators with two different colour
# families (this replaces the husl palette assigned earlier in the cell) and
# draw all DinoV2 curves in a single axes.
palette = create_custom_palette(Gans, Diffs)
create_single_view(dino_data_corrected, head_key, y, global_min_y, global_max_y, palette)
# grid = create_facetGrid(dino_data_corrected, head_key, y, global_min_y, global_max_y, palette)
# grid.add_legend(title='Generator', fontsize='x-large', title_fontsize='20', loc='lower right', ncol=2,label_order=Gans+Diffs)

# # Adjust layout
# plt.tight_layout()

# # Display the plots
# grid.fig.show()




Following the observation that the models trained on GAN images perform worse on DM-generated images, we want to see whether this is due to a bias towards a certain frequency spectrum.

### The Impact of Frequency Removal on the Classification Performance.

To understand whether, and if so which, frequency bands are most important for the classification of a given generator, we low-pass filter the images using a Butterworth filter. Butterworth has the advantage of generating fewer "ripples", i.e. artifacts, in the inverse Fourier transform. We increase the filter in steps of 5 and see how that impacts the classification performance. It is important that we take the false-positive and the false-negative rate into account: a high false-positive rate would indicate that the model dumps images into the fake class, while a high false-negative rate would indicate a bias towards classifying images as real.


As a first step, we want to compare the two models for all experiments and generators. We do this to see if there are any specific patterns that are distinct to the encoder. As coco was trained on a completely different dataset, we exclude it from the analysis, as it would not be a fair comparison. We also remove the performance on the model's own training data.
Then we also plot the recall, to see if the model starts to over-classify images as real even though they are fake, meaning that it starts to dump images into the real category.

Then we also plot the precision, to see if the model starts to over-classify images as fake even though they are real, meaning that it starts to dump images into the fake category.


As we have seen earlier, there is a clear difference between models trained on GANs and models trained on diffusion models. We want to see if we can determine what the reason for it could be.


In [None]:
# Cross-generator results only: drop coco on either side and drop the rows
# where a classifier is tested on the generator it was trained on.
filtered_data = df_ce[(df_ce['Experiment'] != 'coco') & (df_ce['Generator'] != 'coco')]#
filtered_data = filtered_data[filtered_data['Generator'] != filtered_data['Experiment']]
dino_data_corrected = filtered_data[filtered_data['Encoder'] == 'DinoV2']
clip_data = filtered_data[filtered_data['Encoder'] == 'OpenClip']
metrics = ['test_acc', 'test_recall', 'test_precision']
sns.set_palette('deep')

# Classifiers trained on a GAN, tested on diffusion-generated images.
dino_data_corrected = dino_data_corrected[(dino_data_corrected["Generator"].isin(Diffs))&(dino_data_corrected["Experiment"].isin(Gans))]
clip_data = clip_data[(clip_data["Generator"].isin(Diffs))&(clip_data["Experiment"].isin(Gans))]

# Convert 'Number' to numeric, replacing 'no' with 200, and sort ascending.
# This does not depend on the metric, so it is done once instead of once per
# loop iteration.
clip_data_sorted = clip_data.copy()
dino_data_sorted = dino_data_corrected.copy()
clip_data_sorted['Number'] = pd.to_numeric(clip_data_sorted['Number'].replace('no', 200))
dino_data_sorted['Number'] = pd.to_numeric(dino_data_sorted['Number'].replace('no', 200))
clip_data_sorted = clip_data_sorted.sort_values(by='Number', ascending=True)
dino_data_sorted = dino_data_sorted.sort_values(by='Number', ascending=True)

# Creating a figure for each metric
for metric in metrics:
    # y-limits are taken from all cross-generator rows, not only the subset
    # that is plotted.
    global_min_y = filtered_data[metric].min()
    global_max_y = filtered_data[metric].max()
    # Strip the prefix *before* capitalising: "test_acc".capitalize() is
    # "Test_acc", on which replace("test_", "") no longer matches.
    label = metric.replace("test_", "").capitalize()

    plt.figure(figsize=(18, 6))

    # Combined line plot for both CLIP and DINO encoders
    # NOTE(review): `ci` is deprecated in recent seaborn releases in favour of
    # `errorbar`; kept as-is because the installed version is not known here.
    if not clip_data_sorted.empty:
        sns.lineplot(x='Number', y=metric, data=clip_data_sorted, label='CLIP', ci='sd', markers=True)
    if not dino_data_sorted.empty:
        sns.lineplot(x='Number', y=metric, data=dino_data_sorted, label='DINO (DinoV2)', ci='sd', markers=True)

    plt.xticks(rotation=45)  # Rotating the x-axis labels
    plt.xlim(200, 0)  # reversed x-axis: 200 on the left, 0 on the right
    plt.ylim(global_min_y, global_max_y)
    plt.title(f'{label} vs Frequency Information')
    plt.xlabel('Frequency Information (Number)')
    plt.ylabel(label)

    plt.legend()
    plt.tight_layout()
    plt.show()

We can see here that CLIP tends to classify all images as fake if there is no high-frequency information. When we give it more frequency information, the rate of missed fake images increases. DINO, on the other hand, increases its recall the more information is provided. This indicates that DINO, when trained on GAN images, also checks the low-frequency domain and makes its majority decision based on the highest frequencies. Both models learned that they classify GAN-generated images best when focusing on the highest frequencies, while for DINO the lower frequencies also play a role. So if we train the model with the Fourier filter, it should be able to better classify new, unseen generators, as it can learn to also focus on new bands.
When looking at the precision, we see that DINO relatively quickly reduces the false-positive rate, i.e. the tendency to over-classify images as fake. With DINO, the rate slowly increases. So DINO is less likely to be tricked into thinking an image is fake at lower frequencies.

The data suggest that DINO is able to utilize the lower frequency bands as well for its classification.



An idea would be to find the frequency at which all models on average perform best for either GANs or diffusion models.


GANs produce more noise in the higher frequencies than diffusion models. So when trained on GANs, the models have problems handling diffusion models, where the noise in the higher frequencies is less distinct. This has been shown by (paper). So if a model is also able to utilize lower frequencies in an efficient way, its performance should be good until the noisy higher frequencies are introduced. We see that DINO encodes lower frequencies better than CLIP. This means that when training CLIP, the higher frequencies played a more important role in the classification context, whereas in the self-supervised training of DINO the lower frequencies play a more crucial role. So while both models utilize the full frequency range, DINO handles the lower frequencies better. This is important, as the lower frequencies are less prone to noise and thus more robust to unseen generators.

To fully utilize the lower frequencies, we want to train the model with a low-pass filter. Here it is important to note that all images need to be low-pass filtered. This is possible because we do not fine-tune the weights of the encoder itself, which is still able to produce the embeddings of unfiltered images correctly; we just ensure that the classification layer is able to focus on the lower frequencies. So if the training is successful, DINO should be able to classify unseen generators better, as it is able to utilize the lower frequencies. If this holds up, the training paradigm for DINO does improve foundation models in the sense that more low-frequency information is encoded. This information is important for a variety of tasks, such as ... .

When looking into the training paradigms used for DINO and CLIP, the DINO paper mentions that they use a Gaussian blur, which is a low-pass filter, while the CLIP paper does not use it. This could be the reason why DINO is better at focusing on lower frequencies, and in our case it brings a wide range of advantages. So if we have a task which cannot be solved solely with high-frequency information, we should train the model with a low-pass filter. This not only reduces the noise in the higher frequencies, but also forces the model to focus on the lower frequencies, which contain valuable information for certain tasks.

We hypothesise that DINO can better encode low-frequency information due to the augmentations used during training. During training, the Gaussian blur is always applied to all student data, which forces the model to encode the basic understanding of the image based on those frequencies. As we could see in the previous section, DINO is able to utilize the lower frequencies better than CLIP. This is important, as the lower frequencies are less prone to noise (paper that shows this). Even more so, it was shown that diffusion models also show artifacts in the mid-to-high frequency range. What is interesting, though, is that DINO can separate between GANs and dinos in the mid-frequency range far better than in the high-frequency range.


Low-pass filtering does improve performance. This contradicts the conjecture claimed in (paper), which states that low-pass filtering makes them indistinguishable.

It is also shown that higher frequencies in diffusion models show similar artifacts as natural images with JPEG compression (Fourier Diff paper). The same paper delivers the conjecture that, in fact, mid frequencies deliver 



In [None]:
# Classifiers of the 'coco' experiment, tested on every generator except coco.
# NOTE(review): `df` is not assigned in any cell visible above (only `df_ce`
# and `df_per` are) — presumably one of those is meant; confirm before running.
filtered_data = df[(df['Experiment'] == 'coco') & (df['Generator'] != 'coco')]
#filtered_data = filtered_data[filtered_data['Generator'] != filtered_data['Experiment']]
dino_data_corrected = filtered_data[filtered_data['Encoder'] == 'DinoV2']
clip_data = filtered_data[filtered_data['Encoder'] == 'OpenClip']
metrics = ['test_acc', 'test_recall', 'test_precision']
sns.set_palette('deep')


# Only the diffusion generators are plotted.
dino_data_corrected = dino_data_corrected[(dino_data_corrected["Generator"].isin(Diffs))]
clip_data = clip_data[(clip_data["Generator"].isin(Diffs))]
# Creating a figure for each metric
for metric in metrics:
    # y-limits are taken before the restriction to diffusion generators.
    global_min_y = filtered_data[metric].min()
    global_max_y = filtered_data[metric].max()
    # Strip the prefix *before* capitalising: "test_acc".capitalize() is
    # "Test_acc", on which replace("test_", "") no longer matches.
    label = metric.replace("test_", "").capitalize()
    plt.figure(figsize=(18, 6))

    # Boxplot for CLIP encoder
    plt.subplot(1, 2, 1)

    sns.boxplot(x='Number', y=metric, data=clip_data)
    sns.pointplot(x='Number', y=metric, data=clip_data, color='black', markersize=6, linewidth=2)
    plt.xticks(rotation=90)
    plt.ylim(global_min_y, global_max_y)
    plt.title(f'{label} vs Frequency Information for CLIP Encoder')
    plt.xlabel('Frequency Information (Number)')
    plt.ylabel(label)

    # Boxplot for DINO (DinoV2) encoder
    plt.subplot(1, 2, 2)
    if not dino_data_corrected.empty:
        sns.boxplot(x='Number', y=metric, data=dino_data_corrected)
        sns.pointplot(x='Number', y=metric, data=dino_data_corrected, color='black', markersize=6, linewidth=2)
        plt.xticks(rotation=90)
        plt.ylim(global_min_y, global_max_y)
        plt.title(f'{label} vs Frequency Information for DINO (DinoV2) Encoder')
        plt.xlabel('Frequency Information (Number)')
        plt.ylabel(label)
    else:
        # Placeholder panel so the figure layout stays the same.
        plt.text(0.5, 0.5, 'No data available for DINO (DinoV2) Encoder',
                 horizontalalignment='center', verticalalignment='center',
                 transform=plt.gca().transAxes)
        plt.title('Data Unavailable for DINO (DinoV2)')

    plt.tight_layout()
    plt.show()

Now we check whether the models behave differently when faced with the same data type. We



In [None]:
# Only the rows where a classifier is tested on the generator it was trained on.
# NOTE(review): `df` is not assigned in any cell visible above (only `df_ce`
# and `df_per` are) — presumably one of those is meant; confirm before running.
filtered_data = df[df['Generator'] == df['Experiment']]
dino_data_corrected = filtered_data[filtered_data['Encoder'] == 'DinoV2']
clip_data = filtered_data[filtered_data['Encoder'] == 'OpenClip']
metrics = ['test_f1', 'test_recall', 'test_precision']
sns.set_palette('deep')
# Creating a figure for each metric
for metric in metrics:
    # Strip the prefix *before* capitalising: "test_f1".capitalize() is
    # "Test_f1", on which replace("test_", "") no longer matches.
    label = metric.replace("test_", "").capitalize()
    plt.figure(figsize=(18, 6))

    # Boxplot for CLIP encoder
    plt.subplot(1, 2, 1)
    sns.boxplot(x='Number', y=metric, data=clip_data)
    plt.xticks(rotation=90)
    plt.title(f'{label} vs Frequency Information for CLIP Encoder')
    plt.xlabel('Frequency Information (Number)')
    plt.ylabel(label)

    # Boxplot for DINO (DinoV2) encoder
    plt.subplot(1, 2, 2)
    if not dino_data_corrected.empty:
        sns.boxplot(x='Number', y=metric, data=dino_data_corrected)
        plt.xticks(rotation=90)
        plt.title(f'{label} vs Frequency Information for DINO (DinoV2) Encoder')
        plt.xlabel('Frequency Information (Number)')
        plt.ylabel(label)
    else:
        # Placeholder panel so the figure layout stays the same.
        plt.text(0.5, 0.5, 'No data available for DINO (DinoV2) Encoder',
                 horizontalalignment='center', verticalalignment='center',
                 transform=plt.gca().transAxes)
        plt.title('Data Unavailable for DINO (DinoV2)')

    plt.tight_layout()
    plt.show()

If there is an easy distinction for a model, for example in the high-frequency spectrum, that model should generally perform better when the input data also has that aspect.

Here we compare how the different experiments behave, to ensure that there are no domain-shift problems within the LSUN-trained generators.

We also separate between GAN-trained and diffusion-trained models, so that the generators will be diffusion models if the model was trained on a GAN, and vice versa for models trained on diffusion models.

To better compare the plots, we have to 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `df` is not assigned in any cell visible above (only `df_ce`
# and `df_per` are) — presumably one of those is meant; confirm before running.
data = df
# .copy() plus a plain column assignment replaces the chained
# `data["Number"].replace(..., inplace=True)` on a filtered slice, which
# triggers pandas' chained-assignment warnings and does not modify `data` at
# all when Copy-on-Write is enabled. `df` itself stays untouched.
data = data[data['Generator'] != "coco"].copy()
data["Number"] = data["Number"].replace("no", 200)
dino_data_corrected = data[data['Encoder'] == 'DinoV2']
clip_data = data[data['Encoder'] == 'OpenClip']
sns.set_palette('deep')
head_key = "Experiment"
y = "test_f1"
# One row of two panels (CLIP, DINO) per experiment.
plt.figure(figsize=(18, len(data[head_key].unique()) * 6))

# Find global y-axis limits
global_min_y = data[y].min()
global_max_y = data[y].max()

# One figure row per distinct value of `head_key`: CLIP in the left panel,
# DINO in the right panel.
for i, experiment in enumerate(data[head_key].unique()):
    dino_experiment_data = dino_data_corrected[dino_data_corrected[head_key] == experiment]
    clip_experiment_data = clip_data[clip_data[head_key] == experiment]

    # Cross-family evaluation: a classifier trained on a GAN is only evaluated
    # on the diffusion generators and vice versa. Experiments that are in
    # neither list keep all of their rows.

    if experiment in Gans:
        dino_experiment_data = dino_experiment_data[dino_experiment_data["Generator"].isin(Diffs)]
        clip_experiment_data = clip_experiment_data[clip_experiment_data["Generator"].isin(Diffs)]
    elif experiment in Diffs:
        dino_experiment_data = dino_experiment_data[dino_experiment_data["Generator"].isin(Gans)]
        clip_experiment_data = clip_experiment_data[clip_experiment_data["Generator"].isin(Gans)]

    # Boxplot for CLIP encoder (left column: subplot index 2*i + 1)
    plt.subplot(len(data[head_key].unique()), 2, 2*i + 1)
    if not clip_experiment_data.empty:
        sns.boxenplot(x='Number', y=y, data=clip_experiment_data, color='lightgray')
        sns.pointplot(x='Number', y=y, data=clip_experiment_data, color='black', scale=0.75)
        plt.xticks(rotation=90)
        plt.ylim(global_min_y, global_max_y)  # Set the global y-axis limits
        plt.title(f'F1 Score vs Frequency Information for CLIP Encoder - {experiment}')
        plt.xlabel('Frequency Information (Number)')
        plt.ylabel('F1 Score')
    else:
        # Placeholder text centred in the otherwise empty panel.
        plt.text(0.5, 0.5, f'No data available for CLIP Encoder - {experiment}', 
                 horizontalalignment='center', verticalalignment='center', 
                 transform=plt.gca().transAxes)
        plt.title(f'Data Unavailable for CLIP - {experiment}')

    # Boxplot for DINO (DinoV2) encoder (right column: subplot index 2*i + 2)
    plt.subplot(len(data[head_key].unique()), 2, 2*i + 2)
    if not dino_experiment_data.empty:
        sns.boxenplot(x='Number', y=y, data=dino_experiment_data, color='lightgray')
        sns.pointplot(x='Number', y=y, data=dino_experiment_data, color='black', scale=0.75)
        plt.xticks(rotation=90)
        plt.ylim(global_min_y, global_max_y)  # Set the global y-axis limits
        plt.title(f'F1 Score vs Frequency Information for DINO (DinoV2) Encoder - {experiment}')
        plt.xlabel('Frequency Information (Number)')
        plt.ylabel('F1 Score')
    else:
        # Placeholder text centred in the otherwise empty panel.
        plt.text(0.5, 0.5, f'No data available for DINO (DinoV2) Encoder - {experiment}', 
                 horizontalalignment='center', verticalalignment='center', 
                 transform=plt.gca().transAxes)
        plt.title(f'Data Unavailable for DINO (DinoV2) - {experiment}')

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Per-generator comparison of the two encoders: for every value of the
# "Generator" column, draw one row with a CLIP panel (left) and a DinoV2 panel
# (right) showing the chosen metric against the "Number" column.

# Work on an explicit copy so the column rewrite below neither triggers
# SettingWithCopyWarning nor gets silently dropped under pandas Copy-on-Write.
data = df[df['Generator'] != "coco"].copy()
# Rows whose "Number" is the string "no" are mapped to the numeric value 200.
data["Number"] = data["Number"].replace("no", 200)

dino_data_corrected = data[data['Encoder'] == 'DinoV2']
clip_data = data[data['Encoder'] == 'OpenClip']
sns.set_palette('deep')
head_key = "Generator"
y = "test_acc"

# Human-readable name of the plotted metric, so titles and axis labels always
# match the column selected in `y` (falls back to the raw column name).
metric_label = {"test_acc": "Accuracy", "test_f1": "F1 Score"}.get(y, y)

generators = data[head_key].unique()
n_rows = len(generators)
plt.figure(figsize=(18, n_rows * 6))

# Shared y-axis limits so that all panels are directly comparable.
global_min_y = data[y].min()
global_max_y = data[y].max()

# (display name, per-encoder frame); the order defines the left/right column.
encoder_panels = (
    ("CLIP", clip_data),
    ("DINO (DinoV2)", dino_data_corrected),
)

# NOTE: no GAN/diffusion cross-family filtering is applied in this cell;
# every generator is plotted with all of its rows.
for i, experiment in enumerate(generators):
    for column, (encoder_name, encoder_data) in enumerate(encoder_panels):
        panel_data = encoder_data[encoder_data[head_key] == experiment]

        # Subplot indices are 1-based and run row by row.
        plt.subplot(n_rows, 2, 2 * i + column + 1)
        if panel_data.empty:
            # Keep the grid aligned but make the missing data explicit.
            plt.text(0.5, 0.5,
                     f'No data available for {encoder_name} Encoder - {experiment}',
                     horizontalalignment='center', verticalalignment='center',
                     transform=plt.gca().transAxes)
            plt.title(f'Data Unavailable for {encoder_name} - {experiment}')
            continue

        # Distribution per "Number" value, with the point estimate overlaid.
        sns.boxenplot(x='Number', y=y, data=panel_data, color='lightgray')
        sns.pointplot(x='Number', y=y, data=panel_data, color='black', scale=0.75)
        plt.xticks(rotation=90)
        plt.ylim(global_min_y, global_max_y)  # Set the global y-axis limits
        plt.title(
            f'{metric_label} vs Frequency Information for {encoder_name} Encoder - {experiment}'
        )
        plt.xlabel('Frequency Information (Number)')
        plt.ylabel(metric_label)

plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Line plots of the chosen metric against "Number" for each value of the
# "Experiment" column, laid out as a two-row grid: GAN experiments in the top
# row, diffusion experiments in the bottom row, one line per encoder.

# Work on an explicit copy so the column rewrites below neither trigger
# SettingWithCopyWarning nor get silently dropped under pandas Copy-on-Write.
data = df[df['Generator'] != "coco"].copy()
# Map the "no" marker to 200, then treat "Number" as a string-valued x-axis.
data["Number"] = data["Number"].replace("no", 200).astype(str)
dino_data = data[data['Encoder'] == 'DinoV2']
clip_data = data[data['Encoder'] == 'OpenClip']

sns.set_palette('deep')
head_key = "Experiment"
y = "test_f1"

# Separate experiments into Gans and Diffs
present_experiments = data[head_key].unique()
experiments_gans = [exp for exp in present_experiments if exp in Gans]
experiments_diffs = [exp for exp in present_experiments if exp in Diffs]

# Create subplots. `squeeze=False` keeps `axs` two-dimensional even with a
# single column, and at least one column is requested so the call does not
# fail when neither family is present in the data.
n_cols = max(len(experiments_gans), len(experiments_diffs), 1)
fig, axs = plt.subplots(2, n_cols, figsize=(18, 12), squeeze=False)

families = (("Gans", experiments_gans), ("Diffs", experiments_diffs))
for row, (family_name, family_experiments) in enumerate(families):
    for col, experiment in enumerate(family_experiments):
        ax = axs[row, col]
        experiment_data_dino = dino_data[dino_data[head_key] == experiment]
        experiment_data_clip = clip_data[clip_data[head_key] == experiment]

        if experiment_data_dino.empty and experiment_data_clip.empty:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center')
            continue

        # Mean per "Number" value with a standard-deviation band.
        # NOTE(review): newer seaborn releases prefer `errorbar='sd'` over
        # `ci='sd'` — confirm against the installed seaborn version.
        sns.lineplot(x='Number', y=y, data=experiment_data_dino, label='DinoV2',
                     ci='sd', estimator='mean', ax=ax)
        sns.lineplot(x='Number', y=y, data=experiment_data_clip, label='OpenClip',
                     ci='sd', estimator='mean', ax=ax)
        ax.set_title(f'{family_name} - {experiment}')

    # Hide grid cells that have no experiment assigned (the shorter row).
    for unused_ax in axs[row, len(family_experiments):]:
        unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

### Analyse the impact of FDA Switching on the Classification performance

When low frequencies from a real image are injected into a target image, a low recall indicates that the model is mistaking fake images for real ones. Initially, this might seem disadvantageous, as it could allow the model to be tricked into classifying an altered image as genuine. However, it could also demonstrate the encoder's proficiency in processing low-level features. Ideally, this capability should also result in real images being correctly identified as real, and hence in a higher precision score. If the model fails to do so, it effectively creates a repository of misidentified real images. Moreover, the model may inaccurately flag images with unusual frequency characteristics as fake, even when such peculiarities arise from authentic, unaltered sources.




