In [1]:
import os
import cv2
import csv
import pandas as pd
import numpy as np
from constants import DatasetPath
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [2]:
# Utility constants

# Root folder of the dataset, taken from the project's 'constants' module.
# Every path below is built by plain string concatenation, so this value is
# presumably expected to end with a path separator -- TODO confirm.
DATASET_PATH = DatasetPath.effectivePath


def DATASET_DIRS():
    """
    Lists the entries currently found in DATASET_PATH.

    The folder is read again on every call, so files created by later cells are picked up.

    Returns
    -------
        list[str]: the entry names reported by 'os.listdir' for DATASET_PATH.
    """
    entries = os.listdir(DATASET_PATH)
    return entries


# Sub-folders treated as sources of real images.
REAL_DIRS = [
    "afhq", "celebahq", "coco",
    "ffhq", "imagenet", "landscape",
    "lsun", "metfaces", "cycle_gan",
]

# Sub-folders treated as sources of fake images ('cycle_gan' is listed in both groups).
FAKE_DIRS = [
    "big_gan", "cips", "cycle_gan", "ddpm", "denoising_diffusion_gan",
    "diffusion_gan", "face_synthetics", "gansformer", "gau_gan",
    "generative_inpainting", "glide", "lama", "latent_diffusion", "mat",
    "palette", "pro_gan", "projected_gan", "sfhq", "stable_diffusion",
    "star_gan", "stylegan1", "stylegan2", "stylegan3",
    "taming_transformer", "vq_diffusion",
]

# Column layout shared by every metadata DataFrame built in this notebook.
csv_columns_name = ['filename', 'image_path', 'target', 'category']

# Locations of the generated metadata files.
REAL_CSV_PATH = DATASET_PATH + "real.csv"
FAKE_CSV_PATH = DATASET_PATH + "fake.csv"

# Fourier output: 'fourier\' holds the metadata file, 'fourier\fourier\' the spectrum images.
DIR_FOURIER_PATH = DATASET_PATH + "fourier\\"
FFTS_PATH = DIR_FOURIER_PATH + "fourier\\"

In [3]:
def checkDatasetSync():
    """
    Checks whether data loss occurred during the download and unzip process.

    Two checks are performed against the content of the dataset folder:
        > every folder listed in REAL_DIRS and FAKE_DIRS must be present;
        > every other entry must be a '.csv' file or the 'fourier' folder.

    A message is printed for each problem found; if none is found, a single
    confirmation message is printed. Nothing is returned.
    """
    # Built once: the original rebuilt the concatenated list at every iteration
    expected = set(REAL_DIRS + FAKE_DIRS)
    present = set(DATASET_DIRS())
    sync = True

    # Data loss shows up as an expected folder that is missing on disk
    for folder in tqdm(sorted(expected)):
        if folder not in present:
            print("Folder " + folder + " does not exist.")
            sync = False

    # Anything else found on disk is reported, apart from generated files
    for entry in sorted(present - expected):
        # exclude other files
        if not (".csv" in entry or entry == "fourier"):
            print("Unexpected entry " + entry + " found in the dataset folder.")
            sync = False

    if sync:
        print("Dataset correctly synchronized.")

In [4]:
# Run the dataset folder check; the result is reported through printed messages.
checkDatasetSync()

  0%|          | 0/37 [00:00<?, ?it/s]

Dataset correctly synchronized.


In [6]:
def real_fake_csv_split():
    """
    Builds the two metadata files by running 'create_csv()' once per image source:
    first for the real images, then for the fake ones.
    """
    for source in ("real", "fake"):
        create_csv(source)

def create_csv(target):
    """
    Performs a metadata split creating different csv file for both real and fake images.

    Parametres
    ----------
        target (str): 'real' or 'fake', used to identify the image source.   
    """
    csv = target + ".csv"
    dir_group = REAL_DIRS if target == "real" else FAKE_DIRS
    csv_path = REAL_CSV_PATH if target == "real" else FAKE_CSV_PATH
    
    if csv in DATASET_DIRS():
        print(csv + " already exists.")
        
        return
    
    csv_df = pd.DataFrame(columns = csv_columns_name)

    # Images collection process
    for dir in tqdm(dir_group, desc="Collecting " + target + " images"):
        csv_df = collect_metadatas(csv_df, dir, 0)

    # DataFrame-to-csv conversion process
    df_to_csv(csv_df, csv, csv_path)

def collect_metadatas(df, dir, mode, size=None): 
    """
    Collects metadatas from a given directory into a Pandas DataFrame.

    Parametres
    ----------
        df (pd.DataFrame): the DataFrame to collect metadatas into.
        dir (str): the directory where the metadata file is stored.
        mode (int): how the DataFrame is built
            > 0: for real-fake split.
            > 1: for balanced dataset partitioning (contains both fake and real images).
        size (int, optional): optional parameter, used within mode 1. Default: None.

    Returns
    -------
        pd.DataFrame: the updated Dataframe.
    """
    with open(DATASET_PATH + dir + "\\metadata.csv", mode='r', newline='') as current_csv:

        current_csv = pd.read_csv(current_csv)
        image_path = dir + "/" + current_csv["image_path"]
        current_csv["image_path"] = image_path.replace("/","\\")
        
        match mode:
            case 0:
                df = pd.concat([df,current_csv[current_csv['target'] == 0]], ignore_index=True) if dir in REAL_DIRS else pd.concat([df,current_csv[current_csv['target'] != 0]], ignore_index=True)
                return df
            
            case 1:
                #In this mode, the number of sampled images is also returned
                length = len(current_csv)
                if size > length:
                    print("Error: sampling size cannot exceed the number of tuples in the dataframe.")
                    print("Only " + length + " tuples were sampled.")
                    df = pd.concat([df,current_csv.sample(length)], ignore_index=True)
                    return df, length
                else:
                    df = pd.concat([df,current_csv.sample(size)], ignore_index=True)
                    return df, size
                
            case _:
                return "Unkwon mode. Consult function doc for recognised modes."

def df_to_csv(df, filename, path):
    """
    Splits the DataFrame in chunks to enable tqdm progress visualization while converting the DataFrame into a '.csv' file.

    The rows are split by position, so each row is written exactly once even
    when the DataFrame index contains duplicated labels.

    Parameters
    ----------
        df (pd.DataFrame): the DataFrame to convert.
        filename (str): the desired file name (comprehensive of '.csv' extension).
        path (str): the path where the '.csv' will be stored.
    """
    # 100 groups of row positions; some are empty when df has fewer than 100 rows
    chunks = np.array_split(np.arange(len(df)), 100)

    for chunk_number, positions in enumerate(tqdm(chunks, desc="Creating \'" + filename + "\' file")):
        # Only the first chunk creates the file and writes the header row
        is_first = chunk_number == 0
        df.iloc[positions].to_csv(
            path,
            mode='w' if is_first else 'a',
            header=is_first,
            index=False,
        )

    print("\'" + filename + "\' has been successfully created.")

In [7]:
# Build 'real.csv' and 'fake.csv'; a file that is already in the dataset folder is left untouched.
real_fake_csv_split()

real.csv already exists.
fake.csv already exists.


In [14]:
def _sample_from_dirs(df, dirs, per_dir_size, desc):
    """
    Samples 'per_dir_size' tuples from each directory, carrying any shortfall over to the next one.

    Returns
    -------
        tuple[pd.DataFrame, int, int]: the updated DataFrame, the number of tuples
            actually sampled and the shortfall left after the last directory.
    """
    sampled_total = 0
    remaining_size = 0

    for dir in tqdm(dirs, desc=desc):
        # Ask for this directory's quota plus whatever previous directories could not provide
        requested_size = per_dir_size + remaining_size

        # sampled_size is the number of tuples actually sampled from the
        # metadata.csv file in the current directory (never more than requested)
        df, sampled_size = collect_metadatas(df, dir, 1, requested_size)
        sampled_total += sampled_size

        # The shortfall is measured against what was requested, so the
        # carried-over amount is not lost when a directory only partly covers it
        remaining_size = requested_size - sampled_size

    return df, sampled_total, remaining_size

def create_dataset_partition(filename, size, real_dirs, fake_dirs):
    """
    Creates a '.csv' partition of the dataset with (about) half of the tuples
    sampled from the real directories and half from the fake ones.

    Each group's quota is divided evenly among its directories; when a directory
    holds fewer tuples than requested, the missing ones are requested from the
    next directory. If the real directories cannot provide their whole quota,
    the fake quota is reduced to the number of real tuples actually sampled.

    NOTE(review): tuples are drawn through 'collect_metadatas' in mode 1, which
    does not appear to filter by 'target'; a directory whose metadata mixes real
    and fake rows may therefore contribute both -- confirm.

    Parameters
    ----------
        filename (str): the name of the '.csv' file, created inside DATASET_PATH.
        size (int): the desired total number of tuples.
        real_dirs (list[str]): the directories to sample the first half from.
        fake_dirs (list[str]): the directories to sample the second half from.
    """
    df = pd.DataFrame(columns = csv_columns_name)

    max_r_size = max_f_size = round(size / 2)
    min_r_size = round(max_r_size / len(real_dirs))

    df, real_sampled, remaining_size = _sample_from_dirs(
        df, real_dirs, min_r_size, "Collecting metadatas from Real Directories")

    if remaining_size > 0:
        print("Correct sampling size could not be reached from the given real_dirs")
        print("Actual sampled size: " + str(real_sampled))
        # Keep the partition balanced: do not sample more fake than real tuples
        max_f_size = real_sampled

    min_f_size = round(max_f_size / len(fake_dirs))

    df, fake_sampled, remaining_size = _sample_from_dirs(
        df, fake_dirs, min_f_size, "Collecting_metadatas from Fake Directories")

    if remaining_size > 0:
        print("Correct sampling size could not be reached from the given fake_dirs")
        print("Actual sampled size: " + str(fake_sampled))

    df_to_csv(df, filename, DATASET_PATH+filename)

In [15]:
# Build a 20000-tuple partition: the real half is drawn from 'coco', the fake half from 'big_gan'.
create_dataset_partition("dataset_partition.csv", 20000, ["coco"], ["big_gan"])

Collecting metadatas from Real Directories:   0%|          | 0/1 [00:00<?, ?it/s]

Collecting_metadatas from Fake Directories:   0%|          | 0/1 [00:00<?, ?it/s]

Creating 'dataset_partition.csv' file:   0%|          | 0/100 [00:00<?, ?it/s]

'dataset_partition.csv' has been successfully created.


In [None]:
# Create the output folders for the Fourier spectra: 'fourier' (metadata) and
# 'fourier\fourier' (images).
path = os.path.join(DATASET_PATH, "fourier")
nested_path = os.path.join(path, "fourier")

if os.path.isdir(nested_path):
    print("Fourier folder already exist.")
else:
    # No explicit mode: a directory needs the execute bit to be traversed, which
    # 0o666 does not grant. makedirs also creates the inner folder when only the
    # outer one is already there.
    os.makedirs(nested_path, exist_ok=True)

In [None]:
# Apply the Fourier transform to every image of the tiny dataset.
# For each image listed in 'tinydataset.csv' the log-magnitude spectrum is
# rescaled to 0-255, saved as an 8-bit image in FFTS_PATH and recorded in
# 'fourier\metadata.csv'.

# pd.read_csv opens and closes the file on its own
tiny_dataset_csv = pd.read_csv(os.path.join(DATASET_PATH, "tinydataset.csv"))

# Rows are collected in a list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row gets slower as it grows
fourier_records = []
skipped_images = 0

for index, row in tqdm(tiny_dataset_csv.iterrows(), total=len(tiny_dataset_csv)):
    path = DATASET_PATH + row["image_path"]

    # cv2.imread returns None (it does not raise) when the file cannot be read
    RGBimg = cv2.imread(path)
    if RGBimg is None:
        print("Skipping unreadable image: " + path)
        skipped_images += 1
        continue
    grayImg = cv2.cvtColor(RGBimg, cv2.COLOR_BGR2GRAY)

    # log1p keeps the spectrum finite where the magnitude is exactly zero
    # (np.log would give -inf there and turn the rescaling below into NaN)
    fft_img = np.log1p(np.abs(np.fft.fft2(grayImg)))

    min_val = np.min(fft_img)
    max_val = np.max(fft_img)
    value_range = max_val - min_val

    # Min-max rescaling to 0-255; a flat spectrum is mapped to all zeros
    if value_range > 0:
        fft_img = (fft_img - min_val) * (255.0 / value_range)
    else:
        fft_img = np.zeros_like(fft_img)
    fft_img = np.uint8(fft_img)

    # The class is encoded in the file name: 'img...' becomes 'real...' or 'fake...'
    # NOTE(review): images coming from different folders may share a file name
    # and overwrite each other here -- confirm against the tiny dataset content
    filename = row["filename"]
    if row["target"] == 0:
        filename = filename.replace("img","real")
    else:
        filename = filename.replace("img","fake")

    fft_path = FFTS_PATH + filename
    # cv2.imwrite reports a failure through its return value
    if not cv2.imwrite(fft_path, fft_img):
        print("Skipping image that could not be written: " + fft_path)
        skipped_images += 1
        continue

    # Path stored relative to the dataset root, like the other metadata files
    split_fft_path = os.path.relpath(fft_path, DATASET_PATH)
    fourier_records.append([filename, split_fft_path, row["target"], row["category"]])

fourier_csv = pd.DataFrame(fourier_records, columns = csv_columns_name)
fourier_csv.to_csv(DIR_FOURIER_PATH + "metadata.csv", index=False)

if skipped_images > 0:
    print(str(skipped_images) + " images were skipped.")

In [None]:
# Preview one of the generated spectra inline.
# The path is built from FFTS_PATH instead of an absolute, user-specific path.
sample_path = FFTS_PATH + "fake000000.jpg"

# -1 reads the image as stored; cv2.imread returns None when the file cannot be read
img = cv2.imread(sample_path, -1)

if img is None:
    print("Could not read " + sample_path)
else:
    print(img.shape)
    # Shown with matplotlib only: a cv2.imshow window followed by waitKey(0)
    # blocks the notebook until a key is pressed. 'cmap' affects single-channel
    # images only, which would otherwise be drawn with a false-colour map.
    plt.imshow(img, cmap="gray")
    plt.show()

In [None]:
# Build the (anchor, positive, negative) triplets from the Fourier metadata.
# Every image is used once as anchor; the positive is a different image of the
# same class and the negative is an image of the other class, both picked at random.

# pd.read_csv opens and closes the file on its own
fourier_df = pd.read_csv(DIR_FOURIER_PATH + "metadata.csv")

real_fourier_df = fourier_df[ fourier_df["target"] == 0 ]
fake_fourier_df = fourier_df[ fourier_df["target"] != 0 ]

real_paths = real_fourier_df["image_path"].to_numpy()
fake_paths = fake_fourier_df["image_path"].to_numpy()

# A class with a single image has no valid positive (the search would never
# end) and an empty class provides no negative: fail early with a clear message
if len(fourier_df) > 0:
    for class_name, class_paths in (("real", real_paths), ("fake", fake_paths)):
        if len(set(class_paths)) < 2:
            raise ValueError("At least two distinct " + class_name + " images are needed to build the triplets.")

# Not seeded: the triplets change at every run
rng = np.random.default_rng()

positives = []
negatives = []

# The class is read from the 'target' column rather than guessed from the path
for anchor_path, anchor_target in zip(fourier_df["image_path"], fourier_df["target"]):
    if anchor_target == 0:
        same_class, other_class = real_paths, fake_paths
    else:
        same_class, other_class = fake_paths, real_paths

    # Draw a single element instead of shuffling the whole class at every row
    positive_path = same_class[rng.integers(len(same_class))]
    while positive_path == anchor_path:
        positive_path = same_class[rng.integers(len(same_class))]

    positives.append(positive_path)
    negatives.append(other_class[rng.integers(len(other_class))])

input_df = pd.DataFrame({
    "anchor": fourier_df["image_path"].to_numpy(),
    "positive": positives,
    "negative": negatives,
}, columns= ["anchor","positive","negative"])

input_df.to_csv(DATASET_PATH + "input.csv", index=False)
    