In [None]:
!pip install scipy pillow numpy matplotlib scikit-learn pandas



In [None]:
#The Frist Digital Feature Extractor
#Output is a CSV file

import os
import numpy as np
from PIL import Image
from scipy.fftpack import dct
import matplotlib.pyplot as plt
import csv

def get_dct_values(image_path):
    # Open the image file
    img = Image.open(image_path)
    # Convert the image to grayscale
    img = img.convert('L')
    # Convert the image data to a numpy array
    img_data = np.array(img)
    # Compute the 2D DCT of the image data
    dct_values = dct(dct(img_data.T).T)
    return dct_values

def count_first_digit_probabilities(dct_values):
    # Flatten the DCT values to a 1D array
    flat_values = dct_values.flatten()
    # Initialize a dictionary to hold the probabilities of the first digits
    probabilities = {i: 0 for i in range(1, 10)}
    # Iterate over the DCT values
    for value in flat_values:
        # Get the first digit of the DCT value
        first_digit = int(str(abs(value))[0])
        # Update the count
        if first_digit in probabilities:
            probabilities[first_digit] += 1
    # Calculate the total number of values
    total_values = sum(probabilities.values())
    # Convert the counts to probabilities
    for digit in probabilities:
        probabilities[digit] /= total_values
    return probabilities



def benfords_law():
    # Benford's law states that in many naturally occurring collections of numbers,
    # the leading significant digit is likely to be small.
    # The law holds that 30% will have 1 as the first digit.
    return [np.log10(1 + 1/digit) for digit in range(1, 10)]



# def plot_probabilities(probabilities):
#     # Create a plot of the probabilities
#     #plt.figure(figsize=(10, 6))  # Increase the size of the plot
#     plt.plot(list(probabilities.keys()), list(probabilities.values()))  # Removed label argument
#     # Set the y-axis to a logarithmic scale
#     plt.yscale('log')
#     # Increase the size of the y-axis labels and change their color to blue
#     plt.tick_params(axis='y', labelsize='large')
#     plt.gca().yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))  # Set y-axis values to two decimal points

# def process_images(image_dir):
#     # Get a list of all the image files in the directory
#     image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
#     # Process each image file
#     for image_file in image_files:
#         image_path = os.path.join(image_dir, image_file)
#         dct_values = get_dct_values(image_path)
#         probabilities = count_first_digit_probabilities(dct_values)
#     #     plot_probabilities(probabilities)
#     # # Plot Benford's law for comparison
#     # plt.plot(range(1, 10), benfords_law(), color='red', label='Benford\'s Law')
#     # plt.xlabel('First Digit')
#     # plt.ylabel('Probability')
#     # plt.title('Probability of First Digits of DCT Values')
#     # plt.show()
#     return probabilities


def process_images(image_dir, csv_file):
    # Get a list of all the image files in the directory
    image_files = [f for f in os.listdir(image_dir) if f.endswith('.jpg')]
    # Create a CSV file to store the probabilities
    with open(csv_file, 'w', newline='') as csvfile:
        fieldnames = ['Image'] + [str(i) for i in range(1, 10)]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # Process each image file
        for image_file in image_files:
            image_path = os.path.join(image_dir, image_file)
            dct_values = get_dct_values(image_path)
            probabilities = count_first_digit_probabilities(dct_values)
            # Write the probabilities to the CSV file
            row = {'Image': image_file}
            row.update({str(digit): probability for digit, probability in probabilities.items()})
            writer.writerow(row)


def main():
    # image_dir1='/Users/reyna/CSCI158ProjectServer/source/20realface'
    # csv_file1 = "real.csv"
    # image_dir2='/Users/reyna/CSCI158ProjectServer/source/20fakeface'
    # csv_file2 = "fake.csv"
    # process_images(image_dir1, csv_file1)
    # process_images(image_dir2, csv_file2)
    #image_dir3='/Users/reyna/CSCI158ProjectServer/source/stylegan3/Realistic_50_Images_JPG'
    #csv_file3 = 'stylegan3.csv'
    #process_images(image_dir3, csv_file3)

if __name__ == "__main__":
    main()


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Benford's law probabilities for the first digit (1-9)
benford_law = [np.log10(1 + 1/digit) for digit in range(1, 10)]

def plot_data(file_name, typeofimage):
    # Read the CSV file
    data = pd.read_csv(file_name)

    # Ensure the data is in the correct format
    if not set(data.columns[1:]) == set(map(str, range(1, 10))):
        raise ValueError("CSV file must contain probabilities for digits 1-9")

    # Convert data to float
    data.iloc[:, 1:] = data.iloc[:, 1:].astype(float)

    # Plot the data
    for index, row in data.iterrows():
        plt.plot(range(1, 10), row.iloc[1:])

    # Plot Benford's law for comparison
    plt.plot(range(1, 10), benford_law, label="Benford's Law", linestyle='--',color='yellow')

    # Add details to the plot
    plt.xlabel('First Digit')
    plt.ylabel('Probability')
    plt.title(typeofimage + 'First Digit Probabilities vs Benford\'s Law')
    plt.legend()

    # Show the plot
    plt.show()

filename1="real_first_digit_extract.csv"
filename2="fake_first_digit_extract.csv"
filename3="stylegan3.csv"
plot_data(filename1, 'Real Image ')
plot_data(filename2,'Fake Image ')
#plot_data(filename3,'StyleGan3 Image ')



FileNotFoundError: [Errno 2] No such file or directory: 'real_first_digit_extract.csv'

In [None]:
#Classifier for detecting the fake or real images

import pandas as pd
import numpy as np
from scipy.special import kl_div
from sklearn.preprocessing import normalize

# Load the data
def classifier(infile, outfile):
    data = pd.read_csv(infile)
    data = data.drop(data.columns[0], axis=1)
    # Benford's law probabilities for the first digit (1-9) with six significant figures
    benford_law = np.array([0.301029995663981, 0.176091259055681, 0.124938736608300,
                            0.096910013008056, 0.079181246047625, 0.066946789630613,
                            0.057991946977686, 0.051152522447381, 0.045757490560675])

    # Normalize the data
    data_normalized = pd.DataFrame(normalize(data, norm='l1'), columns=data.columns)

    # Function to calculate symmetrized Renyi divergence
    def renyi_divergence(p, q, alpha=0.3):
        p = p.values
        return (1/(alpha-1) * np.log(np.sum((p**alpha)*(q**(1-alpha))))).mean()

    # Function to calculate symmetrized Tsallis divergence
    def tsallis_divergence(p, q, alpha=0.3):
        p = p.values
        return ((1/(alpha-1)) * (np.sum(p**alpha + q**alpha) - np.sum((p*q)**alpha))).mean()

    # Function to calculate Kullback-Leibler divergence
    def kl_divergence(p, q):
        p = p.values
        return kl_div(p, q).mean()

    # Function to classify whether an image follows or violates Benford's law
    def classify(row):
        # Calculate the Renyi, Tsallis, KL, and JS divergences
        renyi_div = renyi_divergence(row, benford_law)
        tsallis_div = tsallis_divergence(row, benford_law)
        kl_div = kl_divergence(row, benford_law)
        # If all divergences are below their respective thresholds, the image follows Benford's law
        if renyi_div <= 0.0002 and tsallis_div <= 0.0002 and kl_div <= 0.0002:
            return 'Follows'
        else:
            return 'Violates'

    # Apply the classifier to each row (image) in the data
    data_normalized['Decision'] = data_normalized.apply(classify, axis=1)

    # Save the output to a CSV file
    data_normalized.to_csv(outfile, index=False)

    print("done")


In [None]:
# infile1="real.csv"
# outfile1="RealOut1.csv"
# infile2="fake.csv"
# outfile2="FakeOut2.csv"
# classifier(infile1,outfile1)
# classifier(infile2,outfile2)

# infile1="real.csv"
# outfile1="RealOut00002.csv"
# infile2="fake.csv"
# outfile2="FakeOut00002.csv"
# classifier(infile1,outfile1)
# classifier(infile2,outfile2)

#infile3="stylegan3.csv"
#outfile3="stylegan3Result.csv"
#classifier(infile3,outfile3)


done
