In [16]:
import os
import sys

# Get the directory of the current notebook 
notebook_dir_path = os.path.dirname(os.path.abspath('__file__'))
project_path = os.path.join(notebook_dir_path, '..') # Change this

sys.path.append(notebook_dir_path)
sys.path.append(project_path)

from struct import unpack
from collections import defaultdict
import math
import os
import matplotlib.pyplot as plt
import binascii

import numpy as np

# Setup

In [17]:
image_directory_path = f"/share/flickr30k_images"
# file_path_to_save = f"{project_path}/outputs/jpeg_image_data.bin"

target_image_size = 256  # Optional: Resize all images to (H, W) while H=W
batch_size=32

In [18]:


marker_mapping = {
    0xffd8: "Start of Image",
    0xfffe: "Comment",
    0xffe0: "Application Default Header",
    0xffdb: "Quantization Table",
    0xffc0: "Start of Frame",
    0xffc4: "Define Huffman Table",
    0xffda: "Start of Scan",
    0xffd9: "End of Image"
}


class JPEG:
    def __init__(self, image_file):
        with open(image_file, 'rb') as f:
            self.img_data = f.read()
    
    def decode(self):
        data = self.img_data
        while(True):
            marker, = unpack(">H", data[0:2])
            print(marker_mapping.get(marker))
            if marker == 0xffd8: # "Start of Image", there is no `length` field after it.
                data = data[2:]
            elif marker == 0xffd9: # "End of Image"
                return
            elif marker == 0xffda: # If pointing to "Start of Scan", jump to the last 2nd byte, i.e., "End of Image".
                len_start_of_scan_and_image_data = len(data[:2])
                print(f"Length of \"Start of Scan\" segment (including \"Image Data\" segment): {len_start_of_scan_and_image_data}")
                
                data = data[-2:]
            else:
                lenchunk, = unpack(">H", data[2:4])
                print(f"Length of this segment: {lenchunk}")
                data = data[2+lenchunk:]            
            if len(data)==0:
                break 
    def extract_data(self):
        '''
        Extract the binaries between `0xffd8` and `0xffd9`, i.e., all the data between "Start of Image" and "End of Image" segments.
        '''
        data = self.img_data
        start_idx = data.find(b'\xff\xd8')  # Start of Image
        if start_idx == -1:
            return None  # Start marker not found

        end_idx = data.find(b'\xff\xd9', start_idx)  # End of Image
        if end_idx == -1:
            return None  # End marker not found

        start_idx += 2 # Exclute the start marker
        length = end_idx - start_idx
        print(f"Length of extracted  binary {length}")
        return data[start_idx:end_idx], length  # Exclute the end marker.

def extract_data_from_jpeg_binary(binary_data):
    '''
    Extract the binaries between `0xffd8` and `0xffd9`, i.e., all the data between "Start of Image" and "End of Image" segments.
    '''
    start_idx = binary_data.find(b'\xff\xd8')  # Start of Image
    if start_idx == -1:
        return None  # Start marker not found

    end_idx = binary_data.find(b'\xff\xd9', start_idx)  # End of Image
    if end_idx == -1:
        return None  # End marker not found

    start_idx += 2 # Exclute the start marker
    length = end_idx - start_idx
    print(f"Length of extracted  binary {length}")
    return binary_data[start_idx:end_idx], length  # Exclute the end marker.


def calculate_ngram_distribution(binary_data, N=20):
    # Dictionary to count occurrences of each n-gram
    ngram_counts = defaultdict(int)
    
    # Iterate over the binary data by stepping through each byte
    for i in range(len(binary_data) - N + 1):  # Adjust loop to ensure enough bytes for the last n-gram
        # Extract n consecutive bytes to form an n-gram
        ngram = binary_data[i:i+N]
        ngram_counts[ngram] += 1
    
    # Total number of n-grams
    total_ngrams = sum(ngram_counts.values())
    
    # Dictionary to store the probability of each n-gram
    ngram_probabilities = {k: v / total_ngrams for k, v in ngram_counts.items()}
    
    return ngram_probabilities

def calculate_ngram_frequencies(binary_data, N=20):
    # Dictionary to count occurrences of each n-gram
    ngram_counts = defaultdict(int)
    
    # Iterate over the binary data by stepping through each byte
    for i in range(len(binary_data) - N + 1):  # Adjust loop to ensure enough bytes for the last n-gram
        # Extract n consecutive bytes to form an n-gram
        ngram = binary_data[i:i+N]
        ngram_counts[ngram] += 1
        
    # Dictionary to store the probability of each n-gram
    ngram_frequencies = {k: v  for k, v in ngram_counts.items()}
    
    return ngram_frequencies

def calculate_entropy(probabilities):
    entropy = 0
    for prob in probabilities.values():
        if prob > 0:  # Log of zero is undefined, so we skip those probabilities
            entropy -= prob * math.log2(prob)
    return entropy

    

# Read binary data from JPEG files

In [19]:
# Collect all JPEG or PNG image paths
image_paths = [os.path.join(image_directory_path, f) for f in os.listdir(image_directory_path) if f.endswith(('.jpg', '.jpeg', ''))]

# List to hold binary data and lengths
binaries_and_lengths = []

# Iterate over each image path, create JPEG object, extract data, and collect it
for image_path in image_paths:
    jpeg_image = JPEG(image_path)
    data, length = jpeg_image.extract_data()
    binaries_and_lengths.append(
        {"data": data,
         "length": length}
    )
    print(f"File: {os.path.basename(image_path)}")
    print(f"Length of data: {length} bytes")
    # print(f"Data snippet (first 100 bytes or full data if shorter): {data[:100]}")





Length of extracted  binary 53110
File: 3787801.jpg
Length of data: 53110 bytes
Length of extracted  binary 70224
File: 4985704.jpg
Length of data: 70224 bytes
Length of extracted  binary 62290
File: 4489731.jpg
Length of data: 62290 bytes
Length of extracted  binary 60865
File: 3765589.jpg
Length of data: 60865 bytes
Length of extracted  binary 57342
File: 301246.jpg
Length of data: 57342 bytes
Length of extracted  binary 60352
File: 4376178.jpg
Length of data: 60352 bytes
Length of extracted  binary 46061
File: 4199555.jpg
Length of data: 46061 bytes
Length of extracted  binary 40095
File: 178045.jpg
Length of data: 40095 bytes
Length of extracted  binary 69743
File: 371897.jpg
Length of data: 69743 bytes
Length of extracted  binary 65923
File: 4183120.jpg
Length of data: 65923 bytes
Length of extracted  binary 57748
File: 3734864.jpg
Length of data: 57748 bytes
Length of extracted  binary 55558
File: 4749855.jpg
Length of data: 55558 bytes
Length of extracted  binary 58591
File: 256

# Compute the N-Gram distribution

In [20]:
# Concatenate all binary data for byte pair distribution analysis
all_binary_data = b''.join([item['data'] for item in binaries_and_lengths if item['data']])

N = 3
ngram_distribution = calculate_ngram_distribution(all_binary_data, N=N)

# Calculate entropy
entropy = calculate_entropy(ngram_distribution)
alphabet_size = len(ngram_distribution)
entropy_if_uniform = math.log2(alphabet_size)
# Output results
print(f"N-gram where N={N}")
print(f"Alphabet size (unique N-grams): {alphabet_size}")
print(f"The entropy of the N-gram distribution is: {entropy:.4f} bits")
print(f"The entropy of the the (idea) N-gram uniform distribution is: {entropy_if_uniform}")
print(f"The difference: {abs(entropy - entropy_if_uniform)}")


N-gram where N=3
Alphabet size (unique N-grams): 2059652
The entropy of the N-gram distribution is: 20.8765 bits
The entropy of the the (idea) N-gram uniform distribution is: 20.973969168729834
The difference: 0.09747147385308708


# Plot the N-Gram distribution

In [21]:
# # Convert n-gram byte sequences to integer representations
# int_ngram_distribution = {int.from_bytes(k, 'big'): v for k, v in ngram_distribution.items()}

# # Now plot the distribution using integer representations
# plt.figure(figsize=(12, 6))
# # Since there can be many n-grams, we plot only a sample or the most significant ones
# # You might still need to manage how many you plot if the number is very large
# # sample_size = min(100, len(int_ngram_distribution))  # Adjust sample size as needed
# sample_size = len(int_ngram_distribution)  # Adjust sample size as needed

# sampled_ngrams = dict(list(int_ngram_distribution.items())[:sample_size])

# plt.bar(sampled_ngrams.keys(), sampled_ngrams.values(), color='blue')
# plt.title(f'N-Gram Distribution for N={N}')
# plt.xlabel('N-Grams (Integer Representation)')
# plt.ylabel('Probability')
# plt.xticks(rotation=45, ha='right')  # Adjust rotation and alignment if needed
# plt.tight_layout()  # Adjust layout to make room for label rotation
# plt.show()

# Iterage N

In [22]:
# Loop through each N from 2 to 20


for N in range(1, 21):  # 21 is exclusive, so it iterates up to 20
    ngram_distribution = calculate_ngram_distribution(all_binary_data, N=N)
    ngram_frequencies= calculate_ngram_frequencies(all_binary_data, N=N)

    # Calculate statistical measures
    frequency_values = list(ngram_frequencies.values())
    mean_frequency = np.mean(frequency_values)
    median_frequency = np.median(frequency_values)

    # Calculate entropy
    entropy = calculate_entropy(ngram_distribution)
    alphabet_size = len(ngram_distribution)
    entropy_if_uniform = math.log2(alphabet_size) if alphabet_size > 0 else 0
    difference = abs(entropy - entropy_if_uniform)

    
    # Sort ngrams by frequencies in descending order
    sorted_ngrams_by_freq = sorted(ngram_frequencies.items(), key=lambda x: x[1], reverse=True)

    # Find the index where frequency drops to 1
    # (assuming there are any n-grams with frequency 1)
    index_frequency_one = next((i for i, (_, freq) in enumerate(sorted_ngrams_by_freq) if freq == 1), None)

    # Output results
    print(f"N-gram where N={N}")
    print(f"Alphabet size (unique N-grams): {alphabet_size}")
    print(f"Mean frequency: {mean_frequency:.2f}, Median frequency: {median_frequency}")
    

    if index_frequency_one is not None:
        print(f"Index after which all n-grams have frequencies of 1: {index_frequency_one}")
    else:
        print("No n-grams with frequency of 1 found.")
        
    # Assuming ngram_frequencies has been computed
    top_ngrams = sorted(ngram_frequencies.items(), key=lambda x: x[1], reverse=True)[:5]

    # Prepare and print top 5 n-grams with their frequencies
    top_ngrams_formatted = []
    for ngram, frequency in top_ngrams:
        # Convert the binary n-gram to a hexadecimal string
        hex_representation = "0x" + binascii.hexlify(ngram).decode().upper()
        # Append the formatted string with frequency to the list
        top_ngrams_formatted.append(f"{hex_representation} (Frequency: {frequency})")

    # Print all top n-grams
    print("Top 5 N-grams and their frequencies:")
    for ngram in top_ngrams_formatted:
        print(ngram)
    
    print(f"The entropy of the N-gram distribution is: {entropy:.4f} bits")
    print(f"The entropy of the ideal N-gram uniform distribution is: {entropy_if_uniform:.4f} bits")
    print(f"The difference: {difference:.4f} bits")
    print("-" * 50)  # Separator for readability between different N outputs


N-gram where N=1
Alphabet size (unique N-grams): 256
Mean frequency: 8854.33, Median frequency: 8731.5
No n-grams with frequency of 1 found.
Top 5 N-grams and their frequencies:
0x00 (Frequency: 21801)
0x92 (Frequency: 12215)
0x24 (Frequency: 11988)
0x49 (Frequency: 11738)
0x8E (Frequency: 10868)
The entropy of the N-gram distribution is: 7.9877 bits
The entropy of the ideal N-gram uniform distribution is: 8.0000 bits
The difference: 0.0123 bits
--------------------------------------------------
N-gram where N=2
Alphabet size (unique N-grams): 65287
Mean frequency: 34.72, Median frequency: 33.0
Index after which all n-grams have frequencies of 1: 65286
Top 5 N-grams and their frequencies:
0xFF00 (Frequency: 7663)
0x2828 (Frequency: 2081)
0x0000 (Frequency: 1113)
0x0101 (Frequency: 388)
0x0100 (Frequency: 377)
The entropy of the N-gram distribution is: 15.8851 bits
The entropy of the ideal N-gram uniform distribution is: 15.9945 bits
The difference: 0.1094 bits
-------------------------