# MD5 Light Algorithm

In [None]:
import math
import struct

class Hash_value:
    """Hold one hash value in several equivalent representations.

    Attributes set by the constructor:
        hashvalue_hexa:    the hex string exactly as passed in
        hashvalue_bits:    binary string, zero-padded to 4 bits per hex digit
        hashvalue_integer: the value as an int
        hashvalue_bytes:   the value as a bytes object
    """

    def __init__(self, hashvalue_hexa):
        numeric = int(hashvalue_hexa, 16)
        bit_width = 4 * len(hashvalue_hexa)  # one hex digit encodes 4 bits

        self.hashvalue_hexa = hashvalue_hexa
        self.hashvalue_integer = numeric
        # bin() drops leading zeros, so pad back to the full width
        self.hashvalue_bits = bin(numeric)[2:].zfill(bit_width)
        self.hashvalue_bytes = bytes.fromhex(hashvalue_hexa)

class MD5_Hash:
    """MD5 implementation parameterised by word size.

    Two variants are supported:

    * ``"regular"``: 32-bit words, 64-byte blocks, 64-bit length field,
      128-bit digest (the standard MD5 layout).
    * ``"light"``: 8-bit words, 16-byte blocks, 16-bit length field,
      32-bit digest.

    Both variants run the same 64-step compression function; constants,
    rotation amounts and initial state are reduced to the word size.
    """

    # Constructor (__init__)
    def __init__(self, type = "regular"):
        """Set all variables and constants that depend on the word size.

        Args:
            type: ``"regular"`` or ``"light"``.

        Raises:
            ValueError: if ``type`` is neither of the two variants.
        """
        # NOTE: the parameter name shadows the builtin ``type``; it is kept so
        # that existing keyword callers (``MD5_Hash(type="light")``) still work.
        if type == "regular":
            self.wordsize_bit = 32
            self.bits_length_rep = 'Q'  # 64 bit / 8 byte length field
            self.padding_length = 64 - 8
        elif type == "light":
            self.wordsize_bit = 8
            self.bits_length_rep = 'H'  # 16 bit / 2 byte length field
            self.padding_length = 16 - 2
        else:
            raise ValueError("hash type must be either light or regular")
        self.wordsize_byte = self.wordsize_bit // 8
        self.blocksize_byte = self.wordsize_byte * 16
        # Bit mask that reduces any integer to one word
        self.hexa = (1 << self.wordsize_bit) - 1
        # Per-step additive constants, derived from |sin(i + 1)| and cut to one word
        self.Ks = [int(abs(math.sin(i + 1)) * (2**self.wordsize_bit)) & self.hexa for i in range(64)]
        # Per-step rotation amounts, reduced modulo the word size
        base_shifts = [7, 12, 17, 22] * 4 + [5, 9, 14, 20] * 4 + [4, 11, 16, 23] * 4 + [6, 10, 15, 21] * 4
        self.Ss = [s % self.wordsize_bit for s in base_shifts]
        # Initial state: the most significant `wordsize_bit` bits of each 32-bit constant
        drop = 32 - self.wordsize_bit
        self.a0, self.b0, self.c0, self.d0 = (
            x >> drop for x in (0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476)
        )
        self.hashlength_bit = 4 * self.wordsize_bit
        self.hashlength_byte = 4 * self.wordsize_byte

    #convert Type
    def convertType(self,m):
        """Convert a message to bytes.

        * ``bytes`` / ``bytearray`` are used as they are.
        * A non-empty ``str`` consisting only of ``'0'`` and ``'1'`` is read
          as a big-endian bit string and packed into ``ceil(len / 8)`` bytes.
        * Any other ``str`` (including the empty string) is UTF-8 encoded.

        Raises:
            ValueError: for any other input type.
        """
        if isinstance(m, (bytes, bytearray)):
            return bytes(m)
        if isinstance(m, str):
            # `m and ...`: all() is True for an empty string, and int("", 2)
            # would raise, so the empty string is treated as an empty message.
            if m and all(c in '01' for c in m):
                byte_length = (len(m) + 7) // 8  # Compute required byte length
                return int(m, 2).to_bytes(byte_length, byteorder="big")
            return m.encode('utf-8')
        raise ValueError("Input must be either a message (string) or a valid bit string.")

    #Padding Function
    def pad(self,m):
        """Return the message padded to a whole number of blocks.

        Appends the byte 0x80, then zero bytes until the length is congruent
        to ``padding_length`` modulo the block size, then the original
        message length in bits as a little-endian integer.
        """
        data = self.convertType(m)
        length_format = f'<{self.bits_length_rep}'
        # The length field stores the bit length modulo its own width (as
        # RFC 1321 does for the 64-bit field); without the mask struct.pack
        # raises for long messages, e.g. >= 8192 bytes in light mode.
        length_mask = (1 << (8 * struct.calcsize(length_format))) - 1
        original_length_bits = (len(data) * 8) & length_mask
        # Number of zero bytes needed after the 0x80 marker
        zero_count = (self.padding_length - (len(data) + 1)) % self.blocksize_byte
        return data + b'\x80' + b'\x00' * zero_count + struct.pack(length_format, original_length_bits)

    # Rotation function (rotate left)
    def rotate_left(self, x, n):
        """Rotate the word ``x`` left by ``n`` bit positions within the word size."""
        x &= self.hexa  # also maps negative intermediates into the word range
        n %= self.wordsize_bit
        return ((x << n) | (x >> (self.wordsize_bit - n))) & self.hexa

    #digest
    def digestABCD(self, A,B,C,D):
        """Serialise the state words A, B, C, D (little-endian each) into a Hash_value."""
        raw = b''.join(
            (word & self.hexa).to_bytes(self.wordsize_byte, byteorder='little')
            for word in (A, B, C, D)
        )
        return Hash_value(raw.hex())

    #Hash
    def processblock(self, block, inits):
        """Run the 64 compression steps on one block.

        Args:
            block: the bytes of one block.
            inits: the four state words [A, B, C, D] entering the block.

        Returns:
            The list [A, B, C, D] after the 64 steps. The caller adds it to
            the incoming state.
        """
        A, B, C, D = inits
        width = self.wordsize_byte
        # Split the block into 16 little-endian message words once
        words = [int.from_bytes(block[k * width:(k + 1) * width], byteorder='little') for k in range(16)]
        for i in range(64):
            # ~x yields a negative Python int; rotate_left and the final mask
            # reduce everything back into the word range.
            if i < 16:
                f = (B & C) | (~B & D)
                g = i
            elif i < 32:
                f = (B & D) | (C & ~D)
                g = (5 * i + 1) % 16
            elif i < 48:
                f = B ^ C ^ D
                g = (3 * i + 5) % 16
            else:
                f = C ^ (B | ~D)
                g = (7 * i) % 16

            rotated = self.rotate_left(A + f + self.Ks[i] + words[g], self.Ss[i])
            # Shift the state: the right-hand side is evaluated with the old values
            A, D, C, B = D, C, B, (B + rotated) & self.hexa
        return [A, B, C, D]

    def digest(self,m):
        """Hash the message ``m`` (bytes, text or bit string) and return a Hash_value."""
        blocks = self.pad(m)
        state = [self.a0, self.b0, self.c0, self.d0]
        # Process each block and add its result to the running state
        for offset in range(0, len(blocks), self.blocksize_byte):
            block = blocks[offset:offset + self.blocksize_byte]
            mixed = self.processblock(block, state)
            state = [(old + new) & self.hexa for old, new in zip(state, mixed)]
        return self.digestABCD(*state)

#check correctness by comparing with md5 library
import hashlib
# Hash a fixed sample with hashlib's MD5 and with the regular-mode MD5_Hash.
bytedata = "Hello World!".encode('utf-8')
md5_hash = hashlib.md5(bytedata).digest()
computed_hash = MD5_Hash().digest(bytedata).hashvalue_bytes
# Only a mismatch produces output; matching digests leave this check silent.
if (md5_hash != computed_hash):
    print(f"Correct hash:  {md5_hash}")
    print(f"Computed hash: {computed_hash}")

def md5_light(input_data: str) -> str:
    """Hash ``input_data`` with the light MD5 variant and return the digest as a bit string."""
    return MD5_Hash('light').digest(input_data).hashvalue_bits

# Example call: hash the text "hello" with the light variant; the recorded
# cell output below shows the resulting 32-character bit string.
md5_light("hello")

'00011011010100110111110010011110'

# Generate Dataset

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Path to Google Drive: directory prefix used when saving and loading the
# dataset .npy files (it is concatenated directly, so it ends with a slash).
file_path = '/content/drive/MyDrive/Datasets/'

Mounted at /content/drive


## Dataset for FFN
- Features: hash as an integer normalized to [0, 1]
- Label: message as bitvector

In [None]:
import numpy as np
import random
import struct

# MD5 Light, returning integer
def H(m) -> int:
    """Return the light-MD5 digest of ``m`` as an integer."""
    hasher = MD5_Hash('light')
    digest = hasher.digest(m)
    return digest.hashvalue_integer

def generate_bitstring(length):
    """Return a random string of ``length`` characters drawn from '0' and '1'."""
    bits = []
    for _ in range(length):
        bits.append(random.choice('01'))
    return ''.join(bits)

def generate_random_bitstrings(num_samples, bitlength):
    """Return a set of ``num_samples`` distinct random bit strings of ``bitlength`` bits.

    Strings are drawn with ``generate_bitstring`` until enough distinct ones
    have been collected.

    Raises:
        ValueError: if more distinct strings are requested than exist for the
            given length (the sampling loop could otherwise never finish).
    """
    # A non-positive length only ever yields the empty string.
    distinct_available = 2 ** max(bitlength, 0)
    if num_samples > distinct_available:
        raise ValueError(
            f"cannot draw {num_samples} distinct bit strings of length {bitlength}; "
            f"only {distinct_available} exist"
        )
    bitstrings = set()
    while len(bitstrings) < num_samples:
        bitstrings.add(generate_bitstring(bitlength))
    return bitstrings

def generate_dataset(num_samples=100000, msglength = 104):
    """Build the (hash, message) dataset and save it as two .npy files.

    Draws ``num_samples`` distinct random bit strings of ``msglength`` bits,
    hashes each with ``H`` and writes:

    * ``X_FFN_MD5light.npy``: float32 array, one row per message holding the
      hash divided by ``2**32 - 1``.
    * ``Y_FFN_MD5light.npy``: float32 array, one row per message holding its
      bits as 0/1 values.

    Both files are written under the module-level ``file_path`` prefix.
    With the default 104 bits a message is processed in one block.
    """
    max_hash = 2**32 - 1  # divisor that scales the hash into [0, 1]
    features = []  # one [normalized hash] row per message
    labels = []    # one bit vector per message

    for bitstring in generate_random_bitstrings(num_samples, msglength):
        features.append([H(bitstring) / max_hash])
        labels.append(np.array(list(bitstring), dtype=np.uint8))

    X = np.array(features, dtype=np.float32)
    Y = np.array(labels, dtype=np.float32)

    np.save(f"{file_path}X_FFN_MD5light.npy", X)
    np.save(f"{file_path}Y_FFN_MD5light.npy", Y)


# Build and save the dataset with 1,000,000 samples (default message length).
generate_dataset(1000000)

In [None]:
!pip install scikeras scikit-optimize

# Feedforward Neural Network Based Pre Image Attack on MD5 Light


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset
X = np.load(f"{file_path}X_FFN_MD5light.npy")  # normalized hash values
Y = np.load(f"{file_path}Y_FFN_MD5light.npy")  # 104-bit messages as bit vectors

# Check the shape of the data
print(f"X Shape: {X.shape}")  # (num_samples, 1)
print(f"Y Shape: {Y.shape}")  # (num_samples, 104)

# 80% training, 20% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X Shape: (1000000, 1)
Y Shape: (1000000, 104)


- 4 Hidden Layers mit je 384 Neuronen und ReLU für nichtlineare Transformationen (Dropout nach dem ersten Layer).
- 104 Output-Neuronen mit Sigmoid (eines pro Nachrichtenbit), um bitweise Vorhersagen zwischen 0 und 1 zu machen.
- Binary Cross-Entropy Loss, weil wir eine bitweise Klassifikation machen.

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import time
import tracemalloc
from colorama import Fore, Style

# === Define the Network ===

# Hyperparameters to use
num_layers = 4
neurons = 384
learning_rate = 0.000012
dropout_rate = 0.3425
batch_size = 736
activation = "relu"

# Map the "leaky_relu" choice to a LeakyReLU layer instance; any other
# activation name is passed through to Dense as a string.
if activation == "leaky_relu":
    activation_func = layers.LeakyReLU()
else:
    activation_func = activation

# === Build feedforward model ===
# One scalar input (the normalized hash), `num_layers` dense hidden layers
# with dropout after the first one, and one sigmoid output per message bit.
model = Sequential()
model.add(Input(shape=(1,)))
model.add(Dense(neurons, activation=activation_func))
model.add(Dropout(dropout_rate))

for _ in range(num_layers - 1):
    model.add(Dense(neurons, activation=activation_func))

model.add(Dense(104, activation="sigmoid"))  # bit vector as output
# === Compile Model ===
# The plain "accuracy" string is resolved by Keras from the output shape and,
# for a 104-wide output, is not a per-bit accuracy (the logged values near 0
# are consistent with that). BinaryAccuracy thresholds every output bit at
# 0.5; it keeps the name "accuracy" so the history/log keys stay the same.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
    loss="binary_crossentropy",
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)
model.summary()

In [None]:
# === Track Time and Memory Usage ===
# tracemalloc traces allocations made through Python's memory allocator.
tracemalloc.start()
start_time = time.time()

# === Train model ===
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss',patience=5,restore_best_weights=True)
history = model.fit(X_train, Y_train, epochs=50, batch_size=batch_size, validation_data=(X_test, Y_test), callbacks = [early_stopping])

# === Output Time taken and memory used ===
_, peak = tracemalloc.get_traced_memory()  # peak traced memory in bytes
tracemalloc.stop() # Stop tracking
elapsed_time = time.time() - start_time
print(Fore.RED + f"Time taken: {(elapsed_time / 60):.2f}min" + Style.RESET_ALL)
# Convert bytes to megabytes so the value matches the "MB" label
print(Fore.GREEN + f"Memory Used: {peak / 1e6:.2f} MB" + Style.RESET_ALL)

Epoch 1/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 71ms/step - accuracy: 0.0051 - loss: 0.6932 - val_accuracy: 1.1500e-04 - val_loss: 0.6931
Epoch 2/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 71ms/step - accuracy: 0.0129 - loss: 0.6932 - val_accuracy: 0.0000e+00 - val_loss: 0.6931
Epoch 3/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 70ms/step - accuracy: 0.0053 - loss: 0.6931 - val_accuracy: 0.0000e+00 - val_loss: 0.6931
Epoch 4/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 73ms/step - accuracy: 0.0065 - loss: 0.6931 - val_accuracy: 0.0000e+00 - val_loss: 0.6931
Epoch 5/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 70ms/step - accuracy: 0.0033 - loss: 0.6931 - val_accuracy: 1.0000e-05 - val_loss: 0.6931
Epoch 6/50
[1m1087/1087[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 70ms/step - accuracy: 0.0035 - loss: 0.6931 - val_accuracy: 4.5000e-05

## Bayesian Optimization

In [None]:
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import time
import tracemalloc
from colorama import Fore, Style

# Load data
X = np.load(f"{file_path}X_FFN_MD5light.npy")
Y = np.load(f"{file_path}Y_FFN_MD5light.npy")

# Train/test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Per-trial records; the objective function appends one entry to each per trial.
historys = []      # Keras History objects
params_ = []       # sampled hyperparameter values
trial_times = []   # wall-clock seconds per trial
trial_memory = []  # peak traced memory per trial, in MB

# Objective function for Optuna
def objective(trial):
    """Train one model with hyperparameters sampled from ``trial``.

    Builds a dense network (scalar input, 104 sigmoid outputs), trains it
    with early stopping on ``val_loss`` and records the History object, the
    sampled parameters, the elapsed time and the peak traced memory in the
    module-level lists ``historys``, ``params_``, ``trial_times`` and
    ``trial_memory``.

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The lowest validation loss reached during training (to be minimized).
    """
    # === Track Time and Memory Usage ===
    tracemalloc.start()
    start_time = time.time()
    try:
        # Tunable hyperparameters
        num_layers = trial.suggest_int("num_layers", 2, 5, step = 1)
        neurons = trial.suggest_int("neurons", 128, 1024, step=128)
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
        batch_size = trial.suggest_int("batch_size", 32, 1024, step=32)
        activation = trial.suggest_categorical("activation", ["relu", "leaky_relu","selu"])
        dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)

        # Map the "leaky_relu" choice to a LeakyReLU layer instance; other
        # activation names are passed through to Dense as strings.
        if activation == "leaky_relu":
            activation_func = LeakyReLU()
        else:
            activation_func = activation

        # Build the model
        model = Sequential()
        model.add(Input(shape=(1,)))
        model.add(Dense(neurons, activation=activation_func))
        model.add(Dropout(dropout_rate))

        for _ in range(num_layers - 1):
            model.add(Dense(neurons, activation=activation_func))

        model.add(Dense(104, activation="sigmoid"))  # bit vector as output

        # Optimizer; BinaryAccuracy scores every output bit (thresholded at
        # 0.5) and keeps the name "accuracy" so history keys stay the same.
        model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss="binary_crossentropy",
            metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
        )

        # Create the EarlyStopping callback
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True
        )
        history = model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=50, batch_size=batch_size, verbose=0, callbacks = [early_stopping])
        historys.append(history)
        params_.append([num_layers,neurons,learning_rate,batch_size,activation, dropout_rate])

        _, peak = tracemalloc.get_traced_memory()  # Memory in bytes
    finally:
        # Always stop tracing, even when a trial fails part-way through
        tracemalloc.stop()
    elapsed_time = time.time() - start_time
    trial_times.append(elapsed_time)
    trial_memory.append(peak / 1e6)  # bytes -> MB

    # Score the trial by its best epoch: early stopping restores the best
    # weights, so the last epoch's loss (up to `patience` epochs past the
    # optimum) would not describe the restored model.
    val_loss = min(history.history['val_loss'])

    return val_loss   # we minimize the loss

# === Run Optuna Optimization ===
# Minimize the value returned by `objective` over 20 trials.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# === Print best hyperparameters ===
print(Fore.CYAN + f"\nBest Hyperparameters: {study.best_params}" + Style.RESET_ALL)

# === Output Time taken and memory used ===
# trial_times holds seconds per trial; trial_memory holds peak traced memory in MB.
print(Fore.RED + f"Total Time taken: {(sum(trial_times)/60):.2f}min" + Style.RESET_ALL)
print(Fore.YELLOW + f"Average Time Per Trial: {sum(trial_times)/len(trial_times):.2f}s" + Style.RESET_ALL)
print(Fore.GREEN + f"Average Memory Used: {sum(trial_memory)/len(trial_memory):.2f} MB" + Style.RESET_ALL)

[I 2025-02-08 11:41:34,503] A new study created in memory with name: no-name-324f2def-4682-45e2-8d89-ca47ad34373c
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-2)
[I 2025-02-08 11:42:17,809] Trial 0 finished with value: 0.6931566596031189 and parameters: {'num_layers': 2, 'neurons': 128, 'learning_rate': 0.0010594973952916386, 'batch_size': 928, 'activation': 'relu', 'dropout': 0.37710729289718314}. Best is trial 0 with value: 0.6931566596031189.
[I 2025-02-08 11:48:07,236] Trial 1 finished with value: 0.6931626200675964 and parameters: {'num_layers': 2, 'neurons': 768, 'learning_rate': 2.6177986266633664e-05, 'batch_size': 256, 'activation': 'selu', 'dropout': 0.46737891480278193}. Best is trial 0 with value: 0.6931566596031189.
[I 2025-02-08 12:24:53,569] Trial 2 finished with value: 0.6936487555503845 and parameters: {'num_layers': 3, 'neurons': 896, 'learning_rate': 0.0010787913955407527, 'batch_size': 64, 'activation': 'selu', 'dropout': 0.12483081308625295}

[36m
Best Hyperparameters: {'num_layers': 4, 'neurons': 384, 'learning_rate': 1.1737794583316627e-05, 'batch_size': 736, 'activation': 'relu', 'dropout': 0.34251688664150143}[0m
[31mTotal Time taken: 160.31min[0m
[33mAverage Time Per Trial: 480.93s[0m
[32mAverage Memory Used: 45.25 MB[0m


# Installations


In [None]:
!pip install colorama

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M