In [29]:
# RUN ONCE
# import kagglehub

# # Download latest version
# path = kagglehub.dataset_download("marcozuppelli/stegoimagesdataset")
# print("Path to dataset files:", path)

In [31]:
from PIL import Image
import numpy as np
import pandas as pd
import glob
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle



In [2]:
def getLSBFloatBytes(filePath):
    img = Image.open(filePath).convert("RGB")
    pixels = np.array(img).flatten()
    lsb_array = pixels & 1

    usable_length = (len(lsb_array) // 8) * 8
    lsb_array = lsb_array[:usable_length]

    lsb_reshaped = lsb_array.reshape((-1, 8))
    powers = 2**np.arange(7, -1, -1)
    byte_values = np.dot(lsb_reshaped, powers).astype(np.float32)

    float_bytes = byte_values / 255.0
    return float_bytes.tolist()


In [None]:
### GETTING THE PATHS OF ALL THE IMAGES IN THE KAGGLE DATASET
basePath = '/Users/gaim/.cache/kagglehub/datasets/marcozuppelli/stegoimagesdataset/versions/2' ## UPDATE THIS WITH YOUR PATH THAT YOU GET FROM IMPORTING THE DATASET

trainStegImgs = glob.glob(basePath + '/train/train/stego/*.png') #get all the steg image paths
trainCleanImgs = glob.glob(basePath + '/train/train/clean/*.png') #get all the clean image paths
testStegImgs = glob.glob(basePath + '/test/test/stego/*.png')
testCleanImgs = glob.glob(basePath + '/test/test/clean/*.png')

In [None]:
### GET THE RAW BYTES FOR EACH IMAGE AND THEN ADD THEM TO A PANDAS DATAFRAME

trainRows = []
for img in trainStegImgs[:1000]: ## UPDATE/GET RID OF ARRAY SLICING TO CONTROL DATA SIZE
    rowInfo = {
        'imagePath' : img,
        'rawBytes' : getLSBFloatBytes(img),
        'hasSteg' : 1
    }
    trainRows.append(rowInfo)
    #print(img, "COMPLETED")


for img in trainCleanImgs[:1000]: ## UPDATE/GET RID OF ARRAY SLICING TO CONTROL DATA SIZE
    rowInfo = {
        'imagePath' : img,
        'rawBytes' : getLSBFloatBytes(img),
        'hasSteg' : 0
    }
    trainRows.append(rowInfo)
    


trainDf = pd.DataFrame(trainRows)
trainDf = shuffle(trainDf, random_state=8)
display(trainDf)

Unnamed: 0,imagePath,rawBytes,hasSteg
1859,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1059,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
114,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8352941274642...",1
791,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8627451062202...",1
1813,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
...,...,...,...
986,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2941176593303...",1
133,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.1764705926179886, 0.1568627506494522, ...",1
361,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8313725590705...",1
1364,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [None]:
### GET THE RAW BYTES FOR EACH IMAGE AND THEN ADD THEM TO A PANDAS DATAFRAME

testRows = []
for img in testStegImgs[:1000]: ## UPDATE/GET RID OF ARRAY SLICING TO CONTROL DATA SIZE
    rowInfo = {
        'imagePath' : img,
        'rawBytes' : getLSBFloatBytes(img),
        'hasSteg' : 1
    }
    testRows.append(rowInfo)
    #print(img, "COMPLETED")


for img in testCleanImgs[:1000]: ## UPDATE/GET RID OF ARRAY SLICING TO CONTROL DATA SIZE
    rowInfo = {
        'imagePath' : img,
        'rawBytes' : getLSBFloatBytes(img),
        'hasSteg' : 0
    }
    testRows.append(rowInfo)
    


testDf = pd.DataFrame(testRows)
testDf = shuffle(testDf, random_state=8)
display(testDf)

Unnamed: 0,imagePath,rawBytes,hasSteg
1859,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
1059,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
114,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8117647171020...",1
791,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.38039216...",1
1813,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
...,...,...,...
986,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.7725490331649...",1
133,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.18431372...",1
361,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0392156876623...",1
1364,/Users/gaim/.cache/kagglehub/datasets/marcozup...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [12]:
# trainDf.to_csv('trainDf.csv')

In [13]:
# testDf.to_csv('testDf.csv')

In [14]:
# trainDf = pd.read_csv('trainDf.csv')

In [15]:
# testDf = pd.read_csv('testDf.csv')

In [26]:
def prepareData(df):
    X = np.stack(df['rawBytes'].values)  # each list becomes a row
    y = np.array(df['hasSteg'], dtype=np.uint8)  # Binary labels
    return X, y

xTrain, yTrain = prepareData(trainDf)
xTest, yTest = prepareData(testDf)

In [34]:
model = Sequential()

# Add the input layer and first hidden layer
# Assuming the number of features (raw bytes per image) is the length of xTrain[0]
input_size = xTrain.shape[1]  # This is the length of your byte data per sample (number of features)

model.add(Dense(128, input_dim=input_size, activation='relu'))  # First hidden layer
model.add(Dropout(0.2))  # Dropout to prevent overfitting

# Add another hidden layer
model.add(Dense(64, activation='relu'))

# Add the output layer with a single unit for binary classification (0 or 1)
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Compile the model
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model to see the structure
model.summary()

# Train the model
history = model.fit(xTrain, yTrain, epochs=11, batch_size=32, validation_data=(xTest, yTest))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(xTest, yTest)
print(f"Test Accuracy: {test_accuracy*100:.2f}%")

Epoch 1/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 88ms/step - accuracy: 0.5589 - loss: 5.1298 - val_accuracy: 0.6685 - val_loss: 0.5640
Epoch 2/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - accuracy: 0.6781 - loss: 0.8986 - val_accuracy: 0.7190 - val_loss: 0.6311
Epoch 3/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step - accuracy: 0.7288 - loss: 0.5634 - val_accuracy: 0.7530 - val_loss: 0.5050
Epoch 4/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - accuracy: 0.7578 - loss: 0.5002 - val_accuracy: 0.7635 - val_loss: 0.4773
Epoch 5/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 66ms/step - accuracy: 0.7819 - loss: 0.4728 - val_accuracy: 0.7395 - val_loss: 0.5184
Epoch 6/11
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - accuracy: 0.7884 - loss: 0.4792 - val_accuracy: 0.7340 - val_loss: 0.5671
Epoch 7/11
[1m63/63[0m [32m━━━━

In [35]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)