# Deep Learning Project

In [13]:
import os
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# For PyTorch or TensorFlow later
# import torch
# from torch.utils.data import Dataset, DataLoader

# Root folder of the dataset, laid out as <split>/<label>/<files>.
# Set the EXE_DATA_DIR environment variable to point the notebook at another
# copy of the data; when it is unset the original local path is used.
DATA_DIR = Path(os.environ.get(
    "EXE_DATA_DIR",
    r"D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000",
))

# Sanity check: report how many entries each split/label folder contains.
for split in ['train', 'valid', 'test']:
    for label in ['ben', 'mal']:
        files = list((DATA_DIR / split / label).glob('*'))
        print(f"{split}/{label}: {len(files)} files")

MAX_LEN = 10000  # Number of bytes per file (pad/truncate to this length)

def load_binary_file(path, max_len=MAX_LEN):
    """Read the first ``max_len`` bytes of a file as a fixed-length uint8 vector.

    Only the leading ``max_len`` bytes are read from disk, so large files are
    not loaded in full. Files shorter than ``max_len`` are right-padded with
    zeros.

    Args:
        path: Location of the file to read (anything ``open`` accepts).
        max_len: Length of the returned vector, in bytes. Must be >= 0.

    Returns:
        A new, writable 1-D ``np.uint8`` array of exactly ``max_len`` elements.

    Raises:
        ValueError: If ``max_len`` is negative.
        OSError: If the file cannot be opened or read.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    with open(path, 'rb') as f:
        head = f.read(max_len)  # never pull more than we are going to keep

    # Start from zeros so the padding is already in place, then copy the bytes
    # in. np.frombuffer on a bytes object yields a read-only view; copying into
    # a fresh array gives callers a writable result in every case.
    byte_arr = np.zeros(max_len, dtype=np.uint8)
    if head:
        byte_arr[:len(head)] = np.frombuffer(head, dtype=np.uint8)
    return byte_arr

train/ben: 10922 files
train/mal: 10922 files
valid/ben: 2500 files
valid/mal: 2446 files
test/ben: 10000 files
test/mal: 10000 files


In [14]:
def load_dataset(split):
    """Load every file of one dataset split into feature and label arrays.

    The 'ben' and 'mal' sub-folders of ``DATA_DIR / split`` are scanned in that
    order; files found under 'ben' get label 0 and files under 'mal' get
    label 1. Entries that are not regular files are skipped.

    Args:
        split: Name of the split sub-folder under ``DATA_DIR``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` stacks the vectors returned by
        ``load_binary_file`` for each file, ``y`` holds the matching labels.
    """
    samples = []
    targets = []
    class_folders = ('ben', 'mal')
    for class_id in range(len(class_folders)):
        folder = class_folders[class_id]
        folder_path = DATA_DIR / split / folder
        print(f'Loading {folder} files from {folder_path}')
        for entry in tqdm(folder_path.glob('*')):
            if not entry.is_file():
                continue  # ignore sub-directories and other non-file entries
            samples.append(load_binary_file(entry))
            targets.append(class_id)
    features = np.array(samples)
    labels = np.array(targets)
    return features, labels

# Load all three splits in full (every file in each split folder is read).
X_train, y_train = load_dataset(split='train')
X_val, y_val = load_dataset(split='valid')
X_test, y_test = load_dataset(split='test')

# Report array shapes and per-class counts as a quick sanity check.
summary_lines = [
    f'Train shape: {X_train.shape}, Labels: {np.bincount(y_train)}',
    f'Val shape: {X_val.shape}, Labels: {np.bincount(y_val)}',
    f'Test shape: {X_test.shape}, Labels: {np.bincount(y_test)}',
]
print('\n'.join(summary_lines))

Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\train\ben


10922it [02:07, 85.66it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\train\mal


10922it [01:46, 102.66it/s]


Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\valid\ben


2500it [00:16, 154.92it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\valid\mal


2446it [00:24, 101.42it/s]


Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\test\ben


10000it [01:25, 116.71it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\test\mal


10000it [01:50, 90.41it/s]


Train shape: (21844, 10000), Labels: [10922 10922]
Val shape: (4946, 10000), Labels: [2500 2446]
Test shape: (20000, 10000), Labels: [10000 10000]


In [15]:
def _as_image_batch(flat_bytes):
    """Reshape flat 10000-value rows into (100, 100, 1) float32 images divided by 255."""
    images = flat_bytes.reshape(-1, 100, 100, 1)
    return images.astype('float32') / 255.0

# Turn every split from flat byte rows into single-channel 100x100 "images".
X_train_2d = _as_image_batch(X_train)
X_val_2d = _as_image_batch(X_val)
X_test_2d = _as_image_batch(X_test)

# One-hot encode the integer labels into two columns.
from tensorflow.keras.utils import to_categorical
y_train_cat = to_categorical(y_train, num_classes=2)
y_val_cat = to_categorical(y_val, num_classes=2)
y_test_cat = to_categorical(y_test, num_classes=2)


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

def _conv_stage(filters):
    """Return one Conv2D(3x3, relu) -> BatchNormalization -> MaxPooling2D(2x2) stage."""
    return [
        Conv2D(filters, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
    ]


# Three convolutional stages with 32, 64 and 128 filters, followed by a
# dropout-regularised dense classifier head.
model = Sequential([
    # Explicit Input layer; used instead of passing input_shape to the first Conv2D.
    Input(shape=(100, 100, 1)),

    *_conv_stage(32),
    *_conv_stage(64),
    *_conv_stage(128),

    Flatten(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(2, activation='softmax'),  # 2 classes: benign, malicious
])


In [17]:
# Categorical cross-entropy pairs with the one-hot labels and the 2-unit
# softmax output; accuracy is tracked as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [18]:
# Train for 20 epochs in mini-batches of 8, validating on the 'valid' split
# after every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    x=X_train_2d,
    y=y_train_cat,
    batch_size=8,
    epochs=20,
    verbose=2,
    validation_data=(X_val_2d, y_val_cat),
)


Epoch 1/20
2731/2731 - 153s - 56ms/step - accuracy: 0.6852 - loss: 0.6294 - val_accuracy: 0.7523 - val_loss: 0.4991
Epoch 2/20
2731/2731 - 149s - 55ms/step - accuracy: 0.7534 - loss: 0.4912 - val_accuracy: 0.7467 - val_loss: 0.4938
Epoch 3/20
2731/2731 - 149s - 55ms/step - accuracy: 0.7777 - loss: 0.4576 - val_accuracy: 0.7764 - val_loss: 0.4595
Epoch 4/20
2731/2731 - 157s - 57ms/step - accuracy: 0.8060 - loss: 0.4148 - val_accuracy: 0.8201 - val_loss: 0.3935
Epoch 5/20
2731/2731 - 157s - 57ms/step - accuracy: 0.8254 - loss: 0.3804 - val_accuracy: 0.8188 - val_loss: 0.3934
Epoch 6/20
2731/2731 - 155s - 57ms/step - accuracy: 0.8423 - loss: 0.3495 - val_accuracy: 0.7938 - val_loss: 0.4444
Epoch 7/20
2731/2731 - 154s - 56ms/step - accuracy: 0.8613 - loss: 0.3219 - val_accuracy: 0.8463 - val_loss: 0.3712
Epoch 8/20
2731/2731 - 153s - 56ms/step - accuracy: 0.8709 - loss: 0.2966 - val_accuracy: 0.8457 - val_loss: 0.3595
Epoch 9/20
2731/2731 - 158s - 58ms/step - accuracy: 0.8849 - loss: 0.274

In [19]:
# Score the trained model on the test split; evaluate() returns the loss
# followed by the compiled metrics (here: accuracy).
test_loss, test_acc = model.evaluate(
    x=X_test_2d,
    y=y_test_cat,
    verbose=0,
)
print(f"Test Accuracy: {test_acc * 100:.2f}%")


Test Accuracy: 84.85%
