# Deep Learning Project

In [3]:
import os
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Optional PyTorch imports (left disabled; the model below is built with TensorFlow/Keras)
# import torch
# from torch.utils.data import Dataset, DataLoader

# Root folder that holds the <split>/<label>/ sub-directories.
# Set the MALWARE_DATA_DIR environment variable to point somewhere else;
# when it is unset the original local path is used, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get("MALWARE_DATA_DIR", r"C:\Users\heran\Downloads\total"))

# Sanity check: report how many files exist for every split/class pair.
# Only regular files are counted, matching what load_dataset actually loads.
for split in ['train', 'valid', 'test']:
    for label in ['ben', 'mal']:
        files = [p for p in (DATA_DIR / split / label).glob('*') if p.is_file()]
        print(f"{split}/{label}: {len(files)} files")

MAX_LEN = 10000  # Number of bytes per file (pad/truncate to this length)

def load_binary_file(path, max_len=MAX_LEN):
    """Read the first ``max_len`` bytes of a file as a fixed-length uint8 vector.

    Files longer than ``max_len`` are truncated; shorter files are
    right-padded with zeros.

    Args:
        path: Path of the file to read (opened in binary mode).
        max_len: Length of the returned vector in bytes.

    Returns:
        A 1-D ``np.uint8`` array of exactly ``max_len`` elements. The array
        owns its memory and is writable in both the truncate and pad cases.
    """
    with open(path, 'rb') as f:
        # Only the leading max_len bytes are ever used, so do not read the rest.
        raw = f.read(max_len)
    # Copy into a fresh zero-filled buffer: this pads short files and avoids
    # returning a read-only view that keeps the raw bytes object alive.
    byte_arr = np.zeros(max_len, dtype=np.uint8)
    byte_arr[:len(raw)] = np.frombuffer(raw, dtype=np.uint8)
    return byte_arr

train/ben: 30000 files
train/mal: 30000 files
valid/ben: 2000 files
valid/mal: 2000 files
test/ben: 4500 files
test/mal: 4500 files


In [5]:
def load_dataset(split):
    """Load every file of one split into a byte matrix with binary labels.

    Files are read from ``DATA_DIR / split / 'ben'`` (label 0) and
    ``DATA_DIR / split / 'mal'`` (label 1) through ``load_binary_file``.
    Directory entries that are not regular files are skipped.

    Args:
        split: Name of the sub-directory under ``DATA_DIR``
            (the notebook uses 'train', 'valid' and 'test').

    Returns:
        Tuple ``(X, y)`` where ``X`` is a ``uint8`` array of shape
        ``(n_files, MAX_LEN)`` and ``y`` is an integer array of shape
        ``(n_files,)``. An empty split yields shapes ``(0, MAX_LEN)`` and ``(0,)``.

    Raises:
        FileNotFoundError: If one of the class folders does not exist.
    """
    # Pass 1: enumerate files so the output can be allocated once, up front.
    per_class = []
    for folder in ['ben', 'mal']:
        folder_path = DATA_DIR / split / folder
        if not folder_path.is_dir():
            # A missing folder would otherwise silently produce an empty dataset.
            raise FileNotFoundError(f'Data folder not found: {folder_path}')
        # glob() order is arbitrary; sort so sample order is reproducible.
        paths = sorted(p for p in folder_path.glob('*') if p.is_file())
        per_class.append((folder_path, paths))

    n_total = sum(len(paths) for _, paths in per_class)
    # Preallocate instead of np.array(list_of_rows), which briefly holds two
    # full copies of the data in memory.
    X = np.empty((n_total, MAX_LEN), dtype=np.uint8)
    y = np.empty(n_total, dtype=int)

    # Pass 2: fill rows in place; the list position is the class label.
    row = 0
    for label, (folder_path, paths) in enumerate(per_class):
        print(f'Loading {folder_path.name} files from {folder_path}')
        # Iterating a list (not a generator) lets tqdm show a total and ETA.
        for filepath in tqdm(paths):
            X[row] = load_binary_file(filepath, max_len=MAX_LEN)
            y[row] = label
            row += 1
    return X, y

# Load all three splits in full (every file under train/, valid/ and test/).
X_train, y_train = load_dataset('train')
X_val, y_val = load_dataset('valid')
X_test, y_test = load_dataset('test')


def _report_split(tag, features, labels):
    """Print the feature-matrix shape and the per-class label counts of one split."""
    print(f'{tag} shape: {features.shape}, Labels: {np.bincount(labels)}')


_report_split('Train', X_train, y_train)
_report_split('Val', X_val, y_val)
_report_split('Test', X_test, y_test)

Loading ben files from C:\Users\heran\Downloads\total\train\ben


30000it [00:44, 677.65it/s]


Loading mal files from C:\Users\heran\Downloads\total\train\mal


30000it [00:44, 673.90it/s]


Loading ben files from C:\Users\heran\Downloads\total\valid\ben


2000it [00:02, 673.94it/s]


Loading mal files from C:\Users\heran\Downloads\total\valid\mal


2000it [00:02, 788.88it/s] 


Loading ben files from C:\Users\heran\Downloads\total\test\ben


4500it [00:08, 544.39it/s]


Loading mal files from C:\Users\heran\Downloads\total\test\mal


4500it [00:07, 566.22it/s]


Train shape: (60000, 10000), Labels: [30000 30000]
Val shape: (4000, 10000), Labels: [2000 2000]
Test shape: (9000, 10000), Labels: [4500 4500]


In [7]:
from tensorflow.keras.utils import to_categorical


def _to_image_batch(flat_bytes):
    """Reshape rows of 10000 values to (100, 100, 1) float32 images divided by 255."""
    return flat_bytes.reshape(-1, 100, 100, 1).astype('float32') / 255.0


# Flat (10000,) byte vectors -> (100, 100, 1) images for the 2-D CNN.
X_train_2d = _to_image_batch(X_train)
X_val_2d = _to_image_batch(X_val)
X_test_2d = _to_image_batch(X_test)

# One-hot encode the integer labels into 2 columns.
y_train_cat, y_val_cat, y_test_cat = (
    to_categorical(labels, 2) for labels in (y_train, y_val, y_test)
)


In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

def _conv_stage(filters):
    """Return one Conv2D(3x3, ReLU) -> BatchNormalization -> MaxPooling2D(2x2) stage."""
    return [
        Conv2D(filters, (3, 3), activation='relu'),
        BatchNormalization(),
        MaxPooling2D((2, 2)),
    ]


model = Sequential([
    # The input shape is declared with a dedicated Input layer rather than
    # an input_shape argument on the first Conv2D.
    Input(shape=(100, 100, 1)),

    # Three convolutional stages with 32, 64 and 128 filters.
    *_conv_stage(32),
    *_conv_stage(64),
    *_conv_stage(128),

    # Classifier head.
    Flatten(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(2, activation='softmax'),  # 2 classes: benign, malicious
])


In [13]:
# Categorical cross-entropy pairs with the one-hot labels and the 2-unit softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [15]:
# Train for 20 epochs with mini-batches of 8, scoring the validation split after
# every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    x=X_train_2d,
    y=y_train_cat,
    batch_size=8,
    epochs=20,
    verbose=2,
    validation_data=(X_val_2d, y_val_cat),
)


Epoch 1/20
7500/7500 - 503s - 67ms/step - accuracy: 0.7314 - loss: 0.5338 - val_accuracy: 0.7918 - val_loss: 0.4320
Epoch 2/20
7500/7500 - 493s - 66ms/step - accuracy: 0.7931 - loss: 0.4295 - val_accuracy: 0.8322 - val_loss: 0.3739
Epoch 3/20
7500/7500 - 492s - 66ms/step - accuracy: 0.8257 - loss: 0.3801 - val_accuracy: 0.8325 - val_loss: 0.3619
Epoch 4/20
7500/7500 - 491s - 66ms/step - accuracy: 0.8451 - loss: 0.3462 - val_accuracy: 0.8575 - val_loss: 0.3277
Epoch 5/20
7500/7500 - 494s - 66ms/step - accuracy: 0.8579 - loss: 0.3192 - val_accuracy: 0.8748 - val_loss: 0.3352
Epoch 6/20
7500/7500 - 512s - 68ms/step - accuracy: 0.8707 - loss: 0.2945 - val_accuracy: 0.8675 - val_loss: 0.3168
Epoch 7/20
7500/7500 - 556s - 74ms/step - accuracy: 0.8813 - loss: 0.2723 - val_accuracy: 0.8658 - val_loss: 0.3020
Epoch 8/20
7500/7500 - 582s - 78ms/step - accuracy: 0.8907 - loss: 0.2550 - val_accuracy: 0.8802 - val_loss: 0.2929
Epoch 9/20
7500/7500 - 572s - 76ms/step - accuracy: 0.8963 - loss: 0.241

In [17]:
# Score the held-out test split; evaluate() yields the loss followed by the
# compiled metric (accuracy).
test_loss, test_acc = model.evaluate(
    x=X_test_2d,
    y=y_test_cat,
    verbose=0,
)
print(f"Test Accuracy: {test_acc * 100:.2f}%")


Test Accuracy: 89.44%
