# Deep Learning Project

This notebook combines models and techniques from two separate CNN-based approaches to malware classification in order to improve accuracy.

## Part 1: Base CNN Model

In [1]:
import os
import numpy as np
from pathlib import Path
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# For PyTorch or TensorFlow later
# import torch
# from torch.utils.data import Dataset, DataLoader

# Root folder of the dataset. The layout used below is
# <DATA_DIR>/<split>/<label>/<file> with splits train/valid/test and labels ben/mal.
# The location can be overridden through the MALWARE_DATA_DIR environment variable
# so the notebook can run on another machine; the original path stays the default.
DATA_DIR = Path(os.environ.get(
    "MALWARE_DATA_DIR",
    r"D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000",
))

# Sanity check: report how many entries each split/label folder contains.
for split in ['train', 'valid', 'test']:
    for label in ['ben', 'mal']:
        files = list((DATA_DIR / split / label).glob('*'))
        print(f"{split}/{label}: {len(files)} files")

MAX_LEN = 10000  # Number of bytes per file (pad/truncate to this length)

def load_binary_file(path, max_len=MAX_LEN):
    """Read the leading bytes of a file into a fixed-length uint8 vector.

    Only the first ``max_len`` bytes are read from disk, so large files are
    never loaded in full. Shorter files are right-padded with zeros.

    Args:
        path: Location of the file to read (opened in binary mode).
        max_len: Length of the returned vector, in bytes.

    Returns:
        A new, writable 1-D ``np.uint8`` array of length ``max_len``.

    Raises:
        ValueError: If ``max_len`` is negative.
        OSError: If the file cannot be opened or read.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    with open(path, 'rb') as f:
        head = f.read(max_len)
    # Start from an all-zero buffer so padding is implicit, and so the result is
    # always a fresh writable array (np.frombuffer on bytes is read-only).
    byte_arr = np.zeros(max_len, dtype=np.uint8)
    byte_arr[:len(head)] = np.frombuffer(head, dtype=np.uint8)
    return byte_arr

train/ben: 10922 files
train/mal: 10922 files
valid/ben: 2500 files
valid/mal: 2446 files
test/ben: 10000 files
test/mal: 10000 files


In [2]:
def load_dataset(split):
    """Load every file of one dataset split as fixed-length byte vectors.

    Files are read from ``DATA_DIR / split / 'ben'`` (label 0) and
    ``DATA_DIR / split / 'mal'`` (label 1) with ``load_binary_file``.

    Args:
        split: Name of the split sub-folder, e.g. ``'train'``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a uint8 array with one row per file and
        ``y`` is an int64 array holding the matching labels.
    """
    X, y = [], []
    for label, folder in enumerate(['ben', 'mal']):
        folder_path = DATA_DIR / split / folder
        print(f'Loading {folder} files from {folder_path}')
        # glob() yields entries in arbitrary order; sort so the sample order is
        # reproducible between runs. Materialising the list also gives tqdm a
        # total, so it can show a real progress bar.
        filepaths = sorted(p for p in folder_path.glob('*') if p.is_file())
        for filepath in tqdm(filepaths):
            X.append(load_binary_file(filepath))
            y.append(label)
    # Keep integer dtypes even for an empty split, so that downstream calls such
    # as np.bincount(y) do not fail on a float array.
    features = np.stack(X) if X else np.empty((0, MAX_LEN), dtype=np.uint8)
    labels = np.array(y, dtype=np.int64)
    return features, labels

# Load the train / validation / test splits. load_dataset reads every file in
# each split folder (not a sample), so this cell is slow.
X_train, y_train = load_dataset('train')
X_val, y_val = load_dataset('valid')
X_test, y_test = load_dataset('test')

# Report array shapes and per-class counts (index 0 = 'ben', index 1 = 'mal').
print(f'Train shape: {X_train.shape}, Labels: {np.bincount(y_train)}')
print(f'Val shape: {X_val.shape}, Labels: {np.bincount(y_val)}')
print(f'Test shape: {X_test.shape}, Labels: {np.bincount(y_test)}')

Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\train\ben


10922it [01:28, 123.08it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\train\mal


10922it [02:00, 90.60it/s] 


Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\valid\ben


2500it [00:11, 218.68it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\valid\mal


2446it [00:24, 101.72it/s]


Loading ben files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\test\ben


10000it [01:36, 103.82it/s]


Loading mal files from D:\project\myJupyter\deep-learning-with-python-notebooks-master\data_exe_100000\test\mal


10000it [01:25, 117.32it/s]


Train shape: (21844, 10000), Labels: [10922 10922]
Val shape: (4946, 10000), Labels: [2500 2446]
Test shape: (20000, 10000), Labels: [10000 10000]


In [3]:
from tensorflow.keras.utils import to_categorical

# View each flat 10000-byte vector as a 100 x 100 single-channel image and
# rescale the byte values from [0, 255] to [0, 1] as float32.
X_train_2d = X_train.reshape((-1, 100, 100, 1)).astype(np.float32) / 255.0
X_val_2d = X_val.reshape((-1, 100, 100, 1)).astype(np.float32) / 255.0
X_test_2d = X_test.reshape((-1, 100, 100, 1)).astype(np.float32) / 255.0

# One-hot encode the integer labels for the two classes.
y_train_cat = to_categorical(y_train, num_classes=2)
y_val_cat = to_categorical(y_val, num_classes=2)
y_test_cat = to_categorical(y_test, num_classes=2)


In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization

# Base CNN: three Conv -> BatchNorm -> MaxPool stages (32, 64, 128 filters),
# then dropout and a dense classifier with a 2-way softmax output.
model = Sequential()
model.add(Input(shape=(100, 100, 1)))  # explicit input layer rather than input_shape on Conv2D

# Stage 1
model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 2
model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Stage 3
model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head
model.add(Flatten())
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))  # 2 classes: benign, malicious


In [5]:
# Configure training: Adam optimizer with categorical cross-entropy, which pairs
# with the one-hot labels and the 2-unit softmax output. Accuracy is tracked as
# the reported metric.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [6]:
# Train for up to 20 epochs in mini-batches of 8, evaluating on the validation
# split after each epoch. verbose=2 logs one summary line per epoch. The returned
# History object is kept in `history` for later inspection.
history = model.fit(
    X_train_2d, y_train_cat,
    validation_data=(X_val_2d, y_val_cat),
    epochs=20,
    batch_size=8,
    verbose=2
)


Epoch 1/20
2731/2731 - 163s - 60ms/step - accuracy: 0.6982 - loss: 0.6143 - val_accuracy: 0.7461 - val_loss: 0.5445
Epoch 2/20
2731/2731 - 167s - 61ms/step - accuracy: 0.7553 - loss: 0.4936 - val_accuracy: 0.7683 - val_loss: 0.4441
Epoch 3/20
2731/2731 - 176s - 64ms/step - accuracy: 0.7800 - loss: 0.4520 - val_accuracy: 0.7831 - val_loss: 0.4518
Epoch 4/20
2731/2731 - 178s - 65ms/step - accuracy: 0.8061 - loss: 0.4095 - val_accuracy: 0.7928 - val_loss: 0.4621
Epoch 5/20
2731/2731 - 174s - 64ms/step - accuracy: 0.8222 - loss: 0.3809 - val_accuracy: 0.8023 - val_loss: 0.4084
Epoch 6/20
2731/2731 - 177s - 65ms/step - accuracy: 0.8398 - loss: 0.3500 - val_accuracy: 0.8283 - val_loss: 0.3828
Epoch 7/20
2731/2731 - 176s - 64ms/step - accuracy: 0.8470 - loss: 0.3283 - val_accuracy: 0.8265 - val_loss: 0.3703
Epoch 8/20
2731/2731 - 180s - 66ms/step - accuracy: 0.8673 - loss: 0.2985 - val_accuracy: 0.8308 - val_loss: 0.3728
Epoch 9/20
2731/2731 - 183s - 67ms/step - accuracy: 0.8794 - loss: 0.272

In [8]:
# Score the trained base model on the held-out test split (progress output
# silenced) and report accuracy as a percentage.
test_loss, test_acc = model.evaluate(X_test_2d, y_test_cat, verbose=0)
print(f"Test Accuracy: {test_acc * 100:.2f}%")


Test Accuracy: 85.62%


## Part 2: Additional CNN Techniques

## Malware Detection Using CNNs


In [9]:
import os
import numpy as np
from PIL import Image

**Data Processing**

In [10]:
def binary_file_to_image(filepath, size=(64, 64)):
    """Convert the leading bytes of a file into a 2-D grayscale byte image.

    Only the first ``size[0] * size[1]`` bytes are read from disk. Files shorter
    than that are padded with zeros at the end.

    Args:
        filepath: Location of the file to read (opened in binary mode).
        size: ``(rows, cols)`` of the image to produce.

    Returns:
        A new, writable ``np.uint8`` array of shape ``size``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    desired_size = size[0] * size[1]
    # Read no more than the bytes that fit in the image (truncates large files).
    with open(filepath, 'rb') as fname:
        raw = fname.read(desired_size)
    # A zero-filled buffer provides the padding for short files, and guarantees a
    # writable result (np.frombuffer on bytes yields a read-only array).
    pixels = np.zeros(desired_size, dtype=np.uint8)
    pixels[:len(raw)] = np.frombuffer(raw, dtype=np.uint8)
    return pixels.reshape(size)

In [11]:
# Load one dataset split from its 'ben' / 'mal' sub-folders as byte images.
# NOTE: this reuses the name `load_dataset` from Part 1 with a different
# signature; from this cell onward the Part 1 version is no longer reachable.
def load_dataset(folder_path, size=(64, 64)):
    """Load every file under ``folder_path/ben`` and ``folder_path/mal``.

    Each file is converted with ``binary_file_to_image``. Files that raise
    during conversion are reported with a printed message and skipped.

    Args:
        folder_path: Split folder that contains the 'ben' and 'mal' sub-folders.
        size: ``(rows, cols)`` passed through to ``binary_file_to_image``.

    Returns:
        A tuple ``(X, y)``: ``X`` is a uint8 array of shape ``(n, rows, cols)``
        and ``y`` is an int64 array of labels (0 = 'ben', 1 = 'mal').
    """
    X, y = [], []
    for label, subfolder in enumerate(['ben', 'mal']):
        class_folder = os.path.join(folder_path, subfolder)
        # os.listdir returns names in arbitrary order; sort so the sample order
        # is reproducible between runs.
        for file_name in sorted(os.listdir(class_folder)):
            file_path = os.path.join(class_folder, file_name)
            # Only regular files are samples (mirrors the is_file() check in Part 1).
            if not os.path.isfile(file_path):
                continue
            try:
                img = binary_file_to_image(file_path, size)
            except Exception as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error processing {file_path}: {e}")
                continue
            X.append(img)
            y.append(label)
    # Keep shape and dtype well-defined even when no file could be loaded.
    images = np.stack(X) if X else np.empty((0, *size), dtype=np.uint8)
    labels = np.array(y, dtype=np.int64)
    return images, labels

**Load and normalize**

In [13]:
import keras
from sklearn.model_selection import train_test_split


# Load the three splits from the dataset root defined in Part 1 (DATA_DIR)
# instead of repeating the absolute path for each split.
train_X, train_y = load_dataset(DATA_DIR / "train")
val_X, val_y = load_dataset(DATA_DIR / "valid")
test_X, test_y = load_dataset(DATA_DIR / "test")

# Rescale bytes from [0, 255] to [0, 1]. Casting to float32 first matches the
# Part 1 preprocessing and avoids the float64 array that uint8 / 255.0 would create.
# The trailing axis is the single grayscale channel expected by Conv2D.
train_X = (train_X.astype('float32') / 255.0)[..., np.newaxis]
val_X = (val_X.astype('float32') / 255.0)[..., np.newaxis]
test_X = (test_X.astype('float32') / 255.0)[..., np.newaxis]

## Building the Model

In [14]:
import keras
from tensorflow.keras import layers, models
from tensorflow.keras import regularizers

# Smaller CNN for the 64 x 64 byte images: two Conv -> MaxPool stages, dropout,
# and a dense head with a single sigmoid unit for the binary decision.
# The input shape is declared with an explicit Input layer (as in Part 1) rather
# than input_shape= on the first Conv2D, which makes Keras emit a UserWarning.
model = models.Sequential([
    layers.Input(shape=train_X.shape[1:]),

    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

   



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Print the layer-by-layer summary of the Part 2 model.
model.summary()

**Configuring the model for training**

In [17]:
# Configure training: Adam optimizer with binary cross-entropy, which pairs with
# the single sigmoid output unit and the integer 0/1 labels. Accuracy is tracked
# as the reported metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

**Early Stopping**

In [18]:
from keras.callbacks import EarlyStopping

# Stop training once validation accuracy has failed to improve for 3 epochs in a
# row, then roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_accuracy',
)

**Training the model**

In [19]:

# Train for at most 10 epochs in mini-batches of 32, validating on the validation
# split each epoch; the early-stopping callback may end training sooner.
# As the last expression of the cell, the returned History object is displayed.
model.fit(train_X, train_y, epochs=10, batch_size=32, callbacks=[early_stopping], validation_data=(val_X, val_y))

Epoch 1/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 31ms/step - accuracy: 0.6518 - loss: 0.6104 - val_accuracy: 0.7657 - val_loss: 0.4685
Epoch 2/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.7934 - loss: 0.4293 - val_accuracy: 0.8300 - val_loss: 0.3830
Epoch 3/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 31ms/step - accuracy: 0.8325 - loss: 0.3699 - val_accuracy: 0.8474 - val_loss: 0.3461
Epoch 4/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 30ms/step - accuracy: 0.8558 - loss: 0.3219 - val_accuracy: 0.8387 - val_loss: 0.3541
Epoch 5/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 30ms/step - accuracy: 0.8722 - loss: 0.2910 - val_accuracy: 0.8601 - val_loss: 0.3259
Epoch 6/10
[1m683/683[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - accuracy: 0.8887 - loss: 0.2604 - val_accuracy: 0.8611 - val_loss: 0.3292
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x1c360aa4290>

**Evaluating the model**

In [20]:
# Score the trained Part 2 model on the held-out test split and report the
# final accuracy as a percentage.
print ('\nTest')
test_loss, test_acc = model.evaluate(test_X, test_y)
print(f"Test Accuracy: {test_acc * 100:.2f}%")


Test
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9007 - loss: 0.2709
Test Accuracy: 86.91%


**Confusion Matrix**

In [21]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Predicted probability of the positive class ('mal') for every test sample.
y_pred = model.predict(test_X)
# Threshold at 0.5 and convert the (n, 1) boolean mask into a flat vector of
# 0/1 integer labels, so it has the same type and shape as test_y.
y_pred_classes = (y_pred > 0.5).astype(int).ravel()
# Rows are true classes and columns are predicted classes (0 = 'ben', 1 = 'mal').
confmat = confusion_matrix(test_y, y_pred_classes)
print(confmat)

[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step
[[9133  867]
 [1750 8250]]
