## Malware Detection Using CNNs


In [17]:
import os
import numpy as np
from PIL import Image

**Data Processing**

In [11]:
def binary_file_to_image(filepath, size=(64, 64)):
    """Convert the leading bytes of a file into a 2-D ``uint8`` image.

    The file is opened in binary mode and at most ``size[0] * size[1]`` bytes
    are read. Each byte becomes one pixel value (0-255). Files shorter than
    the target are zero-padded at the end; longer files are truncated.

    Parameters
    ----------
    filepath : str or path-like
        Path of the file to read.
    size : tuple of int, optional
        ``(rows, cols)`` of the returned image. Defaults to ``(64, 64)``.

    Returns
    -------
    numpy.ndarray
        Writable array of dtype ``uint8`` with shape ``size``.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    desired_size = size[0] * size[1]

    # Only the first `desired_size` bytes are ever used, so do not load the
    # whole (possibly very large) file into memory.
    with open(filepath, 'rb') as fname:
        byte_arr = np.frombuffer(fname.read(desired_size), dtype=np.uint8)

    # Start from an all-zero buffer so short files are implicitly zero-padded.
    # Copying into a fresh array also guarantees a writable result: an array
    # created by np.frombuffer over a bytes object is read-only.
    pixels = np.zeros(desired_size, dtype=np.uint8)
    pixels[:len(byte_arr)] = byte_arr
    return pixels.reshape(size)

In [13]:
#Define function to load the data set from the folders
def load_dataset(folder_path, size=(64, 64)):
    """Load the files under ``folder_path`` as labelled images.

    ``folder_path`` must contain the subfolders ``ben`` and ``mal``. Every
    entry in them is converted with ``binary_file_to_image``; entries from
    ``ben`` get label 0 and entries from ``mal`` get label 1. An entry that
    raises during conversion is reported on stdout and skipped.

    Parameters
    ----------
    folder_path : str
        Directory holding the ``ben`` and ``mal`` subfolders.
    size : tuple of int, optional
        ``(rows, cols)`` passed to ``binary_file_to_image``.

    Returns
    -------
    X : numpy.ndarray
        Stacked images, one per successfully converted file, with shape
        ``(n_files, size[0], size[1])``.
    y : numpy.ndarray
        Integer class label for each row of ``X``.

    Raises
    ------
    OSError
        If one of the class subfolders cannot be listed.
    """
    # X collects the image arrays (features), y the matching class labels.
    X, y = [], []
    for label, subfolder in enumerate(['ben', 'mal']):
        class_folder = os.path.join(folder_path, subfolder)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order reproducible across runs and machines.
        for file_name in sorted(os.listdir(class_folder)):
            file_path = os.path.join(class_folder, file_name)
            try:
                img = binary_file_to_image(file_path, size)
            except Exception as e:
                # Best effort: report the unreadable entry and move on.
                print(f"Error processing {file_path}: {e}")
                continue
            X.append(img)
            y.append(label)

    if not X:
        # Keep the image dimensions even when nothing was loaded, so that
        # downstream indexing and reshaping still see a 3-D array.
        return np.empty((0, *size), dtype=np.uint8), np.empty((0,), dtype=int)
    return np.array(X), np.array(y)

**Load and normalize**

In [19]:
import keras
from sklearn.model_selection import train_test_split


# Load datasets
train_X, train_y = load_dataset('total/train/')  # 30,000 mal and ben files
val_X, val_y = load_dataset('total/valid/')      # 2,000 mal and ben files
test_X, test_y = load_dataset('total/test/')     # 4,500 mal and ben files

# Scale pixel values to [0, 1]. Casting to float32 first keeps the arrays at
# half the size they would have if the division promoted them to float64.
train_X = train_X.astype(np.float32) / 255.0
val_X = val_X.astype(np.float32) / 255.0
test_X = test_X.astype(np.float32) / 255.0

# Append a trailing channel axis: Conv2D expects (height, width, channels).
train_X = train_X[..., np.newaxis]
val_X = val_X[..., np.newaxis]
test_X = test_X[..., np.newaxis]

## Building the Model

In [199]:
import keras
from tensorflow.keras import layers, models
from tensorflow.keras import regularizers

# Small CNN for binary (benign vs. malicious) classification.
model = models.Sequential([
    # Declare the input shape with an explicit Input layer rather than an
    # `input_shape=` argument on the first Conv2D, which Keras warns about.
    layers.Input(shape=train_X.shape[1:]),

    # Two convolution + max-pooling stages extract local byte patterns.
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D(2, 2),

    # Classifier head: dropout regularizes the flattened features before the
    # dense layers; a single sigmoid unit outputs the class-1 probability.
    layers.Flatten(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

   



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [201]:
# Print the layer-by-layer architecture of the model built above.
model.summary()

**Configuring the model for training**

In [203]:
# The network ends in a single sigmoid unit, so it is trained with binary
# cross-entropy; accuracy is tracked as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

**Early Stopping**

In [214]:
from keras.callbacks import EarlyStopping
# Stop training once the validation loss stops improving.
early_stopping = EarlyStopping(
    monitor='val_loss',         # quantity checked at the end of every epoch
    patience=3,                 # allow 3 epochs without improvement before stopping
    restore_best_weights=True,  # on stop, roll back to the best weights seen
)

**Training the model**

In [216]:

# Keep the returned History object so the per-epoch training and validation
# metrics remain available after the cell finishes (e.g. for learning curves).
history = model.fit(
    train_X, train_y,
    validation_data=(val_X, val_y),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 35ms/step - accuracy: 0.9316 - loss: 0.1633 - val_accuracy: 0.8950 - val_loss: 0.2690
Epoch 2/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 35ms/step - accuracy: 0.9348 - loss: 0.1549 - val_accuracy: 0.8975 - val_loss: 0.2610
Epoch 3/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 36ms/step - accuracy: 0.9388 - loss: 0.1507 - val_accuracy: 0.9013 - val_loss: 0.2751
Epoch 4/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 35ms/step - accuracy: 0.9412 - loss: 0.1413 - val_accuracy: 0.9010 - val_loss: 0.2694
Epoch 5/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 35ms/step - accuracy: 0.9451 - loss: 0.1365 - val_accuracy: 0.9010 - val_loss: 0.2736
Epoch 6/10
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 36ms/step - accuracy: 0.9494 - loss: 0.1247 - val_accuracy: 0.9040 - val_loss: 0.2734
Epoc

<keras.src.callbacks.history.History at 0x289e81366f0>

**Evaluating the model**

In [218]:
# Score the trained model on the test split, which was not used for fitting
# or for early stopping.
print('\nTest')
test_loss, test_acc = model.evaluate(test_X, test_y)
print(f'test accuracy: {test_acc:.4f}')


Test
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 16ms/step - accuracy: 0.9203 - loss: 0.2226
test accuracy: 0.9009


**Confusion Matrix**

In [197]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Turn the sigmoid probabilities into hard class decisions at a 0.5 threshold.
y_pred = model.predict(test_X)
y_pred_classes = y_pred > 0.5

# Rows are the true classes and columns the predicted classes,
# with index 0 = 'ben' and index 1 = 'mal'.
confmat = confusion_matrix(test_y, y_pred_classes)
print(confmat)

[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step
[[4085  415]
 [ 491 4009]]


### **Test Result Log**

| Dropout Rate (D) | Accuracy | Epochs | Notes                                                        | Tester | Input Data % | Conv Layers | Dense Layers |
|------------------|----------|--------|--------------------------------------------------------------|--------|--------------|-------------|--------------|
| –                | 0.8912   | 10     | Model performance without overfitting-prevention techniques. | Emma   | 96%          | 2           | 1            |
| 0.55             | 0.9017   | 10     | Significant improvement after increasing dataset.            | Emma   | 96%          | 2           | 1            |
| 0.55             | 0.8984   | 10     | Slight improvement observed with 0.55 dropout.               | Emma   | 96%          | 2           | 1            |
| 0.4              | 0.8984   | 10     | Matched performance. Is this a good thing?                   | Emma   | 96%          | 2           | 1            |
| 0.3              | 0.8926   | 10     | Slight dip at lower dropout.                                 | Emma   | 96%          | 2           | 1            |
| 0.55             | 0.9004   | 7      | Fewer epochs didn’t improve accuracy.                        | Emma   | 96%          | 2           | 1            |
