In [None]:
import kagglehub

# Fetch the latest version of the Malimg dataset and report where it landed
path = kagglehub.dataset_download("rdrtud/malimg")
print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/rdrtud/malimg?dataset_version_number=1...


100%|██████████| 1.09G/1.09G [00:13<00:00, 89.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/rdrtud/malimg/versions/1


In [None]:
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Dense, Flatten, GlobalAveragePooling2D
from tensorflow.keras.layers import Lambda
import tensorflow as tf

In [None]:
def spatial_pyramid_pooling(inputs, pool_list):
    """Pool a feature map into a fixed-length vector over a pyramid of grids.

    For every bin count ``n`` in ``pool_list`` the feature map is resized to an
    ``n x n`` grid and that grid is flattened, so each level contributes
    ``n * n * channels`` features and keeps its coarse spatial layout. The
    per-level vectors are concatenated along the last axis.

    Args:
        inputs: batched feature map whose last axis is the channel axis; the
            channel count must be statically known.
        pool_list: iterable of positive ints giving the number of bins per
            side at each pyramid level (e.g. ``[1, 2, 4]``).

    Returns:
        2-D tensor with ``channels * sum(n * n for n in pool_list)`` features
        per sample.

    Raises:
        ValueError: if ``pool_list`` is empty or the channel size is unknown.
    """
    bin_sizes = list(pool_list)
    if not bin_sizes:
        raise ValueError("pool_list must contain at least one bin size")

    channels = inputs.shape[-1]
    if channels is None:
        raise ValueError("spatial_pyramid_pooling needs a statically known channel dimension")

    outputs = []
    for pool_size in bin_sizes:
        # Reduce the map to pool_size x pool_size cells, one per pyramid bin.
        # NOTE(review): the default bilinear resize samples each cell rather
        # than averaging the whole bin — confirm this approximation is acceptable.
        pooled = tf.image.resize(inputs, [pool_size, pool_size])
        # Flatten the grid instead of averaging it away: a global average here
        # would collapse every level to one value per channel and discard the bins.
        outputs.append(tf.reshape(pooled, [-1, pool_size * pool_size * channels]))
    return tf.concat(outputs, axis=-1)

In [None]:
def create_sppnet(input_shape=(192, 192, 1), num_classes=25, pool_list=(1, 2, 4)):
    """Build an uncompiled CNN classifier with a spatial-pyramid-pooling head.

    Three 3x3 convolutions (32, 64, 128 filters) with 2x2 max pooling after
    the first two feed ``spatial_pyramid_pooling``; its output goes through a
    512-unit ReLU layer and a softmax layer with ``num_classes`` units.

    Args:
        input_shape: shape of one input image, without the batch axis.
        num_classes: number of units in the softmax output layer.
        pool_list: bins per side for each pyramid level, forwarded to
            ``spatial_pyramid_pooling``.

    Returns:
        A ``Model`` mapping the input image to class probabilities. The caller
        is responsible for compiling it.
    """
    inputs = Input(shape=input_shape)

    # Convolutional layers
    features = Conv2D(32, (3, 3), activation='relu', padding='same')(inputs)
    features = MaxPooling2D((2, 2))(features)
    features = Conv2D(64, (3, 3), activation='relu', padding='same')(features)
    features = MaxPooling2D((2, 2))(features)
    features = Conv2D(128, (3, 3), activation='relu', padding='same')(features)

    # Spatial Pyramid Pooling: wrap the named function and pass the levels via
    # `arguments` instead of closing over them in an anonymous lambda.
    pooled = Lambda(
        spatial_pyramid_pooling,
        arguments={"pool_list": list(pool_list)},
    )(features)

    # Fully connected layers
    hidden = Dense(512, activation='relu')(pooled)
    outputs = Dense(num_classes, activation='softmax')(hidden)

    return Model(inputs, outputs)

In [None]:
# Resolve the Malimg dataset location again; rebinds `path` to the returned directory
path = kagglehub.dataset_download("rdrtud/malimg")

In [None]:
# Build the SPP network with its default arguments (192x192x1 input, 25 classes)
model = create_sppnet()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Print the layer-by-layer structure of the model built above
model.summary()


In [None]:
import os

# Print the path and check directory structure
print("Base dataset path:", path)
print("Contents of base dataset path:", os.listdir(path))

# Drill down one more level. Plain files are skipped: os.listdir raises
# NotADirectoryError when it is handed a file path.
for subdir in os.listdir(path):
    subdir_path = os.path.join(path, subdir)
    if os.path.isdir(subdir_path):
        print(f"Contents of {subdir}:", os.listdir(subdir_path))


Base dataset path: /root/.cache/kagglehub/datasets/rdrtud/malimg/versions/1
Contents of base dataset path: ['data']
Contents of data: ['malimg_paper_dataset_imgs']


In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import kagglehub

# Download the dataset using kagglehub
path = kagglehub.dataset_download("rdrtud/malimg")

# The download unpacks to <path>/data/malimg_paper_dataset_imgs, so the
# intermediate "data" directory must be part of the joined path.
dataset_path = os.path.join(path, "data", "malimg_paper_dataset_imgs")
img_size = (192, 192)  # Ensure this matches your model's input size

# Verify that the dataset path exists
if not os.path.isdir(dataset_path):
    print("Dataset path does not exist. Please verify the path.")
else:
    # One sub-directory per class. Sorting fixes the label ids across runs,
    # because os.listdir returns entries in arbitrary order.
    class_dirs = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    images = []
    labels = []

    # Load images and labels. The label is the directory's position in
    # class_dirs, not the running image count, so ids stay contiguous (0..N-1).
    for label_index, label_dir in enumerate(class_dirs):
        label_path = os.path.join(dataset_path, label_dir)
        for img_name in os.listdir(label_path):
            img_path = os.path.join(label_path, img_name)
            if os.path.isfile(img_path):  # Ensure it's a file
                image = load_img(img_path, color_mode="grayscale", target_size=img_size)
                image = img_to_array(image) / 255.0  # Normalize pixel values to [0, 1]
                images.append(image)
                labels.append(label_index)

    # Check if any data was loaded
    if len(images) == 0 or len(labels) == 0:
        print("No images or labels were loaded. Please check the dataset path and directory structure.")
    else:
        # Convert to numpy arrays; the one-hot width is pinned to the number
        # of class directories so it does not depend on which labels occur.
        images = np.array(images)
        labels = to_categorical(np.array(labels), num_classes=len(class_dirs))

        # Split into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.3, random_state=42)

        # Model training (assuming `model` is defined and compiled)
        history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Dataset path does not exist. Please verify the path.


In [None]:
# Show where the dataset was downloaded to
print(f"Downloaded dataset path: {path}")


Downloaded dataset path: /root/.cache/kagglehub/datasets/rdrtud/malimg/versions/1


In [None]:
import os
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten

# Set dataset_path to point to the actual folder containing the images.
# It is derived from `path` (returned by kagglehub.dataset_download in an
# earlier cell) so the notebook does not depend on one machine's cache layout.
dataset_path = os.path.join(path, "data", "malimg_paper_dataset_imgs")
img_size = (192, 192)  # Adjust according to your model requirements

# Check the structure of the directory
print("Contents of 'malimg_paper_dataset_imgs' directory:")
print(os.listdir(dataset_path))

# One sub-directory per malware family. Filtering before enumerating keeps the
# label ids contiguous and aligned with class_names even if stray files exist;
# sorting makes the id assignment independent of os.listdir ordering.
class_names = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)

images = []
labels = []

# Load images and labels
for label_index, label_dir in enumerate(class_names):
    label_path = os.path.join(dataset_path, label_dir)
    for img_name in os.listdir(label_path):
        img_path = os.path.join(label_path, img_name)
        if os.path.isfile(img_path):  # Ensure it's a file
            image = load_img(img_path, color_mode="grayscale", target_size=img_size)
            image = img_to_array(image) / 255.0  # Normalize pixel values to [0, 1]
            images.append(image)
            labels.append(label_index)

# Check if any data was loaded
if len(images) == 0 or len(labels) == 0:
    print("No images or labels were loaded. Please check the dataset path and directory structure.")
else:
    # Convert to numpy arrays
    images = np.array(images)
    labels = np.array(labels)

    # The class count comes from the directory list, so the one-hot width
    # always covers the largest label id even if a class folder is empty.
    num_classes = len(class_names)
    print("Number of classes:", num_classes)

    # One-hot encode labels based on actual number of classes
    labels = to_categorical(labels, num_classes=num_classes)

    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.3, random_state=42)

    # Define a simple model (for example purposes, adjust as needed).
    # NOTE(review): this rebinds `model`, replacing the network returned by
    # create_sppnet() in an earlier cell; metrics reported after this cell
    # describe this dense baseline — confirm that is intended.
    model = Sequential([
        # An explicit Input layer avoids the Keras warning raised when
        # `input_shape` is passed to the first layer of a Sequential model.
        Input(shape=(img_size[0], img_size[1], 1)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model.
    # NOTE(review): the held-out split doubles as validation data here and is
    # evaluated again later as the test set — consider a separate validation split.
    history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))


Contents of 'malimg_paper_dataset_imgs' directory:
['Agent.FYI', 'Rbot!gen', 'Allaple.A', 'Wintrim.BX', 'C2LOP.gen!g', 'Yuner.A', 'Lolyda.AA1', 'Lolyda.AA2', 'Allaple.L', 'Instantaccess', 'Dontovo.A', 'Malex.gen!J', 'Autorun.K', 'Adialer.C', 'Lolyda.AT', 'VB.AT', 'Swizzor.gen!I', 'Swizzor.gen!E', 'Lolyda.AA3', 'Skintrim.N', 'C2LOP.P', 'Obfuscator.AD', 'Fakerean', 'Dialplatform.B', 'Alueron.gen!J']
Number of classes: 25


  super().__init__(**kwargs)


Epoch 1/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 84ms/step - accuracy: 0.5664 - loss: 5.2147 - val_accuracy: 0.9011 - val_loss: 0.4653
Epoch 2/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 77ms/step - accuracy: 0.9350 - loss: 0.3078 - val_accuracy: 0.9429 - val_loss: 0.2130
Epoch 3/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 80ms/step - accuracy: 0.9816 - loss: 0.0983 - val_accuracy: 0.9493 - val_loss: 0.1698
Epoch 4/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 80ms/step - accuracy: 0.9939 - loss: 0.0492 - val_accuracy: 0.9636 - val_loss: 0.1420
Epoch 5/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 79ms/step - accuracy: 0.9987 - loss: 0.0197 - val_accuracy: 0.9690 - val_loss: 0.1136
Epoch 6/50
[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 77ms/step - accuracy: 0.9999 - loss: 0.0088 - val_accuracy: 0.9718 - val_loss: 0.1043
Epoch 7/50
[1m2

In [None]:
# evaluate() returns the loss followed by the compiled metrics; accuracy is at index 1
test_metrics = model.evaluate(X_test, y_test, verbose=0)
accuracy = test_metrics[1]
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.9721627235412598
