Student: Maria A. Hernandez R.
Malware Classifier

In [1]:
import torch
import torch.nn as nn
import pandas as pd
from tqdm import tqdm



In [2]:
df = pd.read_csv("asm_feature.csv")

In [3]:
df.head()

Unnamed: 0,ID,HEADER:,.text:,.Pav:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,...,edx,esi,eax,ebx,ecx,edi,ebp,esp,eip,Class
0,01azqd4InC7m9JpocGv5,18,22430,0,1158,1366754,0,1794,0,0,...,808,2290,1281,587,701,0,15,14,456,9
1,01jsnpXSAlgw6aPeDxrU,18,68883,0,304,662,0,1093,0,0,...,5,547,5,451,56,0,27,0,117,9
2,01kcPWA9K2BOxQeS5Rju,19,744,0,127,57,0,323,0,3,...,18,66,15,43,83,0,17,48,29,1
3,01SuzwMJEIXsK7A8dQbl,18,10368,0,206,4595,92,0,0,3,...,18,1228,24,1546,107,0,15,0,76,8
4,02JqQ7H3yEoD8viYWlmS,0,129362,0,644,24994,0,24509,0,0,...,147,353,168,375,63,0,22,0,236,2


In [4]:
zero_columns = []
for column in df.columns:
    if (df[column] == 0).all():
        zero_columns.append(column)
zero_columns

['.BSS:', '.CODE', 'rtn']

In [5]:
for column in zero_columns:
    df.drop(column, inplace=True, axis=1)

In [6]:
df = df.drop(df.columns[0], axis=1)
df = df.drop(df.columns[0], axis=1)

In [7]:
df.head(10)

Unnamed: 0,.text:,.Pav:,.idata:,.data:,.bss:,.rdata:,.edata:,.rsrc:,.tls:,.reloc:,...,edx,esi,eax,ebx,ecx,edi,ebp,esp,eip,Class
0,22430,0,1158,1366754,0,1794,0,0,0,0,...,808,2290,1281,587,701,0,15,14,456,9
1,68883,0,304,662,0,1093,0,0,0,0,...,5,547,5,451,56,0,27,0,117,9
2,744,0,127,57,0,323,0,3,0,3,...,18,66,15,43,83,0,17,48,29,1
3,10368,0,206,4595,92,0,0,3,0,0,...,18,1228,24,1546,107,0,15,0,76,8
4,129362,0,644,24994,0,24509,0,0,0,0,...,147,353,168,375,63,0,22,0,236,2
5,93532,0,503,6551,0,21273,0,0,0,0,...,12761,17438,13613,6078,301,0,44,17,209,2
6,0,0,176,0,0,23,0,3,19,3,...,126,135,100,556,90,0,8,0,66,6
7,0,0,253,0,0,0,0,3,0,3,...,362,374,353,754,209,0,10,0,100,6
8,3405,0,123,783910,0,598,0,0,0,0,...,87,135,216,187,224,0,12,0,39,3
9,60476,0,349,3760,0,0,0,3,0,3,...,2496,5013,3960,6382,717,0,27,0,88,1


In [8]:
df.Class -= 1

In [9]:
df['edi'].value_counts()

0      3966
2         8
1         8
3         6
15        5
426       3
8         2
21        1
100       1
Name: edi, dtype: int64

In [10]:
# converting a data set to an array then a tensor. 
dataset = torch.tensor(df.to_numpy())

In [11]:
dataset.shape

torch.Size([4000, 48])

In [12]:
# Getting features and labels in separate arrays. 
labels = dataset[:, -1] 
features = dataset[:, :-1]

In [13]:
# Normalizing the data by subtracting the mean and dividing by the standard deviation
features = features.to(torch.float32)
features -= features.mean(0) # mean across all data
features /= features.std(0)

In [14]:
# Shuffling features and labels simultaneously. 
indices = torch.randperm(4000)
features = features[indices, :]
labels = labels[indices]

In [15]:
# Separating input data into training data and testing data. We should have features and labels for both training and testing. 
# We are using 75% of the features for training and 25% for validation.
n_train = int(features.shape[0] * 0.75)
train_features = features[:n_train, :]
train_labels = labels[:n_train]
validation_features = features[n_train:, :]
validation_labels = labels[n_train:]

In [16]:
# epocs is how many times you are going through the whole data set and update it. 
# batch size is how many instances you check before updating the model. 
BATCH_SIZE = 64
LEARNING_RATE = 0.0001
EPOCHS = 1000

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
train_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(train_features, train_labels), shuffle=True, batch_size=BATCH_SIZE)
validation_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(validation_features, validation_labels), batch_size=BATCH_SIZE)

In [21]:
# we don't have evidence that there are redundant useless features in the dataset 
# twolayer perceptron are very easy to train.. 
# I can check if the loss decrease with twolayerperceptroN.. 
class TwoLayerPerceptron(nn.Module):
    def __init__(self, input_features: int, hidden_units: int ):
        super().__init__()

        self.linear1 = nn.Linear(input_features, hidden_units)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_units, 9)

    def forward(self, x:torch.Tensor) -> torch.Tensor:
        anything = self.linear1(x)
        anything = self.relu(anything)
        anything = self.linear2(anything)
        return anything

In [22]:
# criterion calculates the loss between labels and logits
# optimizer updates the weights by finding the best solution. 
model = TwoLayerPerceptron(len(features[0]), 64).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [23]:
def try_model() -> tuple[list, list]:
    train_accuracy = []
    validation_accuracy = []

    for epoch in (progress_bar := tqdm(range(EPOCHS))):
    #for epoch in (range(EPOCHS)):
        # training part of the model
        model.train() # (some layers can be deactivated)
        epoch_loss = 0
        n_correct = 0
        n_examples = 0

        for examples, labels in (progress_bar := tqdm(train_loader, desc=f"training epoch {epoch}")):
        #for examples, labels in (train_loader): # train loader gives us batches of 64 instances (right now)
            
            # resetting the gradient
            optimizer.zero_grad()
            examples = examples.to(device)
            labels = labels.to(device)
            # we are passing batches of 64 instances into the model. 
            logits = model(examples)
            loss = criterion(logits, labels)
            # calculating for how much approximately the weights should be updated 
            loss.backward()
            # optimizer finds the best solution to update the weights
            optimizer.step()

            # finding out how many logits were predicted correctly at this 64 batch 
            n_correct += (logits.max(1).indices == labels).sum().item()

            epoch_loss += loss.item()
            n_examples += labels.shape[0]

            progress_bar.set_postfix_str(
                f"mean loss: {epoch_loss / n_examples:.5f}, accuracy: {n_correct / n_examples:.4f}"
            )

        train_accuracy.append(n_correct / n_examples)

        # evaluating the model
        model.eval()
        validation_loss = 0 
        n_correct = 0
        n_examples = 0
    
        for examples, labels in (progress_bar := tqdm(validation_loader, desc="validating")):
        # for examples, labels in (validation_loader):
            examples = examples.to(device)
            labels = labels.to(device)

            # we are disabling the gradients bc we are not updating 
            with torch.autograd.no_grad():
                logits = model(examples)
                loss = criterion(logits, labels)
            # max(1) bc that specific instance belongs to the class with the highest probability.
            n_correct += (logits.max(1).indices == labels).sum().item()

            validation_loss += loss.item()
            n_examples += labels.shape[0]
            progress_bar.set_postfix_str(
                f"mean loss: {validation_loss / n_examples:.5f}, accuracy: {n_correct / n_examples:.4f}"
            )

        validation_accuracy.append(n_correct / n_examples)
        #print(epoch)
    return train_accuracy, validation_accuracy

In [24]:
train_accuracy, validation_accuracy = try_model()

training epoch 0: 100%|██████████| 47/47 [00:00<00:00, 82.71it/s, mean loss: 0.03442, accuracy: 0.2580]
validating: 100%|██████████| 16/16 [00:00<00:00, 482.21it/s, mean loss: 0.03463, accuracy: 0.2780]
training epoch 1: 100%|██████████| 47/47 [00:00<00:00, 289.86it/s, mean loss: 0.03328, accuracy: 0.2950]
validating: 100%|██████████| 16/16 [00:00<00:00, 447.75it/s, mean loss: 0.03351, accuracy: 0.3260]
training epoch 2: 100%|██████████| 47/47 [00:00<00:00, 344.25it/s, mean loss: 0.03224, accuracy: 0.3377]
validating: 100%|██████████| 16/16 [00:00<00:00, 470.99it/s, mean loss: 0.03247, accuracy: 0.3500]
training epoch 3: 100%|██████████| 47/47 [00:00<00:00, 351.40it/s, mean loss: 0.03128, accuracy: 0.3573]
validating: 100%|██████████| 16/16 [00:00<00:00, 477.40it/s, mean loss: 0.03146, accuracy: 0.3700]
training epoch 4: 100%|██████████| 47/47 [00:00<00:00, 328.87it/s, mean loss: 0.03034, accuracy: 0.3697]
validating: 100%|██████████| 16/16 [00:00<00:00, 464.56it/s, mean loss: 0.03049,