Static Malware Classfication 

In [1]:
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
dataset = pd.read_csv('malware.csv', sep='|')
data = pd.read_csv('malware.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1)
y = data['legitimate']

In [3]:
# Set up MLflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Malware_Classification")

<Experiment: artifact_location='/Users/klmklmk/Documents/GitHub/HS-IML-Project/Model/experiments/mlruns/1', creation_time=1721150609341, experiment_id='1', last_update_time=1721150609341, lifecycle_stage='active', name='Malware_Classification', tags={}>

In [4]:
# Feature selection
extratrees = ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]



In [7]:
# Print number of features
print(f'Number of selected features: {nbfeatures}')

# Get the names of the selected features
selected_features = X.columns[model.get_support()]

# Print the names of the selected features
print('Selected feature names:')
for feature in selected_features:
    print(feature)

Number of selected features: 13
Selected feature names:
Machine
SizeOfOptionalHeader
Characteristics
ImageBase
MajorOperatingSystemVersion
MajorSubsystemVersion
Subsystem
DllCharacteristics
SizeOfStackReserve
SectionsMaxEntropy
ResourcesMinEntropy
ResourcesMaxEntropy
VersionInformationSize


In [5]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.29, stratify=y)



In [9]:
import math
num_rows = X_train.shape[0]
print(f'Number of training data rows: {num_rows}')

# Calculate the square root of the number of rows
sqrt_num_rows = math.sqrt(num_rows)
print(f'Square root of the number of training data rows: {sqrt_num_rows}')

Number of training data rows: 98013
Square root of the number of training data rows: 313.07027964979363


In [8]:
# N Estimators will range between 100 - 350 
# Depth of the Tree will be around 10 - 20 

In [10]:
# Function to train and log models with MLflow
def train_and_log_model(model_name, model, X_train, y_train, X_test, y_test, param_name, param_value):
    with mlflow.start_run(run_name=f"{model_name}_{param_name}_{param_value}"):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        predictions = model.predict(X_test)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, predictions)
        
        # Log the model and metrics
        mlflow.sklearn.log_model(model, model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_param(param_name, param_value)
        
        print(f"{model_name} with {param_name}={param_value} : {accuracy}")


In [12]:
# Running experiments for RandomForest with n_estimators from 100 to 350
for n_estimators in range(100, 351, 50):
    model = RandomForestClassifier(n_estimators=n_estimators)
    train_and_log_model("RandomForest", model, X_train, y_train, X_test, y_test, "n_estimators", n_estimators)


RandomForest with n_estimators=100 : 0.9938302442923515
RandomForest with n_estimators=150 : 0.9936803716840685
RandomForest with n_estimators=200 : 0.9937303292201629
RandomForest with n_estimators=250 : 0.9935804566118799
RandomForest with n_estimators=300 : 0.99375530798821
RandomForest with n_estimators=350 : 0.9937802867562572


In [13]:
# Running experiments for DecisionTree with max_depth between 10 to 20
for max_depth in range(10, 21):
    model = DecisionTreeClassifier(max_depth=max_depth)
    train_and_log_model("DecisionTree", model, X_train, y_train, X_test, y_test, "max_depth", max_depth)

DecisionTree with max_depth=10 : 0.9906829195184094
DecisionTree with max_depth=11 : 0.9904830893740321
DecisionTree with max_depth=12 : 0.9904581106059849
DecisionTree with max_depth=13 : 0.9904081530698906
DecisionTree with max_depth=14 : 0.9908827496627867
DecisionTree with max_depth=15 : 0.990782834590598
DecisionTree with max_depth=16 : 0.9906079832142679
DecisionTree with max_depth=17 : 0.9909327071988809
DecisionTree with max_depth=18 : 0.990782834590598
DecisionTree with max_depth=19 : 0.9906579407503622
DecisionTree with max_depth=20 : 0.9903332167657491


In [None]:
''' 
Conclusion 

N Estimators best at 100

Max Depth best at 17 

'''

In [15]:
import joblib

# Train RandomForest model with 100 n_estimators
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Save the model to disk
model_filename = "random_forest_100_estimators.joblib"
joblib.dump(model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to random_forest_100_estimators.joblib
