In [45]:
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)

def load_data(malware_csv: str, ransomware_csv: str):
    """Read both sample CSVs and return them as one labelled DataFrame.

    Args:
        malware_csv: Path to the CSV whose rows receive ``label == 0``.
        ransomware_csv: Path to the CSV whose rows receive ``label == 1``.

    Returns:
        A single DataFrame with a fresh 0..n-1 index, a ``label`` column,
        and an ``opcodes`` column in which missing values have been replaced
        by the empty string and every value is cast to ``str``.
    """
    # enumerate() hands out the class ids in argument order: 0, then 1.
    labelled_frames = [
        pd.read_csv(csv_path).assign(label=class_id)
        for class_id, csv_path in enumerate((malware_csv, ransomware_csv))
    ]
    combined = pd.concat(labelled_frames, ignore_index=True)
    # Empty string instead of NaN so downstream text vectorisers see a str.
    combined['opcodes'] = combined['opcodes'].fillna('').astype(str)
    return combined


def vectorise(df, ngram_min: int, ngram_max: int, max_features: int):
    """Build a sparse TF-IDF matrix from the ``opcodes`` column of ``df``.

    Args:
        df: DataFrame with an ``opcodes`` text column and a ``label`` column.
        ngram_min: Smallest n-gram length passed to the vectoriser.
        ngram_max: Largest n-gram length passed to the vectoriser.
        max_features: Upper bound on the vocabulary size.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is a CSR matrix, ``y`` is
        ``df['label'].values`` and ``feature_names`` are the vocabulary terms
        prefixed with ``"OP::"``.

    NOTE(review): the vectoriser is fitted on every row of ``df``. If some of
    those rows are later held out as a test set, the vocabulary and IDF
    weights have already seen them.
    """
    opcode_vect = TfidfVectorizer(
        # Opcodes are whitespace-separated; str.split is equivalent to the
        # previous ``lambda s: s.split()``.
        tokenizer=str.split,
        # token_pattern is ignored when a tokenizer is supplied; passing None
        # stops scikit-learn from warning about that on every fit.
        token_pattern=None,
        ngram_range=(ngram_min, ngram_max),
        max_features=max_features,
    )
    op_mat = opcode_vect.fit_transform(df['opcodes'])
    # hstack over a one-element list keeps this the single place where more
    # feature blocks can be appended later.
    X = hstack([op_mat], format='csr')
    feature_names = [f"OP::{t}" for t in opcode_vect.get_feature_names_out()]
    return X, df['label'].values, feature_names


def train_models(X_train, y_train, random_state: int = 42):
    """Fit a random forest, a linear-kernel SVM and an AdaBoost ensemble.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        random_state: Seed handed to all three estimators so that repeated
            runs on the same data produce the same models.

    Returns:
        ``(rf, svm, ada)`` — the three fitted classifiers, in that order.
    """
    rf = RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=random_state
    ).fit(X_train, y_train)

    # probability=True is kept so the returned SVC is configured exactly as
    # before; the seed is passed alongside it.
    svm = SVC(
        kernel='linear', probability=True, random_state=random_state
    ).fit(X_train, y_train)

    ada = AdaBoostClassifier(
        n_estimators=100, random_state=random_state
    ).fit(X_train, y_train)

    return rf, svm, ada


def evaluate(name, clf, X_test, y_test):
    """Print a short test-set report for ``clf`` and return its predictions.

    Args:
        name: Heading printed above the report.
        clf: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        The array returned by ``clf.predict(X_test)``.
    """
    y_pred = clf.predict(X_test)

    print(f"\n=== {name} ===")

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    report = classification_report(y_test, y_pred, digits=4)
    print(report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", matrix)

    return y_pred


def top_features(clf, feature_names, top_k=20):
    """Print the ``top_k`` most influential features of a fitted estimator.

    Estimators exposing ``feature_importances_`` are ranked by importance
    (largest first). Otherwise, estimators exposing ``coef_`` are ranked by
    absolute coefficient and the signed value is printed. If neither
    attribute is available a fallback message is printed.

    Args:
        clf: Fitted estimator.
        feature_names: Sequence mapping column index -> display name.
        top_k: Maximum number of rows to print.
    """
    print("\nTop features:")

    # Read the attribute exactly once. On scikit-learn ensembles it is a
    # property recomputed on each access, so the previous hasattr() check
    # plus per-row lookups repeated that work top_k + 2 times.
    importances = getattr(clf, "feature_importances_", None)
    if importances is not None:
        importances = np.asarray(importances)
        idx = np.argsort(importances)[::-1][:top_k]
        for i in idx:
            print(f"{feature_names[i]:30s} {importances[i]:.4f}")
        return

    coefs = getattr(clf, "coef_", None)
    if coefs is not None:
        # coef_ may be a sparse matrix when the model was fitted on sparse
        # input; densify before flattening.
        if hasattr(coefs, "toarray"):
            coefs = coefs.toarray().ravel()
        else:
            coefs = np.asarray(coefs).ravel()
        idx = np.argsort(np.abs(coefs))[::-1][:top_k]
        for i in idx:
            print(f"{feature_names[i]:30s} {coefs[i]:+.4f}")
        return

    print("Feature importance not available for this estimator.")


def agreement(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` are equal.

    Args:
        a: Array-like of predictions.
        b: Array-like of predictions with the same shape as ``a``.

    Returns:
        The mean of the element-wise equality mask.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: for plain lists ``a == b`` is a single bool, which would
    # make the result 0.0 or 1.0 regardless of how many elements match.
    a_arr = np.asarray(a)
    b_arr = np.asarray(b)
    if a_arr.shape != b_arr.shape:
        raise ValueError(
            f"agreement() needs equal shapes, got {a_arr.shape} and {b_arr.shape}"
        )
    return np.mean(a_arr == b_arr)


def main(
    malware_csv: str = "../mw_ghidra_truncated.csv",
    ransomware_csv: str = "../rw_ghidra_truncated.csv",
    random_state: int = 42,
):
    """Run the opcode experiment end to end and print all results.

    Steps: load and label both CSVs, build unigram TF-IDF features (at most
    5000 terms), make a stratified 80/20 split, report a majority-class
    baseline, train the three classifiers, then print per-model metrics,
    pairwise prediction agreement and the top features of each model.

    Args:
        malware_csv: Path to the malware CSV (defaults to the original
            hard-coded location).
        ransomware_csv: Path to the ransomware CSV (defaults to the original
            hard-coded location).
        random_state: Seed for the train/test split so that the reported
            numbers refer to the same held-out rows on every run.
    """
    # --- Data & features -------------------------------------------------
    df = load_data(malware_csv, ransomware_csv)
    # NOTE(review): features are computed on the full frame before the
    # split, so TF-IDF statistics are fitted on rows that end up in the
    # test set.
    X, y, feat_names = vectorise(df, 1, 1, 5000)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    # --- Baseline --------------------------------------------------------
    dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
    print("Dummy majority baseline accuracy:", dummy.score(X_test, y_test))

    # --- Train models ----------------------------------------------------
    rf, svm, ada = train_models(X_train, y_train)

    # --- Evaluate --------------------------------------------------------
    rf_pred = evaluate("Random Forest", rf, X_test, y_test)
    svm_pred = evaluate("Linear SVM", svm, X_test, y_test)
    ada_pred = evaluate("AdaBoost", ada, X_test, y_test)

    # --- Agreement -------------------------------------------------------
    print("\n=== Prediction agreement on test set ===")
    print("RF vs SVM :", agreement(rf_pred, svm_pred))
    print("RF vs Ada :", agreement(rf_pred, ada_pred))
    print("SVM vs Ada:", agreement(svm_pred, ada_pred))

    # --- Top features ----------------------------------------------------
    print("\n=== Top features per model ===")
    # Headings carry their own leading newline so the printed layout is
    # unchanged from the hand-unrolled version.
    for heading, model in (
        ("Random Forest:", rf),
        ("\nLinear SVM:", svm),
        ("\nAdaBoost:", ada),
    ):
        print(heading)
        top_features(model, feat_names)


if __name__ == "__main__":
    main()




Dummy majority baseline accuracy: 0.5118708452041786

=== Random Forest ===
Accuracy: 0.9801
              precision    recall  f1-score   support

           0     0.9850    0.9759    0.9804       539
           1     0.9750    0.9844    0.9797       514

    accuracy                         0.9801      1053
   macro avg     0.9800    0.9802    0.9800      1053
weighted avg     0.9801    0.9801    0.9801      1053

Confusion matrix:
 [[526  13]
 [  8 506]]

=== Linear SVM ===
Accuracy: 0.9459
              precision    recall  f1-score   support

           0     0.9617    0.9314    0.9463       539
           1     0.9303    0.9611    0.9455       514

    accuracy                         0.9459      1053
   macro avg     0.9460    0.9462    0.9459      1053
weighted avg     0.9464    0.9459    0.9459      1053

Confusion matrix:
 [[502  37]
 [ 20 494]]

=== AdaBoost ===
Accuracy: 0.9715
              precision    recall  f1-score   support

           0     0.9757    0.9685    0.972

In [38]:
def load_data(malware_csv: str, ransomware_csv: str):
    """Read both sample CSVs and return them as one labelled DataFrame.

    Args:
        malware_csv: Path to the CSV whose rows receive ``label == 0``.
        ransomware_csv: Path to the CSV whose rows receive ``label == 1``.

    Returns:
        A single DataFrame with a fresh 0..n-1 index, a ``label`` column,
        and ``opcodes`` / ``api_calls`` columns in which missing values have
        been replaced by the empty string and every value is cast to ``str``.
    """
    # enumerate() hands out the class ids in argument order: 0, then 1.
    labelled_frames = [
        pd.read_csv(csv_path).assign(label=class_id)
        for class_id, csv_path in enumerate((malware_csv, ransomware_csv))
    ]
    combined = pd.concat(labelled_frames, ignore_index=True)
    # Empty string instead of NaN so downstream text vectorisers see a str.
    for text_column in ('opcodes', 'api_calls'):
        combined[text_column] = combined[text_column].fillna('').astype(str)
    return combined


def vectorise(df, ngram_min: int, ngram_max: int, max_features: int):
    """Build a sparse TF-IDF matrix from the ``api_calls`` column of ``df``.

    Each document is split on ``';'`` and empty fragments are discarded, so
    one token corresponds to one entry of the semicolon-separated list.

    Args:
        df: DataFrame with an ``api_calls`` text column and a ``label`` column.
        ngram_min: Smallest n-gram length passed to the vectoriser.
        ngram_max: Largest n-gram length passed to the vectoriser.
        max_features: Upper bound on the vocabulary size.

    Returns:
        ``(X, y, feature_names)`` where ``X`` is a CSR matrix, ``y`` is
        ``df['label'].values`` and ``feature_names`` are the vocabulary terms
        prefixed with ``"API::"``.

    NOTE(review): the vectoriser is fitted on every row of ``df``. If some of
    those rows are later held out as a test set, the vocabulary and IDF
    weights have already seen them.
    """
    api_vect = TfidfVectorizer(
        tokenizer=lambda s: [tok for tok in s.split(';') if tok],
        # token_pattern is ignored when a tokenizer is supplied; passing None
        # stops scikit-learn from warning about that on every fit.
        token_pattern=None,
        ngram_range=(ngram_min, ngram_max),
        max_features=max_features,
    )
    api_mat = api_vect.fit_transform(df['api_calls'])
    # hstack over a one-element list keeps this the single place where more
    # feature blocks can be appended later.
    X = hstack([api_mat], format='csr')
    feature_names = [f"API::{t}" for t in api_vect.get_feature_names_out()]
    return X, df['label'].values, feature_names


def train_models(X_train, y_train, random_state: int = 42):
    """Fit a random forest, a linear-kernel SVM and an AdaBoost ensemble.

    The forest and the SVM are built with ``class_weight='balanced'``;
    AdaBoost is left at its default weighting.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        random_state: Seed handed to all three estimators so that repeated
            runs on the same data produce the same models.

    Returns:
        ``(rf, svm, ada)`` — the three fitted classifiers, in that order.
    """
    rf = RandomForestClassifier(
        n_estimators=100,
        n_jobs=-1,
        class_weight='balanced',
        random_state=random_state,
    ).fit(X_train, y_train)

    # probability=True is kept so the returned SVC is configured exactly as
    # before; the seed is passed alongside it.
    svm = SVC(
        kernel='linear',
        probability=True,
        class_weight='balanced',
        random_state=random_state,
    ).fit(X_train, y_train)

    ada = AdaBoostClassifier(
        n_estimators=100, random_state=random_state
    ).fit(X_train, y_train)

    return rf, svm, ada


def evaluate(name, clf, X_test, y_test):
    """Print a short test-set report for ``clf`` and return its predictions.

    Args:
        name: Heading printed above the report.
        clf: Fitted estimator exposing ``predict``.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.

    Returns:
        The array returned by ``clf.predict(X_test)``.
    """
    y_pred = clf.predict(X_test)

    print(f"\n=== {name} ===")

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")

    report = classification_report(y_test, y_pred, digits=4)
    print(report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix:\n", matrix)

    return y_pred


def top_features(clf, feature_names, top_k=20):
    """Print the ``top_k`` most influential features of a fitted estimator.

    Estimators exposing ``feature_importances_`` are ranked by importance
    (largest first). Otherwise, estimators exposing ``coef_`` are ranked by
    absolute coefficient and the signed value is printed. If neither
    attribute is available a fallback message is printed.

    Args:
        clf: Fitted estimator.
        feature_names: Sequence mapping column index -> display name.
        top_k: Maximum number of rows to print.
    """
    print("\nTop features:")

    # Read the attribute exactly once. On scikit-learn ensembles it is a
    # property recomputed on each access, so the previous hasattr() check
    # plus per-row lookups repeated that work top_k + 2 times.
    importances = getattr(clf, "feature_importances_", None)
    if importances is not None:
        importances = np.asarray(importances)
        idx = np.argsort(importances)[::-1][:top_k]
        for i in idx:
            print(f"{feature_names[i]:30s} {importances[i]:.4f}")
        return

    coefs = getattr(clf, "coef_", None)
    if coefs is not None:
        # coef_ may be a sparse matrix when the model was fitted on sparse
        # input; densify before flattening.
        if hasattr(coefs, "toarray"):
            coefs = coefs.toarray().ravel()
        else:
            coefs = np.asarray(coefs).ravel()
        idx = np.argsort(np.abs(coefs))[::-1][:top_k]
        for i in idx:
            print(f"{feature_names[i]:30s} {coefs[i]:+.4f}")
        return

    print("Feature importance not available for this estimator.")


def agreement(a, b):
    """Return the fraction of positions at which ``a`` and ``b`` are equal.

    Args:
        a: Array-like of predictions.
        b: Array-like of predictions with the same shape as ``a``.

    Returns:
        The mean of the element-wise equality mask.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: for plain lists ``a == b`` is a single bool, which would
    # make the result 0.0 or 1.0 regardless of how many elements match.
    a_arr = np.asarray(a)
    b_arr = np.asarray(b)
    if a_arr.shape != b_arr.shape:
        raise ValueError(
            f"agreement() needs equal shapes, got {a_arr.shape} and {b_arr.shape}"
        )
    return np.mean(a_arr == b_arr)


def main(
    malware_csv: str = "../mw_ghidra_truncated.csv",
    ransomware_csv: str = "../rw_ghidra_truncated.csv",
    random_state: int = 42,
):
    """Run the API-call experiment end to end and print all results.

    Steps: load and label both CSVs, build unigram TF-IDF features (at most
    5000 terms), make a stratified 80/20 split, report a majority-class
    baseline, train the three classifiers, then print per-model metrics,
    pairwise prediction agreement and the top features of each model.

    Args:
        malware_csv: Path to the malware CSV (defaults to the original
            hard-coded location).
        ransomware_csv: Path to the ransomware CSV (defaults to the original
            hard-coded location).
        random_state: Seed for the train/test split so that the reported
            numbers refer to the same held-out rows on every run.
    """
    # --- Data & features -------------------------------------------------
    df = load_data(malware_csv, ransomware_csv)
    # NOTE(review): features are computed on the full frame before the
    # split, so TF-IDF statistics are fitted on rows that end up in the
    # test set.
    X, y, feat_names = vectorise(df, 1, 1, 5000)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    # --- Baseline --------------------------------------------------------
    dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
    print("Dummy majority baseline accuracy:", dummy.score(X_test, y_test))

    # --- Train models ----------------------------------------------------
    rf, svm, ada = train_models(X_train, y_train)

    # --- Evaluate --------------------------------------------------------
    rf_pred = evaluate("Random Forest", rf, X_test, y_test)
    svm_pred = evaluate("Linear SVM", svm, X_test, y_test)
    ada_pred = evaluate("AdaBoost", ada, X_test, y_test)

    # --- Agreement -------------------------------------------------------
    print("\n=== Prediction agreement on test set ===")
    print("RF vs SVM :", agreement(rf_pred, svm_pred))
    print("RF vs Ada :", agreement(rf_pred, ada_pred))
    print("SVM vs Ada:", agreement(svm_pred, ada_pred))

    # --- Top features ----------------------------------------------------
    print("\n=== Top features per model ===")
    # Headings carry their own leading newline so the printed layout is
    # unchanged from the hand-unrolled version.
    for heading, model in (
        ("Random Forest:", rf),
        ("\nLinear SVM:", svm),
        ("\nAdaBoost:", ada),
    ):
        print(heading)
        top_features(model, feat_names)


if __name__ == "__main__":
    main()




Dummy majority baseline accuracy: 0.5118708452041786

=== Random Forest ===
Accuracy: 0.9886
              precision    recall  f1-score   support

           0     0.9907    0.9870    0.9888       539
           1     0.9864    0.9903    0.9883       514

    accuracy                         0.9886      1053
   macro avg     0.9886    0.9886    0.9886      1053
weighted avg     0.9886    0.9886    0.9886      1053

Confusion matrix:
 [[532   7]
 [  5 509]]

=== Linear SVM ===
Accuracy: 0.9582
              precision    recall  f1-score   support

           0     0.9365    0.9852    0.9602       539
           1     0.9835    0.9300    0.9560       514

    accuracy                         0.9582      1053
   macro avg     0.9600    0.9576    0.9581      1053
weighted avg     0.9595    0.9582    0.9582      1053

Confusion matrix:
 [[531   8]
 [ 36 478]]

=== AdaBoost ===
Accuracy: 0.9763
              precision    recall  f1-score   support

           0     0.9742    0.9796    0.976