## 分析
這邊試圖處理的問題是：標註為惡意的log與標註為良性的log在語意(semantic)上是否有所不同？
這關係到強化學習的代理(Agent)能否有效地找到標註為惡意的log(即分辨出某些惡意行為與其他行為的不同)，並且標上正確的TTP標籤(找出攻擊的方法)。

## 方法
嘗試了兩種類型的方法：(1) 純粹以詞意進行嵌入；(2) 先轉換成Provenance Graph後再進行嵌入轉換。

#### 一般路徑以及引入的package設定
##### 動態宣告
將個別檔案的名稱動態宣告為變數，並與對應的檔案路徑連結起來(例如 M1.json 對應變數 pathSAGAM1)。

In [15]:
import os
import json
import re
import umap
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# =========================
# Path settings
# =========================
# Dataset roots plus the derived output folder and malicious-log file.
pathSAGA = os.path.join("D:\\", "dataset", "SAGA")
pathCasinoLimit = os.path.join("D:\\", "dataset", "CasinoLimit")
pathOutput = os.path.join(pathSAGA, "result")
pathSAGAmalicious = os.path.join(pathSAGA, "SAGA_malicious.json")
print("SAGA exists:", os.path.exists(pathSAGA))

# Visit every campaign folder directly under the SAGA root (except "result")
# and publish each audit-log file as a notebook-level variable named
# "pathSAGA" + <file name up to its first dot>, e.g. M1.json -> pathSAGAM1.
listSAGAFiles = []
for dirAPTCampaign in os.listdir(pathSAGA):
    pathAPTCampaign = os.path.join(pathSAGA, dirAPTCampaign)
    if dirAPTCampaign == "result" or not os.path.isdir(pathAPTCampaign):
        continue
    for auditLog in os.listdir(pathAPTCampaign):
        pathAuditLog = os.path.join(pathAPTCampaign, auditLog)
        if not os.path.isfile(pathAuditLog):
            continue
        nameAuditLogVar = "pathSAGA" + auditLog.split(".")[0]
        listSAGAFiles.append(nameAuditLogVar)
        globals()[nameAuditLogVar] = pathAuditLog

# List every generated variable name together with the path it points to.
print("SAGA所有有的檔案以及其對應路徑")
for i in listSAGAFiles:
    print(f"{i}:{globals()[i]}")


def load_saga_json(path):
    """Load a SAGA audit log stored as JSON Lines (one JSON value per line).

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as file:
        # Skip whitespace-only lines (e.g. an empty line at the end of the
        # file); json.loads would raise on them.
        return [json.loads(line) for line in file if line.strip()]


SAGA exists: True
SAGA所有有的檔案以及其對應路徑
pathSAGAM1:D:\dataset\SAGA\Composite APT Campaigns Dataset\M1.json
pathSAGAM10:D:\dataset\SAGA\Composite APT Campaigns Dataset\M10.json
pathSAGAM2:D:\dataset\SAGA\Composite APT Campaigns Dataset\M2.json
pathSAGAM3:D:\dataset\SAGA\Composite APT Campaigns Dataset\M3.json
pathSAGAM4:D:\dataset\SAGA\Composite APT Campaigns Dataset\M4.json
pathSAGAM5:D:\dataset\SAGA\Composite APT Campaigns Dataset\M5.json
pathSAGAM6:D:\dataset\SAGA\Composite APT Campaigns Dataset\M6.json
pathSAGAM7:D:\dataset\SAGA\Composite APT Campaigns Dataset\M7.json
pathSAGAM8:D:\dataset\SAGA\Composite APT Campaigns Dataset\M8.json
pathSAGAM9:D:\dataset\SAGA\Composite APT Campaigns Dataset\M9.json
pathSAGAG1:D:\dataset\SAGA\Generated APT Campaigns Dataset\G1.json
pathSAGAG10:D:\dataset\SAGA\Generated APT Campaigns Dataset\G10.json
pathSAGAG11:D:\dataset\SAGA\Generated APT Campaigns Dataset\G11.json
pathSAGAG12:D:\dataset\SAGA\Generated APT Campaigns Dataset\G12.json
pathSAGAG13:D:\dat

##### 個別測試

In [None]:
# Quick check: load the G16 audit log and display the srcNode keys of its
# first record (the bare expression on the last line is the cell's output).
sagaLogsG16 = load_saga_json(pathSAGAG16)
sagaLogsG16[0]['srcNode'].keys()

## 語意分析

In [None]:
def log_to_embedding_text(log):
    """Render one audit / graph log record as semi-structured text for embedding.

    The record's ``label`` field is deliberately never read, so the text (and
    any embedding computed from it) cannot leak the ground truth.

    Args:
        log: Mapping that may contain ``srcNode`` and ``dstNode`` (mappings or
            ``None``) and ``relation``.

    Returns:
        A five-line string with the fields Process, Category, Action,
        Target type and Target description; leading and trailing whitespace
        is stripped.
    """
    src = log.get("srcNode") or {}
    dst = log.get("dstNode") or {}
    action = log.get("relation", "unknown")

    proc_name = src.get("Name", "")
    proc_path = src.get("Image", "")

    dst_type = dst.get("Type", "")
    # Prefer the target's Key; fall back to its Name when Key is absent or empty.
    dst_key = dst.get("Key", "") or dst.get("Name", "")

    # Simple abstraction: an image under a Windows system directory counts as
    # a system process. Windows paths are case-insensitive ("System32" and
    # "system32" are the same folder), so compare on a case-folded copy.
    folded_path = proc_path.casefold() if isinstance(proc_path, str) else ""
    if "system32" in folded_path or "program files" in folded_path:
        proc_category = "system process"
    else:
        proc_category = "user process"

    lines = (
        f"Process: {proc_name}",
        f"Category: {proc_category}",
        f"Action: {action}",
        f"Target type: {dst_type}",
        f"Target description: {dst_key}",
    )
    return "\n".join(lines).strip()
# Visualization (Benign vs TTP)
def PCA(df):
    """Scatter-plot 2-D projected log embeddings, benign (blue) vs. TTP (red).

    NOTE(review): this function is named ``PCA`` and therefore rebinds the
    ``PCA`` name imported from ``sklearn.decomposition`` at the top of the
    notebook. It does not compute a PCA itself; it only plots.

    Args:
        df: DataFrame with the columns ``x`` and ``y`` (plot coordinates) and
            ``label``. Rows whose label is not ``"benign"`` are drawn as TTP.
            The frame is only read; no helper column is written into it.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    is_benign = df["label"] == "benign"

    plt.figure(figsize=(8, 6))

    # (row mask, color, alpha, marker size, legend entry) for the two groups.
    groups = (
        (is_benign, "blue", 0.4, 15, "benign"),
        (~is_benign, "red", 0.6, 20, "TTP"),  # every non-benign label
    )
    for mask, color, alpha, size, name in groups:
        subset = df[mask]
        plt.scatter(
            subset["x"],
            subset["y"],
            color=color,
            alpha=alpha,
            s=size,
            label=name)

    plt.legend()
    plt.title("PCA of Audit Log Embeddings (Benign vs TTP)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()
def umap(embeddings, labels=None):
    """Project embeddings to 2-D with UMAP and plot benign vs. TTP points.

    Args:
        embeddings: Array-like passed to ``UMAP.fit_transform``.
        labels: One label per embedding row; rows whose label is not
            ``"benign"`` are drawn as TTP. When omitted, the notebook-level
            ``labels`` variable is used, as in the original implementation.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        KeyError: If ``labels`` is omitted and no notebook-level ``labels``
            variable exists.
    """
    # This function's own name hides the ``umap`` module at notebook level, so
    # ``umap.UMAP`` would be looked up on the function and fail. Import the
    # class locally instead.
    from umap import UMAP

    if labels is None:
        # Backward-compatible fallback to the notebook-level variable.
        labels = globals()["labels"]

    # =========================
    # UMAP
    # =========================
    print("Running UMAP...")

    umap_model = UMAP(
        n_neighbors=15,
        min_dist=0.1,
        n_components=2,
        metric="cosine",
        random_state=42
    )

    emb_2d_umap = umap_model.fit_transform(embeddings)

    df_umap = pd.DataFrame({
        "x": emb_2d_umap[:, 0],
        "y": emb_2d_umap[:, 1],
        "label": labels
    })
    is_benign = df_umap["label"] == "benign"

    plt.figure(figsize=(8, 6))

    # benign
    plt.scatter(
        df_umap[is_benign]["x"],
        df_umap[is_benign]["y"],
        color="blue",
        alpha=0.3,
        s=12,
        label="benign")

    # TTP (non-benign)
    plt.scatter(
        df_umap[~is_benign]["x"],
        df_umap[~is_benign]["y"],
        color="red",
        alpha=0.6,
        s=18,
        label="TTP"
    )

    plt.legend()
    plt.title("UMAP of Audit Log Embeddings (Benign vs TTP)")
    plt.xlabel("UMAP-1")
    plt.ylabel("UMAP-2")
    plt.grid(True)
    plt.tight_layout()
    # Show the figure explicitly, as the PCA and t-SNE plotting functions do.
    plt.show()
def tSNE(embeddings=None, labels=None):
    """Project embeddings to 2-D with t-SNE and plot benign vs. TTP points.

    Both parameters are optional so that the original zero-argument call
    ``tSNE()`` keeps working; the signature otherwise mirrors
    ``umap(embeddings)``.

    Args:
        embeddings: Array-like passed to ``TSNE.fit_transform``. When omitted,
            the notebook-level ``embeddings`` variable is used.
        labels: One label per embedding row; rows whose label is not
            ``"benign"`` are drawn as TTP. When omitted, the notebook-level
            ``labels`` variable is used.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        KeyError: If an argument is omitted and the notebook-level variable of
            the same name does not exist.
    """
    # Backward-compatible fallbacks to the notebook-level variables.
    if embeddings is None:
        embeddings = globals()["embeddings"]
    if labels is None:
        labels = globals()["labels"]

    # =========================
    # t-SNE
    # =========================
    print("Running t-SNE...")

    tsne = TSNE(
        n_components=2,
        perplexity=30,
        learning_rate=200,
        metric="cosine",
        random_state=42
    )

    emb_2d_tsne = tsne.fit_transform(embeddings)

    df_tsne = pd.DataFrame({
        "x": emb_2d_tsne[:, 0],
        "y": emb_2d_tsne[:, 1],
        "label": labels
    })
    is_benign = df_tsne["label"] == "benign"

    plt.figure(figsize=(8, 6))

    # benign
    plt.scatter(
        df_tsne[is_benign]["x"],
        df_tsne[is_benign]["y"],
        color="blue",
        alpha=0.3,
        s=12,
        label="benign")

    # TTP (non-benign)
    plt.scatter(
        df_tsne[~is_benign]["x"],
        df_tsne[~is_benign]["y"],
        color="red",
        alpha=0.6,
        s=18,
        label="TTP")

    plt.legend()
    plt.title("t-SNE of Audit Log Embeddings (Benign vs TTP)")
    plt.xlabel("t-SNE-1")
    plt.ylabel("t-SNE-2")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:

saga_logs = load_saga_json(pathSAGAG16)  # audit log used as the embedding example
print(log_to_embedding_text(saga_logs[0]))  # show the text form of one record
print("Total logs:", len(saga_logs))  # number of records in this SAGA audit log

# Prepare embedding inputs
embedding_texts = []
labels = []   # only used for coloring later

for log in tqdm(saga_logs):
    embedding_text = log_to_embedding_text(log)  # record -> plain text
    embedding_texts.append(embedding_text)  # collected for batch encoding
    labels.append(log.get("label", "unknown"))

# =========================
# Embedding
# =========================
print("Embedding logs...")
model = SentenceTransformer("cisco-ai/SecureBERT2.0-biencoder")  # sentence-embedding model
embeddings = model.encode(embedding_texts, show_progress_bar=True)  # run the encoder
embeddings = np.array(embeddings)  # keep the vectors as one NumPy array

# =========================
# 2-D PCA projection
# =========================
# The cells below read ``df`` (columns x, y, label) and title their axes
# PC1 / PC2, but ``df`` was never created. Build it here. The sklearn class is
# imported under an alias because the notebook also defines a plotting
# function called PCA(df), which hides the sklearn name once it has run.
from sklearn.decomposition import PCA as SklearnPCA

emb_2d_pca = SklearnPCA(n_components=2, random_state=42).fit_transform(embeddings)
df = pd.DataFrame({
    "x": emb_2d_pca[:, 0],
    "y": emb_2d_pca[:, 1],
    "label": labels
})

# List every label as a sanity check. All non-benign labels are treated as
# TTP in the plots, so count them the same way here (the previous comparison
# against the literal string "TTP" does not match that convention).
df_ttp = df[df["label"] != "benign"]
print("TTP count:", len(df_ttp))
print(df["label"].value_counts())


In [None]:
# Plot only the malicious audit records after embedding and projection.
# The figure shows that different TTPs are also distributed differently in
# the embedding space.
# Keep only the non-benign rows.
df_ttp = df[df["label"] != "benign"]

print("Number of TTP events:", len(df_ttp))
print("Number of TTP classes:", df_ttp["label"].nunique())

# All TTP classes, in sorted order.
ttp_labels = sorted(df_ttp["label"].unique())

# Generate one color per class (tab20 works well for discrete classes).
# matplotlib.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# pyplot.get_cmap accepts the same (name, lut) arguments. The max() guard
# avoids asking for a zero-entry colormap when there are no TTP rows.
cmap = plt.get_cmap("tab20", max(len(ttp_labels), 1))

plt.figure(figsize=(8, 6))

for i, lbl in enumerate(ttp_labels):
    subset = df_ttp[df_ttp["label"] == lbl]
    plt.scatter(
        subset["x"],
        subset["y"],
        color=cmap(i),
        label=lbl,
        alpha=0.7,
        s=20
    )

plt.title("PCA of Audit Log Embeddings (Non-Benign TTPs)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)

# There are many TTP classes, so place the legend outside the axes.
plt.legend(
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
    fontsize=8,
    title="TTP Label"
)

plt.tight_layout()
plt.show()

## 統計分析
### 資料集結構分析
只看第一層的話SAGA資料集的結構如下：
- 'srcNode'
- 'dstNode'
- 'relation'
- 'timestamp'
- 'label'
#### srcNode所有的鍵值
 {'Cmdline', 'Image', 'Name', 'Pid', 'Type', 'UUID'}
#### dstNode所有的鍵值
 {'Cmdline',
  'Dstaddress',
  'Image',
  'Key',
  'Name',
  'Path',
  'Pid',
  'Port',
  'Srcaddress',
  'Type',
  'UUID',
  'Value'}
#### 全鍵值不重複統計
經過掃描並去除重複後，SAGA資料集所有的鍵值如下：
- 'Image'
- 'Path'
- 'dstNode'
- 'timestamp'
- 'srcNode'
- 'Key'
- 'Port'
- 'Type'
- 'relation'
- 'Name'
- 'Cmdline'
- 'Pid'
- 'Dstaddress'
- 'UUID'
- 'label'
- 'Srcaddress'
- 'Value'
P.S. 這份清單不能直接作為建立RL環境的依據：srcNode與dstNode結構相似，同一個鍵(例如 'Name')可能同時出現在兩者之中，但在去除重複後只會被計算一次。

In [16]:
# Small helper that extracts every key
def collectUniqueKeys(obj, setKeys) -> None:
    """Add every dict key found anywhere inside ``obj`` to ``setKeys``.

    Dicts and lists are searched at any nesting depth; values of any other
    type are ignored. The walk uses an explicit stack instead of recursion,
    so deeply nested input cannot hit Python's recursion limit.

    Args:
        obj: The value to scan (typically a dict, a list, or a scalar).
        setKeys: Set that receives the keys; it is updated in place.

    Returns:
        None. The result is delivered through ``setKeys``.
    """
    pending = [obj]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            for key, value in current.items():
                setKeys.add(key)
                pending.append(value)
        elif isinstance(current, list):
            pending.extend(current)


In [17]:
# Find every key that occurs in the SAGA audit logs so they can be defined
# later: setKeys covers whole records, the other two sets cover only the
# srcNode / dstNode sub-objects.
setKeys, setKeysSrcNode, setKeysDstNode = set(), set(), set()
for nameFileVar in listSAGAFiles:
    print(nameFileVar)
    logs = load_saga_json(globals()[nameFileVar])
    collectUniqueKeys(logs, setKeys)
    for record in logs:
        for nodeField, nodeKeys in (
            ('srcNode', setKeysSrcNode),
            ('dstNode', setKeysDstNode),
        ):
            collectUniqueKeys(record[nodeField], nodeKeys)
    

pathSAGAM1
pathSAGAM10
pathSAGAM2
pathSAGAM3
pathSAGAM4
pathSAGAM5
pathSAGAM6
pathSAGAM7
pathSAGAM8
pathSAGAM9
pathSAGAG1
pathSAGAG10
pathSAGAG11
pathSAGAG12
pathSAGAG13
pathSAGAG14
pathSAGAG15
pathSAGAG16
pathSAGAG17
pathSAGAG18
pathSAGAG19
pathSAGAG2
pathSAGAG20
pathSAGAG3
pathSAGAG4
pathSAGAG5
pathSAGAG6
pathSAGAG7
pathSAGAG8
pathSAGAG9
pathSAGAC1
pathSAGAC2
pathSAGAC3
pathSAGAC3_1
pathSAGAC3_2
pathSAGAC4
pathSAGAC4__00
pathSAGAC4__01
pathSAGAC4__02
pathSAGAC5
pathSAGAC6
pathSAGAC7
pathSAGAC8


In [18]:

# Display the three key sets: all keys, srcNode keys, dstNode keys.
setKeys,setKeysSrcNode,setKeysDstNode

({'Cmdline',
  'Dstaddress',
  'Image',
  'Key',
  'Name',
  'Path',
  'Pid',
  'Port',
  'Srcaddress',
  'Type',
  'UUID',
  'Value',
  'dstNode',
  'label',
  'relation',
  'srcNode',
  'timestamp'},
 {'Cmdline', 'Image', 'Name', 'Pid', 'Type', 'UUID'},
 {'Cmdline',
  'Dstaddress',
  'Image',
  'Key',
  'Name',
  'Path',
  'Pid',
  'Port',
  'Srcaddress',
  'Type',
  'UUID',
  'Value'})