In [None]:
import pandas as pd
import os

# ======== Configuration ========
input_file = "processed_data.csv"
output_dir = "cwe_filtered"
target_cwes = ['CWE-119', 'CWE-399', 'CWE-416', 'CWE-189', 'CWE-20']
cwe_column = "CWE ID"  # column name confirmed against the dataset

# ======== Create the output directory ========
os.makedirs(output_dir, exist_ok=True)

# ======== Load the data ========
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk. Chunk-wise inference is what produces the
# mixed-type warning this read emits, and it can leave a single column holding
# inconsistently parsed values that are then written back out by to_csv.
df = pd.read_csv(input_file, low_memory=False)

# ======== Write one CSV per target CWE ========
for cwe in target_cwes:
    df_cwe = df[df[cwe_column] == cwe]
    if not df_cwe.empty:
        # File name is the CWE id with '-' replaced by '_', e.g. CWE_119.csv
        output_path = os.path.join(output_dir, f"{cwe.replace('-', '_')}.csv")
        df_cwe.to_csv(output_path, index=False)
        print(f"Đã tạo: {output_path} ({len(df_cwe)} dòng)")
    else:
        # No rows for this CWE: report it instead of writing an empty file
        print(f"⚠️ Không có mẫu cho {cwe}")


  df = pd.read_csv(input_file)


Đã tạo: cwe_filtered\CWE_119.csv (26497 dòng)
Đã tạo: cwe_filtered\CWE_399.csv (14806 dòng)
Đã tạo: cwe_filtered\CWE_416.csv (9780 dòng)
Đã tạo: cwe_filtered\CWE_189.csv (6964 dòng)
Đã tạo: cwe_filtered\CWE_20.csv (20501 dòng)


In [3]:
import pandas as pd

def count_target_per_cwe(file_path):
    """Print, for every CWE ID in a CSV file, how many rows have target 0 and target 1.

    Args:
        file_path: Path to a CSV file that has 'CWE ID' and 'target' columns.

    Returns:
        None. One line per CWE ID is printed to stdout.
    """
    df = pd.read_csv(file_path)

    # Rows: CWE ID, columns: target value, cells: number of samples.
    result = df.groupby(['CWE ID', 'target']).size().unstack(fill_value=0)

    # Force exactly the columns 0 and 1, in that order. Assigning two names to
    # result.columns directly raises a length-mismatch error as soon as the
    # file contains only one target value (or more than two).
    result = result.reindex(columns=[0, 1], fill_value=0)
    result.columns = ['target_0_count', 'target_1_count']

    for cwe_id, row in result.iterrows():
        print(f"CWE ID {cwe_id}: target=0 → {row['target_0_count']} mẫu, target=1 → {row['target_1_count']} mẫu")

# Print the target=0 / target=1 sample counts for every CWE ID in processed_data.csv.
count_target_per_cwe("processed_data.csv")  


  df = pd.read_csv(file_path)


CWE ID CWE-1021: target=0 → 62 mẫu, target=1 → 1 mẫu
CWE ID CWE-119: target=0 → 24370 mẫu, target=1 → 2127 mẫu
CWE ID CWE-120: target=0 → 89 mẫu, target=1 → 7 mẫu
CWE ID CWE-125: target=0 → 7092 mẫu, target=1 → 625 mẫu
CWE ID CWE-129: target=0 → 59 mẫu, target=1 → 4 mẫu
CWE ID CWE-134: target=0 → 636 mẫu, target=1 → 20 mẫu
CWE ID CWE-16: target=0 → 57 mẫu, target=1 → 1 mẫu
CWE ID CWE-17: target=0 → 633 mẫu, target=1 → 51 mẫu
CWE ID CWE-172: target=0 → 24 mẫu, target=1 → 4 mẫu
CWE ID CWE-18: target=0 → 15 mẫu, target=1 → 34 mẫu
CWE ID CWE-189: target=0 → 6627 mẫu, target=1 → 337 mẫu
CWE ID CWE-19: target=0 → 668 mẫu, target=1 → 55 mẫu
CWE ID CWE-190: target=0 → 3364 mẫu, target=1 → 307 mẫu
CWE ID CWE-191: target=0 → 67 mẫu, target=1 → 3 mẫu
CWE ID CWE-20: target=0 → 19359 mẫu, target=1 → 1142 mẫu
CWE ID CWE-200: target=0 → 8133 mẫu, target=1 → 503 mẫu
CWE ID CWE-209: target=0 → 0 mẫu, target=1 → 1 mẫu
CWE ID CWE-22: target=0 → 655 mẫu, target=1 → 35 mẫu
CWE ID CWE-252: target=0 → 0 mẫu,

In [None]:
import pandas as pd

def count_target_for_selected_cwes(file_path, selected_cwes):
    """Print target=0 / target=1 sample counts for a chosen set of CWE IDs.

    Args:
        file_path: CSV file containing 'CWE ID' and 'target' columns.
        selected_cwes: Collection of 'CWE ID' values to keep.

    Returns:
        None. One line per matching CWE ID is printed to stdout.
    """
    data = pd.read_csv(file_path)

    # Keep only the rows whose CWE ID is in the requested list.
    wanted = data[data["CWE ID"].isin(selected_cwes)]

    # Pivot into a CWE-by-target table of sample counts.
    counts = wanted.groupby(['CWE ID', 'target']).size().unstack(fill_value=0)

    # Make sure both target columns exist even when one class is absent.
    for target_value in (0, 1):
        if target_value not in counts.columns:
            counts[target_value] = 0

    # Fix the column order (target=0 first) before renaming.
    counts = counts[[0, 1]]
    counts.columns = ['target_0_count', 'target_1_count']

    for cwe_id, row in counts.iterrows():
        zeros = row['target_0_count']
        ones = row['target_1_count']
        print(f"CWE ID {cwe_id}: target=0 → {zeros} mẫu, target=1 → {ones} mẫu")

# CWE IDs to inspect
selected_cwes = ['CWE-119', 'CWE-399', 'CWE-189', 'CWE-20', 'CWE-416']

# Print the per-target sample counts for the selected CWE IDs
count_target_for_selected_cwes("processed_data.csv" , selected_cwes)


  df = pd.read_csv(file_path)


CWE ID CWE-119: target=0 → 24370 mẫu, target=1 → 2127 mẫu
CWE ID CWE-189: target=0 → 6627 mẫu, target=1 → 337 mẫu
CWE ID CWE-20: target=0 → 19359 mẫu, target=1 → 1142 mẫu
CWE ID CWE-399: target=0 → 14070 mẫu, target=1 → 736 mẫu
CWE ID CWE-416: target=0 → 9450 mẫu, target=1 → 330 mẫu


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import os

# ========== Configuration ==========
input_file = "cwe_filtered/CWE_20.csv"
output_file = "cwe20/support_vectors_benign_cwe20.csv"
text_column = "processed_func"
label_column = "target"

# ========== Step 1: Load the data ==========
df = pd.read_csv(input_file)
df = df.dropna(subset=[text_column])  # rows without text cannot be vectorised

# ========== Step 2: TF-IDF ==========
# The token pattern also keeps single-character tokens; the vocabulary is
# capped at 5000 features.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", max_features=5000)
X = vectorizer.fit_transform(df[text_column])
y = df[label_column].values

# ========== Step 3: Fit an RBF-kernel SVC and collect its support vectors ==========
svc = SVC(kernel='rbf')
svc.fit(X, y)
support_indices = svc.support_

# ========== Step 4: Map the support vectors back to rows, keep the benign ones ==========
# support_ holds row positions of X, and X was built from df after dropna, so
# the lookup has to be positional (iloc), not label-based.
support_df = df.iloc[support_indices]
support_df_benign = support_df[support_df[label_column] == 0]

# ========== Step 5: Write the result ==========
# Create the target directory first: to_csv does not create missing parent
# directories, so a fresh checkout without cwe20/ would fail here.
output_parent = os.path.dirname(output_file)
if output_parent:
    os.makedirs(output_parent, exist_ok=True)
support_df_benign.to_csv(output_file, index=False)
print(f"✅ Đã lưu {len(support_df_benign)} support vectors benign vào {output_file}")


  df = pd.read_csv(input_file)


✅ Đã lưu 4486 support vectors benign vào cwe20/support_vectors_benign_cwe20.csv


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from collections import Counter
import pandas as pd
import numpy as np

# ====================== CONFIG ======================
CSV_FILE = "cwe20/support_vectors_benign_cwe20.csv"
MAX_LEN = 200  # sequence length passed to pad_sequences below
EMBEDDING_DIM = 100
TOP_K = 10
USE_DROPOUT = False
# ====================================================

# ====== Load Data ======
df = pd.read_csv(CSV_FILE)
texts = df['processed_func'].astype(str).tolist()

# ====== Tokenize with a filter set that keeps the underscore '_' ======
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')  # '_' is absent from the filter list, so it is kept
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
# Reverse lookup (token id -> word) used to turn token ids back into words.
index_word = {v: k for k, v in word_index.items()}
X = pad_sequences(sequences, maxlen=MAX_LEN)

# ====== Dummy embedding và mô hình attention đơn giản ======
from keras.layers import (Input, Embedding, Bidirectional, LSTM, Dense,
                          Flatten, Dropout, RepeatVector, Permute, Multiply, Activation)

def attention_3d_block(inputs):
    """Re-weight `inputs` with a softmax attention vector.

    A single-unit tanh Dense layer produces one score per position of
    `inputs`; the scores are flattened and passed through a softmax layer
    named 'attention_vec'. The weights are then repeated `inputs.shape[-1]`
    times and permuted so they can be multiplied element-wise with `inputs`.

    Args:
        inputs: Keras tensor to attend over.

    Returns:
        The element-wise product of `inputs` and the tiled attention weights.
    """
    feature_dim = inputs.shape[-1]

    scores = Dense(1, activation='tanh')(inputs)
    scores = Flatten()(scores)
    weights = Activation('softmax', name='attention_vec')(scores)

    # Tile the weights along a new axis, then swap axes for the Multiply below.
    tiled = RepeatVector(feature_dim)(weights)
    tiled = Permute([2, 1])(tiled)
    return Multiply()([inputs, tiled])

def BiLSTM_network(MAX_LEN, EMBEDDING_DIM, word_index, embedding_matrix, use_dropout=False):
    """Build a two-layer bidirectional LSTM classifier with an attention block.

    Args:
        MAX_LEN: Length of the input token sequences.
        EMBEDDING_DIM: Width of the embedding vectors.
        word_index: Word -> token id mapping; its size + 1 is the vocabulary size.
        embedding_matrix: Initial weights for the frozen embedding layer.
        use_dropout: When true, a Dropout(0.5) layer is applied before the
            first dense layer.

    Returns:
        An uncompiled Keras Model with a single sigmoid output.
    """
    vocab_size = len(word_index) + 1

    inputs = Input(shape=(MAX_LEN,))
    embedding = Embedding(
        vocab_size,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False,
    )(inputs)

    # Two stacked BiLSTM layers, both returning the full sequence.
    bilstm = Bidirectional(LSTM(64, return_sequences=True))(embedding)
    bilstm2 = Bidirectional(LSTM(64, return_sequences=True))(bilstm)

    attn = attention_3d_block(bilstm2)
    flat = Flatten()(attn)

    if use_dropout:
        dense = Dense(64, activation='relu')(Dropout(0.5)(flat))
    else:
        dense = Dense(64, activation='relu')(flat)
    dense = Dense(32, activation='relu')(dense)

    out = Dense(1, activation='sigmoid')(dense)
    return Model(inputs=inputs, outputs=out)

# Dummy embedding: random values in [-0.05, 0.05), one row per token id
embedding_matrix = np.random.uniform(-0.05, 0.05, (len(word_index) + 1, EMBEDDING_DIM))

# Build model
# NOTE(review): the model is compiled but never fitted in this cell, so the
# attention weights below come from randomly initialised weights on top of a
# random embedding; no seed is set here either — confirm this is intended.
model = BiLSTM_network(MAX_LEN, EMBEDDING_DIM, word_index, embedding_matrix, USE_DROPOUT)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# ====== Extract attention weights ======
# Sub-model that stops at the softmax layer named 'attention_vec'.
attention_model = Model(inputs=model.input, outputs=model.get_layer('attention_vec').output)
attention_weights = attention_model.predict(X, batch_size=32)

# ====== Extract attention words ======
attention_words = []  # one list of words per sample
all_top_words = []    # every selected word, flattened, for the statistics below

for i in range(len(attention_weights)):
    attn = attention_weights[i]
    token_ids = X[i]
    # Positions of the TOP_K largest attention weights, highest first.
    top_idx = np.argsort(attn)[-TOP_K:][::-1]

    words = []
    for j in top_idx:
        token_id = token_ids[j]
        # Token id 0 has no entry in index_word and is skipped, so a sample
        # can end up with fewer than TOP_K words.
        if token_id > 0 and token_id in index_word:
            words.append(index_word[token_id])

    attention_words.append(words)
    all_top_words.extend(words)

# ====== Save the attention words together with the code ======
output_df = pd.DataFrame({
    "sample_index": list(range(len(attention_words))),
    "processed_func": texts,
    "attention_words": [", ".join(words) for words in attention_words]
})
output_df.to_csv("cwe20/attention_words_cwe20.csv", index=False)
print("✅ Đã lưu attention words")

# ====== Statistics over the top attention words ======
word_counts = Counter(all_top_words)
most_common_words = word_counts.most_common(20)

print("\n📊 Top từ attention phổ biến:")
for word, count in most_common_words:
    print(f"{word}: {count} lần")

# Save the statistics to a file
df_stats = pd.DataFrame(most_common_words, columns=["word", "count"])
df_stats.to_csv("cwe20/top_attention_word_stats_cwe20.csv", index=False)
print("📁 Đã lưu thống kê top attention words")




[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step
✅ Đã lưu attention words

📊 Top từ attention phổ biến:
if: 841 lần
struct: 750 lần
void: 515 lần
1: 500 lần
return: 485 lần
0: 400 lần
static: 333 lần
true: 147 lần
this: 146 lần
bool: 144 lần
std: 143 lần
size: 124 lần
string: 124 lần
const: 123 lần
unsigned: 122 lần
d: 111 lần
int: 111 lần
ctx: 106 lần
offset: 91 lần
false: 88 lần
📁 Đã lưu thống kê top attention words


In [8]:
import pandas as pd
import os

# Directory that holds the per-CWE CSV files
folder = "cwe_filtered"

# Files that should receive an idx column
file_names = [
    f"CWE_{cwe_number}.csv"
    for cwe_number in (399, 89, 119, 189, 416, 20, 190)
]

# Add a 0-based idx column to each file and overwrite the file in place
for file_name in file_names:
    file_path = os.path.join(folder, file_name)
    df = pd.read_csv(file_path)
    df = df.assign(idx=range(len(df)))
    df.to_csv(file_path, index=False)
    print(f"✅ Đã thêm idx cho {file_name}")


✅ Đã thêm idx cho CWE_399.csv
✅ Đã thêm idx cho CWE_89.csv


  df = pd.read_csv(file_path)


✅ Đã thêm idx cho CWE_119.csv
✅ Đã thêm idx cho CWE_189.csv
✅ Đã thêm idx cho CWE_416.csv


  df = pd.read_csv(file_path)


✅ Đã thêm idx cho CWE_20.csv
✅ Đã thêm idx cho CWE_190.csv


In [16]:
# Read the label-only prediction file and prefix every line with a 0-based idx
predict_path = "cwe119/predict_linevul_cwe119.txt"
with open(predict_path, "r") as f:
    labels = f.readlines()

# Re-running this cell on an already converted file would stack a second idx
# column (and a second header) in front of the first one, so the file is only
# rewritten while it still lacks the "idx<TAB>label" header.
if labels and labels[0].strip() == "idx\tlabel":
    print(f"ℹ️ {predict_path} đã có cột idx, bỏ qua")
else:
    with open(predict_path, "w") as f:
        f.write("idx\tlabel\n")  # header row
        for idx, label in enumerate(labels):
            label = label.strip()
            if label:  # blank lines are dropped, but their position still consumes an idx
                f.write(f"{idx}\t{label}\n")


In [20]:
# Read the label-only prediction file and prefix every line with a 0-based idx
predict_path = "cwe119/predict_adv_cwe119.txt"
with open(predict_path, "r") as f:
    labels = f.readlines()

# Re-running this cell on an already converted file would stack a second idx
# column (and a second header) in front of the first one, so the file is only
# rewritten while it still lacks the "idx<TAB>label" header.
if labels and labels[0].strip() == "idx\tlabel":
    print(f"ℹ️ {predict_path} đã có cột idx, bỏ qua")
else:
    with open(predict_path, "w") as f:
        f.write("idx\tlabel\n")  # header row
        for idx, label in enumerate(labels):
            label = label.strip()
            if label:  # blank lines are dropped, but their position still consumes an idx
                f.write(f"{idx}\t{label}\n")


In [17]:
import os
import pandas as pd
import json
import random
import numpy as np
from fga_selection import *

# ========== Parameters ==========
K = 3                 # passed to centriod_init below
max_generation = 30   # number of evolution rounds
alpha = 2             # NOTE(review): not referenced anywhere else in this cell
penalty = 0.01        # forwarded to get_fitness_score
decay_rate = 2        # forwarded to select
insert_position = 15  # forwarded to add_adver_sample_2_ast
adv_file_path = "cwe_filtered/CWE_119.csv"
pre_result_path = "cwe119/predict_linevul_cwe119.txt"
output_path = "cwe119/adversarial_output_cwe119.csv"
df = pd.read_csv(adv_file_path)

# ========== Initial data ==========
# Candidate snippets come from the "NoiDung" column of the preserved pool.
ad_content = pd.read_csv("preserved_pool_attack.csv")["NoiDung"].dropna().astype(str).tolist()
label_list = df["idx"].tolist()
true_labels = df["target"].tolist()

# ========== Load the prediction results ==========
# The file holds "<idx>\t<label>" lines. The cell that adds the idx column
# also writes an "idx\tlabel" header row, which int() cannot parse, so that
# row is skipped explicitly.
pre_result = {}
with open(pre_result_path, "r") as f:
    for line in f:
        if not line.strip():
            continue  # ignore blank lines
        fields = line.split("\t")
        if fields[0].strip() == "idx":
            continue  # header row
        pre_result[int(fields[0])] = int(fields[1].strip())

# ========== Select the samples to attack ==========
# NOTE(review): get_vul_idx / get_vul_codes are presumably provided by the
# star import from fga_selection; their selection criteria are not visible here.
vul_idx = get_vul_idx(label_list, pre_result, true_labels)
vul_codes = get_vul_codes(df.to_dict("records"), vul_idx)

print(f"🔍 Có {len(vul_codes)} mã dễ bị tấn công được chọn để sinh mã độc.")

# ========== Fuzzy clustering ==========
centroids = centriod_init(K, min_distance=0.1)

# ========== Build the initial population ==========
pop_size = 10
# Keyed by snippet text, so drawing the same snippet twice yields one entry.
pop_dict = {}
for _ in range(pop_size):
    snippet = random.choice(ad_content)
    # Only the snippet length (not its text) is handed to the fitness function.
    score = get_fitness_score(pre_result_path, adv_file_path, snippet_len=len(snippet), penalty=penalty)
    pop_dict[snippet] = score

# ========== Evolution ==========
for gen in range(max_generation):
    print(f"⚙️ Thế hệ {gen+1}/{max_generation}")
    new_offspring = []
    offspring_scores = []

    for _ in range(pop_size):
        # The mean fitness of the current population is passed to select().
        parent_snippet = select(pop_dict, np.mean(list(pop_dict.values())), centroids, decay_rate)
        # "Mutation" here appends another random pool snippet to the parent.
        mutated = parent_snippet + " " + random.choice(ad_content)
        mutated_score = get_fitness_score(pre_result_path, adv_file_path, snippet_len=len(mutated), penalty=penalty)
        new_offspring.append(mutated)
        offspring_scores.append(mutated_score)

    pop_dict = update_global_pop(new_offspring, pop_dict, offspring_scores)

# ========== Pick the best snippet ==========
best_snippet = max(pop_dict, key=pop_dict.get)
print(f"🏆 Best adversarial snippet:\n{best_snippet}")

# ========== Insert the adversarial snippet into processed_func ==========
vul_rows = df[df["idx"].isin(vul_idx)]
# Rebinds vul_codes (set above from get_vul_codes) to an {idx: code} dict.
vul_codes = dict(zip(vul_rows["idx"], vul_rows["processed_func"]))
adv_codes, _ = add_adver_sample_2_ast(vul_codes, insert_position, best_snippet.split())

# ========== Write back the original DataFrame with the modified code ==========
df_adv = df.copy()
# NOTE(review): adv_codes is indexed by position i, which assumes it is a
# sequence ordered like vul_rows — verify against add_adver_sample_2_ast.
for i, idx in enumerate(vul_rows.index):
    df_adv.at[idx, "processed_func"] = adv_codes[i]

df_adv.to_csv(output_path, index=False)
print(f"File mã độc đã ghi vào: {output_path}")


  df = pd.read_csv(adv_file_path)


🔍 Có 1763 mã dễ bị tấn công được chọn để sinh mã độc.


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 1/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 2/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 3/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 4/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 5/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 6/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 7/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 8/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 9/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 10/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 11/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 12/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 13/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 14/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 15/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 16/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 17/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 18/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 19/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 20/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 21/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 22/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 23/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 24/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 25/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 26/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 27/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 28/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 29/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


⚙️ Thế hệ 30/30


  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)
  df = pd.read_csv(adv_file_path)


🏆 Best adversarial snippet:

{
	char dummy_char = 'a'
	switch (dummy_char) {
	case 'b':
		return
	default:
		break
	}
}




100%|██████████| 1763/1763 [00:00<00:00, 46851.71it/s]


File mã độc đã ghi vào: cwe119/adversarial_output_cwe119.csv


In [21]:
import pandas as pd
from sklearn.metrics import accuracy_score

# === Load original prediction và adversarial prediction ===
def load_preds(path):
    """Read a tab-separated prediction file into an {idx: label} dict.

    Every non-blank line is expected to look like ``<idx><TAB><label>``.
    A header row whose first field is ``idx`` (as written by the cells that
    add the idx column to the prediction files) is skipped instead of being
    passed to int(), which would raise.

    Args:
        path: Path to the prediction text file.

    Returns:
        dict mapping the integer idx to the integer label; when an idx occurs
        more than once the last line wins.

    Raises:
        ValueError: if a data line holds a non-integer idx or label.
        IndexError: if a data line has no tab separator.
    """
    preds = {}
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # ignore blank lines
            fields = line.split('\t')
            if fields[0].strip() == "idx":
                continue  # header row
            preds[int(fields[0])] = int(fields[1])
    return preds

orig_pred = load_preds("cwe119/predict_linevul_cwe119.txt")
adv_pred = load_preds("cwe119/predict_adv_cwe119.txt")

# === Ground-truth labels from the CSV, keyed by idx ===
df = pd.read_csv("cwe119/adversarial_output_cwe119.csv")
labels = dict(zip(df["idx"], df["target"]))

# === Vulnerable samples (label 1) that the original predictions flagged as 1 ===
vul_correct_ids = [
    sample_id
    for sample_id, truth in labels.items()
    if truth == 1 and orig_pred.get(sample_id, 0) == 1
]

# === After the attack: those samples now predicted as 0 (benign) ===
successful_attack_ids = [
    sample_id for sample_id in vul_correct_ids if adv_pred.get(sample_id, 1) == 0
]

# === ASR (Attack Success Rate); 0 when there is nothing to attack ===
n_attackable = len(vul_correct_ids)
n_flipped = len(successful_attack_ids)
if vul_correct_ids:
    ASR = n_flipped / n_attackable
else:
    ASR = 0
print(f"ASR = {ASR:.3%} ({n_flipped}/{n_attackable})")


ASR = 92.456% (1630/1763)


  df = pd.read_csv("cwe119/adversarial_output_cwe119.csv")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the source CSV
df = pd.read_csv("cwe_filtered/CWE_399.csv")

# Step 1: stratified split into train (80%) and a temporary hold-out (20%)
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['target']
)

# Step 2: halve the hold-out into val (10%) and test (10%), again stratified
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['target']
)

# Write every split to its own file
splits = {"train": train_df, "val": val_df, "test": test_df}
for split_name, split_df in splits.items():
    split_df.to_csv(f"cwe399_{split_name}.csv", index=False)

# Summary; the split label is padded so the counts line up
print("Đã chia cwe399.csv thành 3 tập:")
for split_name, split_df in splits.items():
    print(f"  • {split_name + ':':<6} {len(split_df)} mẫu")


✅ Đã chia cwe399.csv thành 3 tập:
  • train: 11844 mẫu
  • val:   1481 mẫu
  • test:  1481 mẫu


In [1]:
import pandas as pd

# Read the four adversarial-output CSV files
pool_files = (
    "adversarial_outputs_119.csv",
    "adversarial_outputs_189.csv",
    "adversarial_outputs_416.csv",
    "output_adversarial.csv",
)
df1, df2, df3, df4 = map(pd.read_csv, pool_files)

# Stack them into one DataFrame with a fresh 0..n-1 index
df_merged = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Write the merged pool to a new CSV file
df_merged.to_csv("preserved_pool_attack.csv", index=False)

print(f"✅ Đã hợp nhất thành công. Gồm {len(df_merged)} dòng.")


✅ Đã hợp nhất thành công. Gồm 969 dòng.
