In [1]:
import pathlib
import pickle

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd

In [2]:
# Directory prefix (note the trailing slash) that later cells concatenate with
# file names when saving the preprocessed graph and split files.
save_prefix = 'data/preprocessed/new_vulkgdata_preprocessed/'

In [3]:
# Load the labelled (CVE, product-version) pairs and keep only the positive
# ones (label == 1).
# The index is renumbered to 0..n-1 after filtering: later cells use the same
# index values both as labels (`df.loc[train_idx]`) and as row positions into
# the NumPy array built from `df` (`cve_pv[train_idx]`), and those two uses
# only agree when every label equals its row position.
df = pd.read_csv('data/raw/new_vulkgdata/label_with_negatives.csv')
df = df[df['label'] == 1].reset_index(drop=True)
df.head()

Unnamed: 0,cveID,Product Version,label
0,CVE-2019-13719,Google Chrome <78.0.3904.70,1
1,CVE-2019-13717,Google Chrome <78.0.3904.70,1
2,CVE-2019-13718,Google Chrome <78.0.3904.70,1
3,CVE-2019-13716,Google Chrome <78.0.3904.70,1
4,CVE-2019-13715,Google Chrome <78.0.3904.70,1


In [4]:
# Number of distinct CVE ids and of distinct product-version strings in `df`.
num_cve, num_pv = (len(set(df[column])) for column in ('cveID', 'Product Version'))

In [5]:
num_samples = len(df)
# Train / validation ratios; the test set receives whatever remains
# (1 - train_ratio - val_ratio).
train_ratio = 0.7
val_ratio = 0.2

# Sizes of the train and validation subsets (rounded down).
train_size = int(train_ratio * num_samples)
val_size = int(val_ratio * num_samples)

# Shuffle the index labels of `df`.
# * A dedicated, seeded generator makes the split reproducible on every run
#   without touching NumPy's global random state.
# * `permutation` returns a shuffled copy. Shuffling `df.index.values` in place
#   is avoided because that array can be the index's own buffer, in which case
#   the labels of `df` itself would be reordered.
split_seed = 453289
indices = np.random.RandomState(split_seed).permutation(df.index.values)

# Slice the shuffled labels into the three subsets.
train_idx = indices[:train_size]
val_idx = indices[train_size:train_size + val_size]
test_idx = indices[train_size + val_size:]

In [6]:
# Sorted, de-duplicated vocabularies of CVE ids and product-version strings.
# Later cells use an item's position in these lists as its integer id.
cves = sorted(set(df['cveID']))
pvs = sorted(set(df['Product Version']))

In [7]:
import pickle

# Persist the sorted CVE id list to `cves.pickle` in the current working
# directory.
with open('cves.pickle', 'wb') as cve_file:
    pickle.dump(cves, cve_file)

In [8]:
# Training edges only: select the rows of `df` whose index labels are in
# `train_idx`, then renumber the result 0..len-1.
cve_pv = df.loc[train_idx].reset_index(drop=True)

In [9]:
# Build the adjacency matrix of the bipartite CVE / product-version graph from
# the rows of the `cve_pv` DataFrame.
# Node ids: CVEs occupy 0..num_cve-1 (type 0), product versions occupy
# num_cve..dim-1 (type 1).
dim = num_cve + num_pv

type_mask = np.zeros((dim), dtype=int)
type_mask[num_cve:] = 1

# Value -> list-position lookups. A dict lookup is O(1), whereas calling
# `list.index` for every row scans the whole vocabulary each time.
cve_to_idx = {cve: pos for pos, cve in enumerate(cves)}
pv_to_idx = {pv: pos for pos, pv in enumerate(pvs)}

# Endpoints of every edge as global node ids; an unknown value raises KeyError.
edge_cve = np.array([cve_to_idx[c] for c in cve_pv['cveID']], dtype=np.int64)
edge_pv = np.array([len(cves) + pv_to_idx[p] for p in cve_pv['Product Version']],
                   dtype=np.int64)

# Symmetric 0/1 matrix: every edge is written in both directions.
adjM = np.zeros((dim, dim), dtype=int)
adjM[edge_cve, edge_pv] = 1
adjM[edge_pv, edge_cve] = 1


In [10]:
# Neighbour lookups taken from the two off-diagonal blocks of `adjM`.
# Keys and values are type-local indices (the product-version ids are *not*
# shifted by num_cve here).
cve_block = adjM[:num_cve, num_cve:num_cve + num_pv]   # rows: CVEs, cols: product versions
pv_block = adjM[num_cve:num_cve + num_pv, :num_cve]    # rows: product versions, cols: CVEs
cve_pv_list = {i: row.nonzero()[0] for i, row in enumerate(cve_block)}
pv_cve_list = {i: row.nonzero()[0] for i, row in enumerate(pv_block)}

In [11]:
def _sorted_metapath_instances(center_to_ends, center_offset, end_offset):
    """Enumerate all end-center-end metapath instances and sort them.

    Parameters
    ----------
    center_to_ends : dict
        Maps the type-local index of a center node to an array with the
        type-local indices of its neighbours.
    center_offset : int
        Added to the center column to turn it into a global node id.
    end_offset : int
        Added to both end columns to turn them into global node ids.

    Returns
    -------
    numpy.ndarray
        int64 array of shape (n_instances, 3) whose rows are
        (end1, center, end2), ordered by end1, then end2, then center.
        The shape is (0, 3) when there are no instances.
    """
    triples = [(end1, center, end2)
               for center, ends in center_to_ends.items()
               for end1 in ends
               for end2 in ends]
    # reshape keeps the array two-dimensional even when `triples` is empty,
    # so the column operations below do not fail on an empty graph.
    instances = np.array(triples, dtype=np.int64).reshape(-1, 3)
    instances[:, 1] += center_offset
    instances[:, [0, 2]] += end_offset
    # lexsort treats the LAST key as the primary one: column 0, then 2, then 1.
    order = np.lexsort((instances[:, 1], instances[:, 2], instances[:, 0]))
    return instances[order]


# 0-1-0: CVE - product version - CVE
u_a_u = _sorted_metapath_instances(pv_cve_list, center_offset=num_cve, end_offset=0)

# 1-0-1: product version - CVE - product version
a_u_a = _sorted_metapath_instances(cve_pv_list, center_offset=0, end_offset=num_cve)

In [12]:
# Metapaths grouped by the node type they start and end on:
# entry 0 -> type 0 targets, entry 1 -> type 1 targets.
expected_metapaths = [[(0, 1, 0)], [(1, 0, 1)]]

# One output sub-directory per group, created if it does not exist yet.
for i in range(len(expected_metapaths)):
    group_dir = pathlib.Path(save_prefix + '{}'.format(i))
    group_dir.mkdir(parents=True, exist_ok=True)

# Metapath -> array of its instances.
metapath_indices_mapping = {
    (0, 1, 0): u_a_u,
    (1, 0, 1): a_u_a,
}

# Write, for every metapath, a pickle of per-target instance arrays and a
# plain-text adjacency list; then save the adjacency matrix and node types.
target_idx_lists = [np.arange(num_cve), np.arange(num_pv)]
offset_list = [0, num_cve]
for i, metapaths in enumerate(expected_metapaths):
    group_dir = save_prefix + '{}/'.format(i)
    type_offset = offset_list[i]
    for metapath in metapaths:
        instances = metapath_indices_mapping[metapath]
        file_stem = group_dir + '-'.join(map(str, metapath))

        # Single forward scan that records, for every target node, the
        # half-open row range [start, stop) of its instances. This relies on
        # the rows being grouped by their first column in ascending order.
        spans = []
        stop = 0
        for target_idx in target_idx_lists[i]:
            start = stop
            while stop < len(instances) and instances[stop, 0] == target_idx + type_offset:
                stop += 1
            spans.append((target_idx, start, stop))

        # <metapath>_idx.pickle: target index -> its instances with the
        # column order reversed.
        with open(file_stem + '_idx.pickle', 'wb') as out_file:
            pickle.dump({target: instances[lo:hi, ::-1] for target, lo, hi in spans},
                        out_file)

        # <metapath>.adjlist: one line per target, "target n1 n2 ...", where
        # the neighbours are the last column shifted back to type-local
        # indices; a target without instances gets a line with only its index.
        with open(file_stem + '.adjlist', 'w') as out_file:
            for target, lo, hi in spans:
                neighbors = instances[lo:hi, -1] - type_offset
                fields = [str(target)] + [str(n) for n in neighbors]
                out_file.write(' '.join(fields) + '\n')

scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))
np.save(save_prefix + 'node_types.npy', type_mask)

In [13]:
# output user_artist.npy: one (cve index, product-version index) row per row
# of `df`, in the row order of `df`.
# Dict lookups replace a `list.index` scan of the whole vocabulary per row.
cve_to_idx = {cve: pos for pos, cve in enumerate(cves)}
pv_to_idx = {pv: pos for pos, pv in enumerate(pvs)}
# The int64 cast fails loudly if a value is missing from the vocabulary
# (`map` would otherwise leave NaN there).
df['cve'] = df['cveID'].map(cve_to_idx).astype(np.int64)
df['pv'] = df['Product Version'].map(pv_to_idx).astype(np.int64)
# NOTE: from here on `cve_pv` is a NumPy array, no longer the training DataFrame.
cve_pv = df[['cve', 'pv']].to_numpy()
np.save(save_prefix + 'user_artist.npy', cve_pv)

In [14]:
# output positive and negative samples for training, validation and testing

def _negative_pairs(pos_pairs, n_rows, n_cols):
    """Return every (row, col) pair of an n_rows x n_cols grid that is not in `pos_pairs`.

    Parameters
    ----------
    pos_pairs : numpy.ndarray
        Integer array of shape (n, 2); may be in any order and may contain
        duplicates.
    n_rows, n_cols : int
        Grid dimensions.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (n_negative, 2) in row-major order, i.e. the
        order in which a nested `for row: for col:` loop visits the grid.
    """
    is_negative = np.ones((n_rows, n_cols), dtype=bool)
    if len(pos_pairs) > 0:
        is_negative[pos_pairs[:, 0], pos_pairs[:, 1]] = False
    return np.argwhere(is_negative)


np.random.seed(453289)

cve_pv = np.load(save_prefix + 'user_artist.npy')

# `train_idx` / `val_idx` / `test_idx` hold index labels of `df`, while
# `cve_pv` is a plain array with one row per row of `df`. Translate labels to
# row positions before indexing the array.
train_rows = df.index.get_indexer(train_idx)
val_rows = df.index.get_indexer(val_idx)
test_rows = df.index.get_indexer(test_idx)
for split_name, split_rows in (('train', train_rows), ('val', val_rows), ('test', test_rows)):
    # get_indexer marks labels that are absent from the index with -1.
    if (split_rows < 0).any():
        raise ValueError('{} split contains labels missing from df.index'.format(split_name))

# Negatives for validation / testing: pairs that are not positive anywhere in
# the data. A mask is used instead of a merge-style scan because the positive
# pairs are not sorted by (cve, pv), which such a scan would require.
neg_candidates = _negative_pairs(cve_pv, num_cve, num_pv)

idx = np.random.choice(len(neg_candidates), len(val_idx) + len(test_idx), replace=False)
val_neg_candidates = neg_candidates[np.sort(idx[:len(val_idx)])]
test_neg_candidates = neg_candidates[np.sort(idx[len(val_idx):])]

# Negatives for training: every pair that is not a *training* positive.
train_user_artist = cve_pv[train_rows]
train_neg_candidates = _negative_pairs(train_user_artist, num_cve, num_pv)

np.savez(save_prefix + 'train_val_test_neg_user_artist.npz',
         train_neg_user_artist=train_neg_candidates,
         val_neg_user_artist=val_neg_candidates,
         test_neg_user_artist=test_neg_candidates)
np.savez(save_prefix + 'train_val_test_pos_user_artist.npz',
         train_pos_user_artist=train_user_artist,
         val_pos_user_artist=cve_pv[val_rows],
         test_pos_user_artist=cve_pv[test_rows])

In [15]:
# Display the number of distinct CVE ids.
num_cve

13325

In [16]:
# Display the number of distinct product-version strings.
num_pv

10176