In [1]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split

In [2]:
# Output directory prefix for every artifact written by this notebook.
save_prefix = 'data/preprocessed/VULKG_processed/'
# Number of node types in the graph (0: CVEs, 1: product versions).
num_ntypes = 2

In [3]:
# Load the raw CVE / product-version table, drop every row that has a missing
# value in any column, and renumber the remaining rows from 0.
raw_table = pd.read_excel('data/raw/VULKG/label_with_negatives.xlsx')
df = raw_table.dropna(axis=0).reset_index(drop=True)

In [4]:
# Labels are read directly from the 'label' column of df, one entry per
# remaining spreadsheet row (i.e. per CVE / product-version pair, not per node).
# NOTE(review): the meaning of the label values is not visible in this notebook;
# given the input file name 'label_with_negatives' they presumably distinguish
# positive from negative pairs -- confirm against the data source.
labels = df['label'].values

In [5]:
# Distinct CVE identifiers in ascending order; a CVE's position in this list is
# its node index in the graph.
cves = sorted(set(df['cveID']))

In [6]:
# Number of distinct CVE identifiers.
print(len(cves))

152


In [6]:
# Distinct product versions in ascending order; a product version's position in
# this list (offset by len(cves)) is its node index in the graph.
pvs = sorted(set(df['Product Version']))

In [8]:
# Number of distinct product versions.
print(len(pvs))

201


In [7]:
# Build the symmetric adjacency matrix of the bipartite CVE / product-version graph.
# Node ordering: indices [0, len(cves)) are CVEs (type 0) and indices
# [len(cves), dim) are product versions (type 1).
num_cves = len(cves)
dim = num_cves + len(pvs)
type_mask = np.zeros((dim), dtype=int)
type_mask[num_cves:] = 1

# Resolve names to node indices through dictionaries built once, instead of
# calling list.index() for every row (which rescans the whole list each time).
cve_to_idx = {cve: idx for idx, cve in enumerate(cves)}
pv_to_idx = {pv: idx for idx, pv in enumerate(pvs)}
cve_nodes = np.array([cve_to_idx[cve] for cve in df['cveID']], dtype=int)
pv_nodes = np.array([pv_to_idx[pv] for pv in df['Product Version']], dtype=int) + num_cves

# NOTE(review): every row of df contributes an edge, whatever its 'label' value.
# If some rows are negative (non-existent) pairs they may need to be filtered
# out before building the graph -- confirm the intended semantics.
adjM = np.zeros((dim, dim), dtype=int)
adjM[cve_nodes, pv_nodes] = 1
adjM[pv_nodes, cve_nodes] = 1

In [8]:
# Build node features. CVE nodes receive random sparse features; every
# product-version node receives the mean of the features of the CVEs it is
# connected to.

# Size of the CVE feature matrix: one row per CVE, 100 feature columns.
rows, cols = len(cves), 100

# Sparsity: fraction of non-zero entries (here 5%).
density = 0.05

# Fixed seed so that re-running the notebook regenerates identical features
# (same value as the seed used for the train/val/test split).
feature_seed = 1566911444

# Random sparse matrix; non-zero values follow scipy's default uniform [0, 1)
# distribution.
cve_X = scipy.sparse.random(rows, cols, density, format='csr', random_state=feature_seed)

# Product-version -> CVE block of the adjacency matrix. The 'da2m' /
# 'director_actor' names are legacy identifiers kept so that later cells that
# reference them keep working.
adjM_da2m = adjM[len(cves):, :len(cves)]
neighbor_counts = adjM_da2m.sum(axis=1, keepdims=True)
# Row-normalise so each row averages over its neighbours. Rows without any
# neighbour stay all-zero instead of turning into NaN through a division by zero.
adjM_da2m_normalized = np.divide(
    adjM_da2m,
    neighbor_counts,
    out=np.zeros(adjM_da2m.shape, dtype=float),
    where=neighbor_counts > 0)
director_actor_X = scipy.sparse.csr_matrix(adjM_da2m_normalized).dot(cve_X)
# Stack in node order: CVE rows first, then product-version rows.
full_X = scipy.sparse.vstack([cve_X, director_actor_X])

In [9]:
# Sanity-check shapes: (num product versions, num CVEs),
# (num CVEs, num features) and (num nodes, num features).
print(adjM_da2m_normalized.shape)
print(cve_X.shape)
print(full_X.shape)

(201, 152)
(152, 100)
(353, 100)


In [11]:
# One list of metapaths per node type, indexed by node type:
# type 0 uses 0-1-0 and type 1 uses 1-0-1.
expected_metapaths = [
    [(0, 1, 0)],
    [(1, 0, 1)]
]
# make sure an output directory exists for every node type
for ntype in range(num_ntypes):
    pathlib.Path(save_prefix + str(ntype)).mkdir(parents=True, exist_ok=True)
for ntype in range(num_ntypes):
    ntype_prefix = save_prefix + str(ntype) + '/'
    ntype_metapaths = expected_metapaths[ntype]
    # file-name stem for each metapath, e.g. (0, 1, 0) -> '0-1-0'
    metapath_names = ['-'.join(str(t) for t in metapath) for metapath in ntype_metapaths]
    # metapath-based neighbor pairs for this node type
    neighbor_pairs = utils.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, ntype_metapaths)
    # one networkx graph per metapath, written as an adjacency list
    G_list = utils.preprocess.get_networkx_graph(neighbor_pairs, type_mask, ntype)
    for G, name in zip(G_list, metapath_names):
        nx.write_adjlist(G, ntype_prefix + name + '.adjlist')
    # node indices along each metapath instance, one array per metapath
    all_edge_metapath_idx_array = utils.preprocess.get_edge_metapath_idx_array(neighbor_pairs)
    for name, edge_metapath_idx_array in zip(metapath_names, all_edge_metapath_idx_array):
        np.save(ntype_prefix + name + '_idx.npy', edge_metapath_idx_array)

# persist the graph-level artifacts
# adjacency matrix over all nodes, stored in sparse CSR form
adjM_sparse = scipy.sparse.csr_matrix(adjM)
scipy.sparse.save_npz(save_prefix + 'adjM.npz', adjM_sparse)
# one feature matrix per node type (0: CVEs, 1: product versions)
for ntype in range(num_ntypes):
    ntype_rows = np.flatnonzero(type_mask == ntype)
    scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(ntype), full_X[ntype_rows])
# node type of every node
np.save(save_prefix + 'node_types.npy', type_mask)
# labels, one per row of df
np.save(save_prefix + 'labels.npy', labels)
# train/validation/test split over the label indices:
# 100 indices for validation, 100 for test, the remainder for training
rand_seed = 1566911444
all_idx = np.arange(len(labels))
train_idx, val_idx = train_test_split(all_idx, test_size=100, random_state=rand_seed)
train_idx, test_idx = train_test_split(train_idx, test_size=100, random_state=rand_seed)
# store every index set in ascending order
train_idx = np.sort(train_idx)
val_idx = np.sort(val_idx)
test_idx = np.sort(test_idx)
split_path = save_prefix + 'train_val_test_idx.npz'
np.savez(split_path,
         val_idx=val_idx,
         train_idx=train_idx,
         test_idx=test_idx)

In [13]:
# NOTE(review): the same labels.npy file is already written by the previous
# cell; this call overwrites it with the same `labels` array and looks redundant.
np.save(save_prefix + 'labels.npy', labels)