## 方案一
- 只有两个节点信息

In [46]:
import networkx as nx
import numpy as np
import pandas as pd
from torch_geometric.utils import k_hop_subgraph
import torch
from tqdm import tqdm
import random
import os
random.seed(42)
tqdm.pandas()

text_info = pd.read_csv('../../dataset/cora/cora_text.tsv', delimiter='\t')[['pid', 'title', 'abs', 'label']]
cites = pd.read_csv('../../dataset/cora/cora.cites', delimiter='\t', header=None)
cites.columns = ['cite_id', 'cited_id']
text_info

Unnamed: 0,pid,title,abs,label
0,31336,Title: The megaprior heuristic for discovering...,Abstract: Several computer algorithms for disc...,Neural_Networks
1,1061127,Title: Applications of machine learning: a med...,Abstract: This paper describes preliminary wor...,Rule_Learning
2,1106406,"Title: Submitted to NIPS96, Section: Applicati...","Abstract: In cellular telephone systems, an im...",Reinforcement_Learning
3,13195,Title: Planning and Acting in Partially Observ...,"Abstract: In this paper, we bring techniques f...",Reinforcement_Learning
4,37879,Note: c Massachusetts Institute of Technology ...,Abstract: Graphical models enhance the represe...,Probabilistic_Methods
...,...,...,...,...
2703,1128975,Title: learning easier tasks. More work is nec...,Abstract: We have attempted to obtain a strong...,Genetic_Algorithms
2704,1128977,Title: A Genome Compiler for High Performance ...,Abstract: Genetic Programming is very computat...,Genetic_Algorithms
2705,1128978,Title: The MAX Problem for Genetic Programming...,Abstract: The Crossover operator is common to ...,Genetic_Algorithms
2706,117328,Title: Functional Representation as Design Rat...,Abstract: Design rationale is a record of desi...,Case_Based


In [52]:
# 正例: 有边的node pair
# 反例: 没有边的node pair
# 各10个

sample = text_info.copy()

pos_sample = cites.sample(10, random_state=42).reset_index(drop=True)
pos_sample['link'] = 1

neg_srcs = sample['pid'].sample(10, random_state=42).values
neg_tars = []
for node in neg_srcs:
    neighbors = set(cites[cites.cite_id == node].cited_id) | set(cites[cites.cited_id == node].cite_id)
    neg_tar = sample[~sample.pid.isin(neighbors)].pid.sample(1, random_state=42).values[0]
    neg_tars.append(neg_tar)
    
neg_sample = pd.DataFrame({'cite_id': neg_srcs, 'cited_id': neg_tars, 'link': [0] * len(neg_srcs)})

sample_one = pd.concat([pos_sample, neg_sample], axis=0)


sample_one = sample_one.merge(text_info, how='left', left_on='cite_id', right_on='pid').drop(columns='pid').rename(columns={
    'title': 'cite_title', 'abs': 'cite_abs', 'label': 'cite_label'})

sample_one = sample_one.merge(text_info, how='left', left_on='cited_id', right_on='pid').drop(columns='pid').rename(columns={
'title': 'cited_title', 'abs': 'cited_abs', 'label': 'cited_label'})

path = 'sample/no'
if not os.path.exists(path):
    os.mkdir(path)
    
sample_one = sample_one[['cite_id', 'cite_title', 'cite_abs', 'cite_label', 'cited_id', 'cited_title', 'cited_abs', 'cited_label', 'link']]
sample_one.to_csv(f'{path}/sample_one.csv', index=False)

## 方案二: two_hop间接链接
- 正例: (A, B, C, A->B, B->C, A->C成立)
- 反例: (A, B, C, A->B, B->C, A->C不成立)

In [43]:
import networkx as nx
import numpy as np
import pandas as pd
from torch_geometric.utils import k_hop_subgraph
import torch
from tqdm import tqdm
import random
import os
random.seed(42)
tqdm.pandas()

text_info = pd.read_csv('../../dataset/cora/cora_text.tsv', delimiter='\t')[['pid', 'title', 'abs', 'label']]
cites = pd.read_csv('../../dataset/cora/cora.cites', delimiter='\t', header=None)
cites.columns = ['cite_id', 'cited_id']
text_info

Unnamed: 0,pid,title,abs,label
0,31336,Title: The megaprior heuristic for discovering...,Abstract: Several computer algorithms for disc...,Neural_Networks
1,1061127,Title: Applications of machine learning: a med...,Abstract: This paper describes preliminary wor...,Rule_Learning
2,1106406,"Title: Submitted to NIPS96, Section: Applicati...","Abstract: In cellular telephone systems, an im...",Reinforcement_Learning
3,13195,Title: Planning and Acting in Partially Observ...,"Abstract: In this paper, we bring techniques f...",Reinforcement_Learning
4,37879,Note: c Massachusetts Institute of Technology ...,Abstract: Graphical models enhance the represe...,Probabilistic_Methods
...,...,...,...,...
2703,1128975,Title: learning easier tasks. More work is nec...,Abstract: We have attempted to obtain a strong...,Genetic_Algorithms
2704,1128977,Title: A Genome Compiler for High Performance ...,Abstract: Genetic Programming is very computat...,Genetic_Algorithms
2705,1128978,Title: The MAX Problem for Genetic Programming...,Abstract: The Crossover operator is common to ...,Genetic_Algorithms
2706,117328,Title: Functional Representation as Design Rat...,Abstract: Design rationale is a record of desi...,Case_Based


In [44]:
## 首先计算所有节点的one_hop two_hop node

sample = text_info.copy()
sample['one_hop'] = None
sample['two_hop'] = None
sample['intersection'] = None

for i in tqdm(range(len(sample))):
    pid = sample.loc[i, 'pid']
    one_hop = set(cites[cites.cite_id == pid].cited_id) | set(cites[cites.cited_id == pid].cite_id)
    
    sample.at[i, 'one_hop'] = one_hop if one_hop else np.nan
    
    # two_hop
    two_hop = {}
    
    for node in one_hop:
        if cites[cites.cite_id == node].shape[0] > 0 or cites[cites.cited_id == node].shape[0] > 0:
            two_hop[node] = []
            if cites[cites.cite_id == node].shape[0] > 0:
                two_hop[node] += cites[cites.cite_id == node].cited_id.values.tolist()
            if cites[cites.cited_id == node].shape[0] > 0:
                two_hop[node] += cites[cites.cited_id == node].cite_id.values.tolist()
    
    sample.at[i, 'two_hop'] = two_hop if two_hop else np.nan
    
    # intersection
    intersection = []
    
    if two_hop:
        for one in two_hop.keys():
            for two in two_hop[one]:
                if two in one_hop:
                    intersection.append([one, two])
    
    sample.at[i, 'intersection'] = intersection if intersection else np.nan
    
sample

100%|██████████| 2708/2708 [00:31<00:00, 85.16it/s] 


Unnamed: 0,pid,title,abs,label,one_hop,two_hop,intersection
0,31336,Title: The megaprior heuristic for discovering...,Abstract: Several computer algorithms for disc...,Neural_Networks,"{1129442, 10531, 686532, 31349, 31353}","{1129442: [10531, 31336, 31353, 43698], 10531:...","[[1129442, 10531], [1129442, 31353], [10531, 1..."
1,1061127,Title: Applications of machine learning: a med...,Abstract: This paper describes preliminary wor...,Rule_Learning,{2440},"{2440: [1000012, 1061127, 1106388, 1107095, 11...",
2,1106406,"Title: Submitted to NIPS96, Section: Applicati...","Abstract: In cellular telephone systems, an im...",Reinforcement_Learning,"{6169, 114, 6213, 23774}","{6169: [1106406, 1114629, 1130567, 170338, 205...","[[6213, 23774], [23774, 6213]]"
3,13195,Title: Planning and Acting in Partially Observ...,"Abstract: In this paper, we bring techniques f...",Reinforcement_Learning,"{1107312, 755217, 1120731}","{1107312: [7419, 13195, 28278, 28387], 755217:...",
4,37879,Note: c Massachusetts Institute of Technology ...,Abstract: Graphical models enhance the represe...,Probabilistic_Methods,{1105116},"{1105116: [35922, 37879]}",
...,...,...,...,...,...,...,...
2703,1128975,Title: learning easier tasks. More work is nec...,Abstract: We have attempted to obtain a strong...,Genetic_Algorithms,"{486840, 575795, 144701}","{486840: [1128975, 595063, 35, 35852], 575795:...",
2704,1128977,Title: A Genome Compiler for High Performance ...,Abstract: Genetic Programming is very computat...,Genetic_Algorithms,"{593060, 467383}","{593060: [1128977, 38205, 57119], 467383: [112...",
2705,1128978,Title: The MAX Problem for Genetic Programming...,Abstract: The Crossover operator is common to ...,Genetic_Algorithms,"{579008, 592993, 592996, 578669}","{579008: [1128943, 1128978, 248425, 593921, 59...",
2706,117328,Title: Functional Representation as Design Rat...,Abstract: Design rationale is a record of desi...,Case_Based,"{95188, 91852, 38829, 1115166}","{95188: [1110579, 1116410, 20179, 35490, 64271...",


In [45]:
pos = sample[~sample.intersection.isna()].sample(10, random_state=42).reset_index(drop=True)
neg = sample[sample.intersection.isna()].sample(10, random_state=42).reset_index(drop=True)


pos_sample = pd.DataFrame({'src': [0] * 10, 'one_hop': [0] * 10, 'two_hop': [0] * 10, 'src_to_one': None, 'one_to_two': None, 'link': [1] * 10})
# 正样本
for i in range(len(pos)):
    src, hops = pos.loc[i, 'pid'], random.choice(pos.loc[i, 'intersection'])
    one_hop, two_hop = hops[0], hops[1]
    pos_sample.loc[i, 'src'], pos_sample.loc[i, 'one_hop'], pos_sample.loc[i, 'two_hop'] = src, one_hop, two_hop
    
    if cites[(cites.cite_id == src) & (cites.cited_id == one_hop)].shape[0] > 0:
        pos_sample.loc[i, 'src_to_one'] = 1
    elif cites[(cites.cited_id == src) & (cites.cite_id == one_hop)].shape[0] > 0:
        pos_sample.loc[i, 'src_to_one'] = 0
        
    if cites[(cites.cite_id == one_hop) & (cites.cited_id == two_hop)].shape[0] > 0:
        pos_sample.loc[i, 'one_to_two'] = 1
    elif cites[(cites.cited_id == one_hop) & (cites.cite_id == two_hop)].shape[0] > 0:
        pos_sample.loc[i, 'one_to_two'] = 0

        
neg_sample = pd.DataFrame({'src': [0] * 10, 'one_hop': [0] * 10, 'two_hop': [0] * 10, 'src_to_one': None, 'one_to_two': None, 'link': [0] * 10})

for i in range(len(neg)):
    src, one_hop = pos.loc[i, 'pid'], random.choice(list(pos.loc[i, 'two_hop'].keys()))
    two_hop = random.choice(pos.loc[i, 'two_hop'][one_hop])
    neg_sample.loc[i, 'src'], neg_sample.loc[i, 'one_hop'], neg_sample.loc[i, 'two_hop'] = src, one_hop, two_hop
    if cites[(cites.cite_id == src) & (cites.cited_id == one_hop)].shape[0] > 0:
        neg_sample.loc[i, 'src_to_one'] = 1
    elif cites[(cites.cited_id == src) & (cites.cite_id == one_hop)].shape[0] > 0:
        neg_sample.loc[i, 'src_to_one'] = 0
        
    if cites[(cites.cite_id == one_hop) & (cites.cited_id == two_hop)].shape[0] > 0:
        neg_sample.loc[i, 'one_to_two'] = 1
    elif cites[(cites.cited_id == one_hop) & (cites.cite_id == two_hop)].shape[0] > 0:
        neg_sample.loc[i, 'one_to_two'] = 0
        
two_sample = pd.concat([pos_sample, neg_sample], axis=0)
two_sample = two_sample.merge(text_info, how='left', left_on='src', right_on='pid').drop(columns='pid').rename(columns={
'title': 'src_title', 'abs': 'src_abs', 'label': 'src_label'})
two_sample = two_sample.merge(text_info, how='left', left_on='one_hop', right_on='pid').drop(columns='pid').rename(columns={
'title': 'one_hop_title', 'abs': 'one_hop_abs', 'label': 'one_hop_label'})
two_sample = two_sample.merge(text_info, how='left', left_on='two_hop', right_on='pid').drop(columns='pid').rename(columns={
'title': 'two_hop_title', 'abs': 'two_hop_abs', 'label': 'two_hop_label'})

two_sample = two_sample[['src', 'src_title', 'src_abs', 'src_label', 
                         'one_hop', 'one_hop_title', 'one_hop_abs', 'one_hop_label',
                        'two_hop', 'two_hop_title', 'two_hop_abs', 'two_hop_label',
                        'src_to_one', 'one_to_two', 'link']]

two_sample

path = 'sample/two_hop'
if not os.path.exists(path):
    os.mkdir(path)

two_sample.to_csv(f'{path}/sample_two.csv', index=False)

## 方案三: 全子图信息
- 抽三张3hop子图
- 和原点有边为正样本，没有边为负样本

In [40]:
import networkx as nx
import numpy as np
import pandas as pd
from torch_geometric.utils import k_hop_subgraph
import torch
from tqdm import tqdm
import random
import os
random.seed(42)
tqdm.pandas()

text_info = pd.read_csv('../../dataset/cora/cora_text.tsv', delimiter='\t')[['pid', 'title', 'abs', 'label']]
cites = pd.read_csv('../../dataset/cora/cora.cites', delimiter='\t', header=None)
cites.columns = ['cite_id', 'cited_id']
edge_index = torch.tensor(cites.values.T)
text_info

Unnamed: 0,pid,title,abs,label
0,31336,Title: The megaprior heuristic for discovering...,Abstract: Several computer algorithms for disc...,Neural_Networks
1,1061127,Title: Applications of machine learning: a med...,Abstract: This paper describes preliminary wor...,Rule_Learning
2,1106406,"Title: Submitted to NIPS96, Section: Applicati...","Abstract: In cellular telephone systems, an im...",Reinforcement_Learning
3,13195,Title: Planning and Acting in Partially Observ...,"Abstract: In this paper, we bring techniques f...",Reinforcement_Learning
4,37879,Note: c Massachusetts Institute of Technology ...,Abstract: Graphical models enhance the represe...,Probabilistic_Methods
...,...,...,...,...
2703,1128975,Title: learning easier tasks. More work is nec...,Abstract: We have attempted to obtain a strong...,Genetic_Algorithms
2704,1128977,Title: A Genome Compiler for High Performance ...,Abstract: Genetic Programming is very computat...,Genetic_Algorithms
2705,1128978,Title: The MAX Problem for Genetic Programming...,Abstract: The Crossover operator is common to ...,Genetic_Algorithms
2706,117328,Title: Functional Representation as Design Rat...,Abstract: Design rationale is a record of desi...,Case_Based


In [42]:

src_nodes = text_info.pid.sample(3, random_state=42)

path = 'sample/one_hop_subgraph'

if not os.path.exists(path):
    os.mkdir(path)

# 储存节点语义信息和边信息
for node in src_nodes:
    node_path = os.path.join(path, str(node))
    if not os.path.exists(node_path):
        os.mkdir(node_path)
    subset, subgraph, mapping, mask = k_hop_subgraph(node, 1, edge_index, relabel_nodes=False, directed=False)
    node_info = text_info[text_info.pid.isin(subset.tolist())].reset_index(drop=True)
    edge_info = pd.DataFrame(subgraph.t())
    edge_info.columns = ['cite_id', 'cited_id']
    node_info.to_csv(f'{node_path}/sample_{str(node)}.csv', index=False)
    edge_info.to_csv(f'{node_path}/edge_{str(node)}.csv', index=False)