In [20]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 1. 读取 content 文件（包含节点 ID 和标签）
content = pd.read_csv('citeseer.content', sep='\t', header=None)
content.columns = ['paper_id'] + [f'feat_{i}' for i in range(content.shape[1] - 2)] + ['label']
content.drop_duplicates(subset='paper_id', inplace=True)  # 去重

content_paper_ids = content['paper_id'].tolist()
labels = content['label'].tolist()

# ✅ 2. 创建 paper_id 到 node_id 编码（仅对 content 中的 paper）
paper_id_to_node = {pid: idx for idx, pid in enumerate(content_paper_ids)}
valid_nodes_set = set(content_paper_ids)

# ✅ 3. 标签编码
label_encoder = LabelEncoder()
label_encoded = label_encoder.fit_transform(labels)
label_df = pd.DataFrame({
    'node': list(range(len(content_paper_ids))),
    'label': label_encoded
})
label_df.to_csv("citeseer-labels.csv", index=False, header=False)

# ✅ 4. 读取 cites 边文件
cites = pd.read_csv('citeseer.cites', sep='\t', header=None)
cites.columns = ['source', 'target']

# ✅ 5. 过滤出 source 和 target 都在 content 中的边（双向都在 content 中）
cites_filtered = cites[cites['source'].isin(valid_nodes_set) & cites['target'].isin(valid_nodes_set)]

# ✅ 6. 将边映射为连续编号（0 ~ N-1）
cites_mapped = cites_filtered.applymap(lambda x: paper_id_to_node[x])
cites_mapped.to_csv("citeseer.csv", sep=',', header=False, index=False)

# ✅ 验证信息输出
print(f"原始边数: {len(cites)}")
print(f"保留有效边数: {len(cites_mapped)}")
print(f"节点数: {len(content_paper_ids)}")


原始边数: 4732
保留有效边数: 3298
节点数: 3312


  content = pd.read_csv('citeseer.content', sep='\t', header=None)
  cites_mapped = cites_filtered.applymap(lambda x: paper_id_to_node[x])


In [21]:
cites_mapped

Unnamed: 0,source,target
15,2177,2903
17,1011,2028
18,1011,2029
19,2179,2200
21,1012,2181
...,...,...
4727,2171,2172
4728,2174,1385
4729,2174,2173
4730,1008,455


In [23]:
cites_mapped['source'].nunique() 

1578