### Data Cleaning
In the file 'Cit_HepTh.txt', some paper IDs have lost their leading digits (e.g. 1001 should be 9301001).

In [1]:
# Prefix restored in front of a truncated paper ID, keyed by the ID's length.
# NOTE(review): 6-character IDs are zero-padded while 4- and 5-character IDs
# get a "93" prefix -- confirm this matches how the dataset lost its digits.
_ID_PREFIX_BY_LENGTH = {4: "930", 5: "93", 6: "0"}


def _normalize_node_id(node_id):
    """Return node_id with the prefix for its length prepended (unchanged if none applies)."""
    return f"{_ID_PREFIX_BY_LENGTH.get(len(node_id), '')}{node_id}"


def process_node_ids(input_file, output_file):
    """Rewrite an edge list so that truncated paper IDs get their prefix back.

    Lines starting with "#" are copied through unchanged and blank lines are
    dropped. Every other line must hold exactly two whitespace-separated node
    IDs; both are normalized (4 characters -> "930" prefix, 5 -> "93",
    6 -> "0", any other length untouched) and written as "from<TAB>to".

    Args:
        input_file: Path of the edge list to read (UTF-8).
        output_file: Path of the edge list to write (UTF-8, overwritten).

    Raises:
        ValueError: If a non-comment, non-blank line does not contain exactly
            two fields.
    """
    # Read everything before opening the output so both paths may be the same file.
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(output_file, 'w', encoding='utf-8') as file:
        for line in lines:
            if line.startswith("#"):
                file.write(line)
                continue

            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline) would otherwise break
                # the two-field unpacking below.
                continue

            from_node, to_node = fields
            file.write(
                f"{_normalize_node_id(from_node)}\t{_normalize_node_id(to_node)}\n"
            )


In [2]:
# Normalize the node IDs of the raw citation list and save the result as the
# edge list used by the following cells.
input_file = 'Cit_HepTh.txt'
output_file = 'HepTh_edges.txt'
process_node_ids(input_file, output_file)

Check the cleaned edge list (HepTh_edges.txt, derived from Cit_HepTh.txt) for nodes that do not exist in paper_details.

In [3]:
def read_paper_details(file_path):
    """Collect the node IDs listed in a tab-separated paper-details file.

    The node ID of a line is its first tab-separated field, taken after
    surrounding whitespace has been stripped from the line. Blank lines are
    skipped.

    Args:
        file_path: Path of the paper-details file (UTF-8).

    Returns:
        A set of node-ID strings.
    """
    node_ids = set()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # str.split('\t') always returns at least one element, so the
            # field itself has to be tested to detect a blank line.
            node_id = line.strip().split('\t')[0]
            if node_id:
                node_ids.add(node_id)
    return node_ids

def check_citations(citations_file, paper_details_file, output_file):
    """Write the node IDs used in an edge list that have no paper details.

    Lines starting with "#" and blank lines are ignored. Every other line
    must hold exactly two whitespace-separated node IDs. A 4-character ID is
    prefixed with "930" before the lookup (other lengths are looked up as
    they are). Each ID that is absent from the set returned by
    read_paper_details(paper_details_file) is written to output_file, one per
    line, in sorted order.

    Args:
        citations_file: Path of the edge list to check (UTF-8).
        paper_details_file: Path passed to read_paper_details.
        output_file: Path of the missing-node list to write (UTF-8,
            overwritten).

    Raises:
        ValueError: If a non-comment, non-blank line does not contain exactly
            two fields.
    """
    paper_details_ids = read_paper_details(paper_details_file)
    missing_nodes = set()

    with open(citations_file, 'r', encoding='utf-8') as file:
        for line in file:
            if line.startswith("#"):
                continue

            fields = line.split()
            if not fields:
                # A blank line would otherwise break the two-field unpacking.
                continue

            from_node, to_node = fields
            for node_id in (from_node, to_node):
                if len(node_id) == 4:
                    node_id = f"930{node_id}"
                if node_id not in paper_details_ids:
                    missing_nodes.add(node_id)

    # The input is closed before the output is opened, so both paths may be
    # the same file.
    with open(output_file, 'w', encoding='utf-8') as file:
        for node_id in sorted(missing_nodes):
            file.write(f"{node_id}\n")

In [4]:
# List every node of the cleaned edge list that has no entry in the paper
# details file; the IDs are written to missing_nodes.txt.
citations_file = 'HepTh_edges.txt'
paper_details_file = 'HepTh_abstracts/cleaned_paper_details.txt'
output_file = 'missing_nodes.txt'
check_citations(citations_file, paper_details_file, output_file)


missing_nodes.txt stores the paper nodes that have no paper details. The next step removes every edge in HepTh_edges.txt that contains one of these missing nodes and saves the remaining edges to Filtered_HepTh_edges.txt.

In [6]:
def load_missing_nodes(missing_nodes_file):
    """Load the node IDs stored one per line in ``missing_nodes_file``.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored.

    Returns:
        A set of node-ID strings.
    """
    node_ids = set()
    with open(missing_nodes_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped:
                node_ids.add(stripped)
    return node_ids

def filter_edges(edges_file, missing_nodes, output_file):
    """Copy an edge list, dropping every edge that touches a missing node.

    Lines starting with "#" are copied through unchanged and blank lines are
    dropped. Every other line must hold exactly two whitespace-separated node
    IDs; the edge is written as "from<TAB>to" only when neither ID is in
    missing_nodes.

    Args:
        edges_file: Path of the edge list to read (UTF-8).
        missing_nodes: Collection of node-ID strings to exclude.
        output_file: Path of the filtered edge list to write (UTF-8,
            overwritten).

    Raises:
        ValueError: If a non-comment, non-blank line does not contain exactly
            two fields.
    """
    # Hash-based lookup even when the caller passes a list or tuple.
    excluded = frozenset(missing_nodes)

    # Read everything before opening the output so both paths may be the same file.
    with open(edges_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    with open(output_file, 'w', encoding='utf-8') as file:
        for line in lines:
            if line.startswith("#"):
                file.write(line)
                continue

            fields = line.split()
            if not fields:
                # A blank line would otherwise break the two-field unpacking.
                continue

            from_node, to_node = fields
            if from_node not in excluded and to_node not in excluded:
                file.write(f"{from_node}\t{to_node}\n")


In [7]:
# Inputs: the missing-node list and the cleaned edge list written by the
# earlier cells. Output: the edge list without edges touching a missing node.
# Note that output_file is rebound here; earlier cells use the same name for
# other files.
missing_nodes_file = 'missing_nodes.txt'  
edges_file = 'HepTh_edges.txt'  
output_file = 'Filtered_HepTh_edges.txt'  
missing_nodes = load_missing_nodes(missing_nodes_file)
# Filter edges and save results
filter_edges(edges_file, missing_nodes, output_file)
