In [3]:
import networkx as nx
import numpy as np
import json

In [4]:
# Input: citation edge list, read below line by line as whitespace-separated node pairs
network_file = 'Filtered_HepTh_edges.txt'
# Input: tab-separated paper details (paper id, title, date), parsed by read_dates
details_file = 'updated_paper_details.txt'
# Output: JSON file that receives the PageRank scores and the ranked node list
pagerank_file = 'HepTh_pagerank_results.json'

### Filter nodes based on the largest SCC

In [5]:
# 1. Load the original edge list

# Directed graph: one edge per "source target" line
G = nx.DiGraph()

with open(network_file, "r") as edge_list:
    for line_number, raw_line in enumerate(edge_list, start=1):
        # split() with no arguments already ignores surrounding whitespace
        fields = raw_line.split()
        if len(fields) == 2:
            G.add_edge(*fields)
            continue
        # Stop at the first malformed line; edges read so far stay in G
        print(f"Error: Line {line_number} does not contain exactly two nodes.")
        break

print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())

Number of nodes: 25031
Number of edges: 324459


In [6]:
# 2. Pick the strongly connected component (SCC) with the most nodes
largest_scc = max(nx.strongly_connected_components(G), key=len)

# 3. Take an independent copy of the subgraph induced by that component
largest_scc_subgraph = G.subgraph(largest_scc).copy()

# 4. Write the component's edges, one "source target" pair per line
with open('edges.txt', 'w') as out_file:
    out_file.writelines(
        f"{source} {target}\n" for source, target in largest_scc_subgraph.edges()
    )

scc_node_count = len(largest_scc_subgraph.nodes())
scc_edge_count = len(largest_scc_subgraph.edges())
print(f"Largest SCC with {scc_node_count} nodes and {scc_edge_count} edges saved to 'edges.txt'.")


Largest SCC with 13056 nodes and 203852 edges saved to 'edges.txt'.


### Calculate the PageRank result

In [7]:
def read_network(file_path):
    """Build a directed graph from a whitespace-separated edge list file.

    Every non-blank line must contain exactly two tokens, ``source target``,
    and contributes the edge ``source -> target``. Node identifiers are kept
    as the strings read from the file.

    Args:
        file_path: Path of the edge list file to read.

    Returns:
        An ``nx.DiGraph`` holding one edge per valid line.

    Raises:
        ValueError: If a non-blank line does not contain exactly two tokens;
            the message names the file and the offending line number.
    """
    G = nx.DiGraph()
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            if not tokens:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            if len(tokens) != 2:
                raise ValueError(
                    f"{file_path}: line {line_number} does not contain "
                    f"exactly two nodes: {line.strip()!r}"
                )
            source, target = tokens
            G.add_edge(source, target)
    return G

def read_dates(file_path):
    """Parse the tab-separated paper details file into a lookup table.

    Each line is expected to hold at least three tab-separated fields:
    paper id, title and date. Lines with fewer fields are skipped, but they
    still occupy a position, so every key equals the 1-based line number of
    the record (as a string).

    Args:
        file_path: Path of the UTF-8 encoded details file.

    Returns:
        dict mapping ``str(line_number)`` to ``(paper_id, title, date)``,
        each field stripped of surrounding whitespace.
    """
    records = {}
    with open(file_path, 'r', encoding='utf-8') as handle:
        for line_number, raw_line in enumerate(handle, start=1):
            fields = raw_line.strip().split('\t')
            if len(fields) >= 3:
                # Only the first three columns are used; extras are ignored.
                paper_id, title, date = (field.strip() for field in fields[:3])
                records[str(line_number)] = (paper_id, title, date)
    return records

def calculate_weights(G, dates, current_year=2024, lambda_decay=0.1):
    """Compute a recency-discounted citation weight for every node.

    The weight of a node is::

        in_citations(node) * exp(-lambda_decay * (current_year - year(node)))

    Args:
        G: Directed graph; every edge ``(source, target)`` counts as one
            citation of ``target``.
        dates: Mapping ``node -> (paper_id, title, date)`` as produced by
            ``read_dates``. The year is taken as the text before the first
            ``'-'`` of ``date`` (assumes a ``YYYY-...`` layout; verify
            against the details file).
        current_year: Reference year used to compute a paper's age.
        lambda_decay: Exponential decay rate applied per year of age.

    Returns:
        dict mapping each node of ``G`` to its weight. Nodes that have no
        entry in ``dates`` get weight 0.

    Raises:
        ValueError: If the leading component of a date is not an integer.
    """
    # Position of the date inside the (paper_id, title, date) tuples; index 1
    # is the title, which must not be parsed as a year.
    date_field = 2

    citation_counts = {node: 0 for node in G.nodes()}
    for _, target in G.edges():
        citation_counts[target] += 1

    time_decay = {}
    for node in G.nodes():
        if node in dates:
            year = int(dates[node][date_field].split('-')[0])
            time_decay[node] = np.exp(-lambda_decay * (current_year - year))

    # Nodes without a known date fall back to a decay factor of 0.
    weights = {node: citation_counts[node] * time_decay.get(node, 0) for node in G.nodes()}
    return weights

def weighted_pagerank(G, weights, alpha=0.85, max_iter=100, tol=1.0e-6):
    """Iteratively compute a weighted PageRank over a directed graph.

    Each node ``pred`` splits its current rank among its successors in
    proportion to the successors' weights: a successor ``node`` receives
    ``pagerank[pred] * weights[node] / sum(weights of pred's successors)``.
    The shares handed out by one node therefore sum to 1 whenever that sum
    is non-zero.

    Args:
        G: Directed graph supporting ``len``, iteration over nodes,
            ``predecessors(node)`` and ``successors(node)``.
        weights: Mapping ``node -> non-negative weight`` for every node of G.
        alpha: Damping factor.
        max_iter: Maximum number of iterations.
        tol: Per-node absolute change below which iteration stops.

    Returns:
        dict mapping each node to its score. If convergence is not reached
        within ``max_iter`` iterations the last iterate is returned.

    Notes:
        A node whose successors all have weight 0 (or that has no
        successors) passes on no rank; its mass is not redistributed.
    """
    N = len(G)

    # The successor-weight total of a node never changes between
    # iterations, so compute it once instead of once per edge per iteration.
    out_weight_sum = {
        node: sum(weights[succ] for succ in G.successors(node)) for node in G
    }

    pagerank = {node: 1.0 / N for node in G}
    for _ in range(max_iter):
        new_pagerank = {}
        for node in G:
            rank_sum = 0
            for pred in G.predecessors(node):
                weight_sum = out_weight_sum[pred]
                if weight_sum != 0:
                    # Share of pred's rank going to this node: the node's
                    # weight relative to all of pred's successors.
                    rank_sum += pagerank[pred] * weights[node] / weight_sum
            new_pagerank[node] = (1 - alpha) / N + alpha * rank_sum
        if all(abs(new_pagerank[node] - pagerank[node]) < tol for node in pagerank):
            return new_pagerank
        pagerank = new_pagerank
    return pagerank

def get_important_nodes(pagerank):
    """Return the node ids ordered from highest to lowest PageRank score.

    Nodes with equal scores keep the relative order they have in
    ``pagerank``.
    """
    ranked = sorted(pagerank.items(), key=lambda item: item[1], reverse=True)
    return [node for node, _ in ranked]


In [8]:
# Rebuild the graph from the saved largest-SCC edge list and load the paper details
G = read_network('edges.txt')
dates = read_dates(details_file)

# Per-node weights: citation counts discounted by paper age
weights = calculate_weights(G, dates)

# Weighted PageRank scores, then the nodes ranked by descending score
pagerank = weighted_pagerank(G, weights)
important_nodes = get_important_nodes(pagerank)

# Persist both the scores and the ranking as a single JSON document
results = {'pagerank': pagerank, 'important_nodes': important_nodes}
with open(pagerank_file, 'w') as out_file:
    json.dump(results, out_file)