In [3]:
import json
import zipfile
import os
import networkx as nx

In [4]:
def extract_dataset(dataset_folder=None, out_path="dblp_subset.json"):
    """Filter the raw DBLP reference dump down to a small JSON-lines subset.

    Every regular file named ``dblp-ref*.json`` in the discovered ``dblp-ref``
    folder is read line by line (one JSON object per line). Papers with
    ``2010 <= year <= 2015`` and ``n_citation >= 60`` are written to
    ``out_path``, one JSON object per line, in sorted-filename order.

    Args:
        dataset_folder: Optional folder to try first. If it is not an existing
            directory the default locations (``./dblp-ref``, ``../dblp-ref``
            and a legacy absolute path) are tried in that order.
        out_path: Destination file. A relative path resolves against the
            current working directory.

    Returns:
        None. Progress and the final count are printed.

    Raises:
        FileNotFoundError: If no candidate folder exists, or the chosen folder
            contains no matching files.
    """
    candidates = [
        os.path.join(os.getcwd(), 'dblp-ref'),
        os.path.abspath(os.path.join(os.getcwd(), '..', 'dblp-ref')),
        # Machine-specific fallback, kept only for backward compatibility.
        # Prefer passing `dataset_folder` explicitly.
        '/Users/ankushchhabra/Downloads/dblp-ref',
    ]
    if dataset_folder is not None:
        candidates.insert(0, dataset_folder)

    # First existing directory wins.
    folder = next((c for c in candidates if os.path.isdir(c)), None)
    if folder is None:
        raise FileNotFoundError('Could not find a local dblp-ref folder. Checked: ' + str(candidates))

    print(f'Using dataset folder: {folder}')

    def is_valid(paper):
        """Return True for a JSON object that passes the year/citation filter."""
        if not isinstance(paper, dict):
            return False
        year = paper.get('year', 0)
        n_citation = paper.get('n_citation', 0)
        # Non-numeric values (strings, null, lists) can never satisfy the
        # filter, so reject them up front instead of catching TypeError.
        if not isinstance(year, (int, float)) or not isinstance(n_citation, (int, float)):
            return False
        return 2010 <= year <= 2015 and n_citation >= 60

    # Only regular files: a directory that happens to match the name pattern
    # would otherwise make open() fail.
    files = sorted(
        f for f in os.listdir(folder)
        if f.startswith('dblp-ref') and f.endswith('.json')
        and os.path.isfile(os.path.join(folder, f))
    )
    if not files:
        raise FileNotFoundError(f'No dblp-ref-*.json files found in {folder}')

    # Stream matches to a temporary sibling file and move it into place at the
    # end, so `out_path` is never observed half-written and the selection does
    # not have to be held in memory.
    tmp_path = os.fspath(out_path) + '.tmp'
    selected = 0
    try:
        with open(tmp_path, 'w', encoding='utf-8') as out:
            for fname in files:
                file_path = os.path.join(folder, fname)
                print(f'Processing {file_path} ...')
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line in f:
                        try:
                            paper = json.loads(line)
                        except json.JSONDecodeError:
                            # Blank or malformed line: skip it.
                            continue
                        if is_valid(paper):
                            json.dump(paper, out)
                            out.write('\n')
                            selected += 1

        print(f'Total papers selected: {selected}')
        os.replace(tmp_path, out_path)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): never leave the
        # temporary file behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    print(f'Filtered dataset saved as {out_path}')

# Example: extract_dataset()  # call this if you want to produce dblp_subset.json locally

In [5]:
def construct_graph(dataset_folder=None, subset_path="dblp_subset.json"):
    """Build a directed citation graph (edge ``citing -> cited``).

    If ``subset_path`` exists it is used as-is (one JSON object per line, no
    further filtering). Otherwise every regular ``dblp-ref*.json`` file in the
    discovered ``dblp-ref`` folder is read and filtered with the same rule as
    `extract_dataset` (``2010 <= year <= 2015`` and ``n_citation >= 60``).
    Only references that point at another collected paper become edges.

    Args:
        dataset_folder: Optional folder to try first when looking for the raw
            files. Falls back to ``./dblp-ref``, ``../dblp-ref`` and a legacy
            absolute path.
        subset_path: Path of a precomputed subset file to prefer.

    Returns:
        networkx.DiGraph whose nodes are paper ids carrying ``title``,
        ``authors``, ``year``, ``venue`` and ``n_citation`` attributes.

    Raises:
        FileNotFoundError: If neither the subset file nor a dataset folder is
            found, or the folder contains no matching files.
    """
    candidates = [
        os.path.join(os.getcwd(), 'dblp-ref'),
        os.path.abspath(os.path.join(os.getcwd(), '..', 'dblp-ref')),
        # Machine-specific fallback, kept only for backward compatibility.
        # Prefer passing `dataset_folder` explicitly.
        '/Users/ankushchhabra/Downloads/dblp-ref',
    ]
    if dataset_folder is not None:
        candidates.insert(0, dataset_folder)

    # First existing directory wins.
    folder = next((c for c in candidates if os.path.isdir(c)), None)

    use_subset_file = os.path.isfile(subset_path)
    if not use_subset_file and folder is None:
        raise FileNotFoundError('No dataset found. Looked for subset file and dblp-ref folder.')

    def is_valid(paper):
        """Return True for a paper dict that passes the year/citation filter."""
        year = paper.get('year', 0)
        n_citation = paper.get('n_citation', 0)
        # Non-numeric values can never satisfy the filter.
        if not isinstance(year, (int, float)) or not isinstance(n_citation, (int, float)):
            return False
        return 2010 <= year <= 2015 and n_citation >= 60

    def iter_papers(path):
        """Yield each JSON object with a non-null 'id' from a JSON-lines file."""
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    paper = json.loads(line.strip())
                except json.JSONDecodeError:
                    # Blank or malformed line: skip it.
                    continue
                # A line can be valid JSON without being an object (e.g. `42`
                # or `[1, 2]`); such values have no paper fields to read.
                if isinstance(paper, dict) and paper.get('id') is not None:
                    yield paper

    if use_subset_file:
        print(f'Using subset file: {subset_path}')
        papers_source = list(iter_papers(subset_path))
    else:
        print(f'Using dblp-ref folder: {folder}')
        # Only regular files: a directory matching the name pattern would
        # otherwise make open() fail.
        files = sorted(
            f for f in os.listdir(folder)
            if f.startswith('dblp-ref') and f.endswith('.json')
            and os.path.isfile(os.path.join(folder, f))
        )
        if not files:
            # Same failure mode as extract_dataset instead of silently
            # returning an empty graph.
            raise FileNotFoundError(f'No dblp-ref-*.json files found in {folder}')
        papers_source = []
        for fname in files:
            file_path = os.path.join(folder, fname)
            print(f'Processing {file_path} ...')
            papers_source.extend(p for p in iter_papers(file_path) if is_valid(p))

    # Ids must be known before edges are added, because a reference may point
    # at a paper that appears later in the input.
    all_paper_ids = {paper['id'] for paper in papers_source}

    G = nx.DiGraph()
    for paper in papers_source:
        paper_id = paper['id']
        G.add_node(paper_id,
                   title=paper.get('title'),
                   authors=paper.get('authors'),
                   year=paper.get('year'),
                   venue=paper.get('venue'),
                   n_citation=paper.get('n_citation'))
        references = paper.get('references')
        if not isinstance(references, list):
            # Missing key, explicit null, or a malformed value: no citations.
            references = []
        for ref in references:
            if ref in all_paper_ids:
                G.add_edge(paper_id, ref)

    print(f'Total papers (nodes): {G.number_of_nodes()}')
    print(f'Total citations (edges): {G.number_of_edges()}')

    # Example: list all papers cited by a specific paper (if present)
    sample_paper = '2f9a0337-c299-496f-93c7-192cc071dbb8'
    if sample_paper in G:
        print('Cites:', list(G.successors(sample_paper)))

    return G

# Build the citation graph once; the statistics cell below reads the global `G`.
# With no arguments, construct_graph() uses dblp_subset.json if that file exists
# in the working directory, otherwise it scans the discovered dblp-ref folder.
G = construct_graph()

Using dblp-ref folder: /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-0.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-1.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-2.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-3.json ...
Total papers (nodes): 49572
Total citations (edges): 163309
Cites: ['67ee0b47-908f-4205-a35a-af3ed53cac4d', 'ea6c65c4-e688-4129-9cef-75c884099cc8']


In [6]:
# Overall size of the citation graph
num_vertices = G.number_of_nodes()
num_edges = G.number_of_edges()

# Weakly connected components: connectivity with edge direction ignored
wccs = list(nx.weakly_connected_components(G))
num_wcc = len(wccs)
largest_wcc_size = max(map(len, wccs), default=0)
smallest_wcc_size = min(map(len, wccs), default=0)

# Strongly connected components: every node reachable from every other
sccs = list(nx.strongly_connected_components(G))
num_scc = len(sccs)
largest_scc_size = max(map(len, sccs), default=0)
smallest_scc_size = min(map(len, sccs), default=0)

# Emit the report in one call, one entry per line
print(
    "Graph Statistics",
    "================",
    f"Number of vertices: {num_vertices}",
    f"Number of edges: {num_edges}",
    f"Number of weakly connected components (WCC): {num_wcc}",
    f"Number of strongly connected components (SCC): {num_scc}",
    f"Number of nodes in largest WCC: {largest_wcc_size}",
    f"Number of nodes in smallest WCC: {smallest_wcc_size}",
    f"Number of nodes in largest SCC: {largest_scc_size}",
    f"Number of nodes in smallest SCC: {smallest_scc_size}",
    sep="\n",
)

Graph Statistics
Number of vertices: 49572
Number of edges: 163309
Number of weakly connected components (WCC): 6985
Number of strongly connected components (SCC): 47731
Number of nodes in largest WCC: 41225
Number of nodes in smallest WCC: 1
Number of nodes in largest SCC: 171
Number of nodes in smallest SCC: 1


In [7]:
# Generate the filtered subset file (writes dblp_subset.json to the current working directory).
# Reads the raw dblp-ref-*.json files from a nearby `dblp-ref` folder and keeps only papers
# from 2010-2015 with n_citation >= 60.
# NOTE: this cell runs after the graph above was already built from the raw files, so
# construct_graph() will only pick up the subset file on a later run.
extract_dataset()

Using dataset folder: /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-0.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-1.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-1.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-2.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-2.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-3.json ...
Processing /Users/ankushchhabra/Downloads/Data Mining Assignment2/dblp-ref/dblp-ref-3.json ...
Total papers selected: 49572
Total papers selected: 49572
Filtered dataset saved as dblp_subset.json
Filtered dataset saved as dblp_subset.json
