In [83]:
import os
import json
import pandas as pd
import networkx as nx
from typing import Dict, List, Tuple, Optional


In [84]:
# Reuse your existing code for parsing the .dpd file:
def parseDpdFile(filePath: str) -> Tuple[Dict[str, Dict[str, str]], List[Tuple[str, str]]]:
    """Read a .dpd file and collect its node and edge records.

    Every line is stripped of surrounding whitespace and then classified by
    its prefix:

    * ``N:`` lines are split on whitespace into at most four fields; field 1
      is the node id, field 2 (with surrounding double quotes removed) is the
      label, and the remainder of the line, if any, is stored verbatim as the
      attribute string.
    * ``E:`` lines are split on whitespace; fields 1 and 2 are the source and
      target node ids. Anything after them is ignored.
    * All other lines are skipped.

    Args:
        filePath: Path of the file to read (opened as UTF-8 text).

    Returns:
        A ``(nodes, edges)`` pair: ``nodes`` maps node id to a dict with the
        keys ``"label"`` and ``"attributes"``; ``edges`` is a list of
        ``(source, target)`` tuples in file order.

    Raises:
        IndexError: If an ``N:`` or ``E:`` line has fewer than three fields.
    """
    node_table: Dict[str, Dict[str, str]] = {}
    edge_list: List[Tuple[str, str]] = []

    with open(filePath, 'r', encoding='utf-8') as handle:
        for record in map(str.strip, handle):
            if record.startswith("N:"):
                # maxsplit=3 keeps the whole attribute tail in one piece.
                fields = record.split(maxsplit=3)
                tail = fields[3] if len(fields) > 3 else ""
                node_table[fields[1]] = {
                    "label": fields[2].strip('"'),
                    "attributes": tail,
                }
                continue
            if record.startswith("E:"):
                fields = record.split()
                edge_list.append((fields[1], fields[2]))

    return node_table, edge_list

def createGraphFromDpd(filePath: str) -> nx.DiGraph:
    """Build a directed graph from the contents of a .dpd file.

    The file is parsed with ``parseDpdFile``; each parsed node becomes a graph
    node whose attributes are the entries of its parsed dict, and each parsed
    ``(source, target)`` pair becomes a directed edge.

    Args:
        filePath: Path of the .dpd file to load.

    Returns:
        The populated ``nx.DiGraph``.
    """
    parsed_nodes, parsed_edges = parseDpdFile(filePath)
    graph = nx.DiGraph()
    # (node id, attribute dict) pairs are added together with their attributes.
    graph.add_nodes_from(parsed_nodes.items())
    graph.add_edges_from(parsed_edges)
    return graph

# Helpers from your sort_dataset.py
def get_second_word(text: str) -> str:
    """Return the second whitespace-separated token of ``text`` without colons.

    Every ``':'`` character is removed from the token. An empty string is
    returned when ``text`` is falsy or contains fewer than two tokens.
    """
    words = text.split() if text else []
    return words[1].replace(':', '') if len(words) > 1 else ""


def parse_attributes(attr_str: str) -> dict:
    """Parse an attribute string such as ``[level=1, something=foo];`` into a dict.

    Leading and trailing ``[``, ``]`` and ``;`` characters are removed, the
    remainder is split on ``", "``, and every piece containing ``=`` becomes
    one entry. Pieces without ``=`` (including the empty piece left by a
    trailing ``", "``) are ignored.

    Args:
        attr_str: The raw attribute string.

    Returns:
        A dict mapping each whitespace-stripped key to its value; values are
        whitespace-stripped and then stripped of surrounding double quotes.
        If a key occurs more than once, the last occurrence wins.
    """
    attr_dict = {}
    for pair in attr_str.strip("[];").split(", "):
        # Split on the FIRST '=' only, so a value that itself contains '='
        # (e.g. expr="a=b") is kept whole instead of raising ValueError on
        # tuple unpacking.
        key, sep, value = pair.partition("=")
        if sep:
            attr_dict[key.strip()] = value.strip().strip('"')
    return attr_dict

In [85]:

# 1) Input / output locations (absolute paths on the authoring machine).
dpdFilePath = r'C:\Users\User\Documents\GitHub\autoformalization\src\dependency_graph\EverythingLF.dpd'

# Single-file dataset JSON from step (1)
single_file_json_path = r'C:\Users\User\Documents\GitHub\autoformalization\src\dataset\processed_data\coq_proofs_dataset_single.json'

# Where to store final merged result (only defined here; nothing is written in this cell)
output_file = r'C:\Users\User\Documents\GitHub\autoformalization\src\dataset\processed_data\df_single.json'

# 2) Build graph from .dpd file
G = createGraphFromDpd(dpdFilePath)

# 3) Load single-file dataset
with open(single_file_json_path, "r", encoding='utf-8') as f:
    json_data = json.load(f)

# We expect json_data to be a *list* with a single element
# That element has "fileName" = "EverythingLF.v", and an "items" array
# We'll flatten it to a DataFrame: one record per item, tagged with its file name.
# Missing "fileName" / "type" / "raw" keys fall back to "", missing "items" to [].
data_records = []
for file_entry in json_data:
    file_name = file_entry.get("fileName", "")
    for item in file_entry.get("items", []):
        data_records.append({
            "fileName": file_name,
            "type": item.get("type", ""),
            "raw": item.get("raw", ""),
        })

# Name the columns explicitly: with zero records a bare pd.DataFrame([]) has no
# "raw" column, and the Label step below would fail with a KeyError.
df = pd.DataFrame(data_records, columns=["fileName", "type", "raw"])

# 4) Create a "Label" column (2nd word of each block)
df["Label"] = df["raw"].apply(get_second_word)

df.head(200)



Unnamed: 0,fileName,type,raw,Label
0,EverythingLF.v,Require,Require Stdlib.Arith.Arith.,Stdlib.Arith.Arith.
1,EverythingLF.v,Require,Require Stdlib.Lists.List.,Stdlib.Lists.List.
2,EverythingLF.v,Require,Require Corelib.Init.Nat.,Corelib.Init.Nat.
3,EverythingLF.v,Require,Require Stdlib.Strings.String.,Stdlib.Strings.String.
4,EverythingLF.v,Require,Require Corelib.Setoids.Setoid.,Corelib.Setoids.Setoid.
...,...,...,...,...
195,EverythingLF.v,Module,Module Export LF.,Export
196,EverythingLF.v,Module,Module Induction.,Induction.
197,EverythingLF.v,Import,Export LF.Basics.,LF.Basics.
198,EverythingLF.v,Theorem,"Theorem add_0_r : forall n:nat, n + 0 = n.\nPr...",add_0_r


In [88]:
# One record per graph node: the node id and the "label" attribute stored on it.
# The raw "attributes" string is not used for this table, so it is no longer
# parsed here (the parsed result was previously computed per node and discarded).
expanded_data = []
for node, attributes in G.nodes(data=True):
    expanded_data.append({"Node": node, "Label": attributes["label"]})

# Explicit columns keep the frame's shape stable even when the graph has no nodes.
df_dpd = pd.DataFrame(expanded_data, columns=["Node", "Label"])

df_dpd.head(200)


Unnamed: 0,Node,Label
0,644,All
1,643,All_In
2,1072,Collatz_holds_for_12
3,1074,Collatz_holds_for_ind
4,1073,Collatz_holds_for_sind
...,...,...
195,467,app_length_S
196,867,app_ne
197,1625,app_nil_r
198,464,app_nil_r
