In [4]:
from google.colab import files
import os

# Step 1 – upload the JSON file manually
uploaded = files.upload()  # opens a file-picker in Colab
# Step 2 – get its path (without loading contents)
path = list(uploaded.keys())[0]
path = os.path.abspath(path)

print(f"JSON file path saved in variable 'path': {path}")

Saving dataset0.jsonl to dataset0.jsonl
JSON file path saved in variable 'path': /content/dataset0.jsonl


In [5]:
import orjson, io

def iter_jsonl(path):
    """Yield one JSON object per line from a .jsonl file."""
    with io.open(path, "rb") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            yield orjson.loads(line)

def head_jsonl(path, k=3):
    """Return the first k JSON objects from the file."""
    out = []
    for i, ex in enumerate(iter_jsonl(path)):
        out.append(ex)
        if i + 1 >= k:
            break
    return out


In [10]:


# ---- Imports
import os, json, argparse, io
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
import orjson


# ---- Graph utilities
def build_graph(n, edges, directed=True):
    G = nx.DiGraph() if directed else nx.Graph()
    G.add_nodes_from(range(n))
    G.add_edges_from(edges)
    return G

def graph_features(example):
    n = example["n"]
    directed = bool(example.get("directed", True))
    edges = example["edges"]
    m = len(edges)
    expected_len = int(example["expected"]["length"])
    num_paths = len(example["expected"].get("paths", []))

    G = build_graph(n, edges, directed)

    # Density & connectivity
    if directed:
        density = m / (n * (n - 1)) if n > 1 else 0.0
        dag = nx.is_directed_acyclic_graph(G)
        scc_sizes = [len(c) for c in nx.strongly_connected_components(G)] if m else [1]*n
        n_scc = len(scc_sizes)
        largest_scc = max(scc_sizes) if scc_sizes else 0
        indeg = np.array([d for _, d in G.in_degree()])
        outdeg = np.array([d for _, d in G.out_degree()])
        deg_mean = float(outdeg.mean())
        deg_std = float(outdeg.std())
        indeg_mean = float(indeg.mean())
    else:
        density = (2*m) / (n * (n - 1)) if n > 1 else 0.0
        dag = False
        cc_sizes = [len(c) for c in nx.connected_components(G)] if m else [1]*n
        n_scc = len(cc_sizes)
        largest_scc = max(cc_sizes) if cc_sizes else 0
        deg = np.array([d for _, d in G.degree()])
        deg_mean = float(deg.mean())
        deg_std = float(deg.std())
        indeg_mean = None

    # Clustering coefficient
    try:
        clust = nx.average_clustering(G.to_undirected() if directed else G)
    except Exception:
        clust = np.nan

    return dict(
        id=example.get("id", ""),
        n=n, m=m, directed=directed, density=density,
        expected_len=expected_len, len_ratio=(expected_len / max(1, n - 1)),
        num_paths=num_paths, dag=dag, n_scc=n_scc, largest_scc=largest_scc,
        deg_mean=deg_mean, deg_std=deg_std, indeg_mean=indeg_mean, clustering=clust
    )

# ---- Plotting helpers
def save_hist(series, title, xlabel, outpath, bins=30):
    plt.figure()
    plt.hist(series.dropna().values, bins=bins)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()

def save_scatter(x, y, title, xlabel, ylabel, outpath):
    plt.figure()
    plt.scatter(x, y, s=10, alpha=0.7)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()

# ---- Main EDA routine
def main(data_path):
    outdir_fig = "figures"
    outdir_res = "results"
    os.makedirs(outdir_fig, exist_ok=True)
    os.makedirs(outdir_res, exist_ok=True)

    # Write sample
    head = list(head_jsonl(data_path, k=3))
    with open(os.path.join(outdir_res, "sample_examples.json"), "w") as f:
        json.dump(head, f, indent=2)

    # Extract features
    rows = []
    for ex in tqdm(iter_jsonl(data_path), desc="EDA"):
        try:
            rows.append(graph_features(ex))
        except Exception:
            continue

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(outdir_res, "per_graph_features.csv"), index=False)

    # Summary
    summary = {
        "dataset_size": int(df.shape[0]),
        "n_min": int(df["n"].min()), "n_med": float(df["n"].median()), "n_max": int(df["n"].max()),
        "|E|_min": int(df["m"].min()), "|E|_med": float(df["m"].median()), "|E|_max": int(df["m"].max()),
        "density_med": float(df["density"].median()),
        "expected_len_med": float(df["expected_len"].median()),
        "len_ratio_med": float(df["len_ratio"].median()),
        "%directed": float(100 * df["directed"].mean()),
        "%DAG (among directed)": float(100 * df.loc[df["directed"], "dag"].mean()) if df["directed"].any() else None,
        "median_largest_SCC": float(df["largest_scc"].median()),
        "clustering_med": float(df["clustering"].median(skipna=True)),
    }
    with open(os.path.join(outdir_res, "eda_stats.json"), "w") as f:
        json.dump(summary, f, indent=2)

    # Plots
    save_hist(df["n"], "Graph order n", "n", os.path.join(outdir_fig, "hist_n.png"))
    save_hist(df["m"], "Edge count |E|", "|E|", os.path.join(outdir_fig, "hist_m.png"))
    save_hist(df["density"], "Edge density", "density", os.path.join(outdir_fig, "hist_density.png"))
    save_hist(df["expected_len"], "Target: longest path length", "length", os.path.join(outdir_fig, "hist_expected_len.png"))
    save_hist(df["len_ratio"], "Target ratio: length / (n-1)", "ratio", os.path.join(outdir_fig, "hist_len_ratio.png"))
    save_hist(df["largest_scc"], "Largest SCC size", "size", os.path.join(outdir_fig, "hist_largest_scc.png"))
    save_scatter(df["n"], df["expected_len"], "n vs longest path length", "n", "longest path",
                 os.path.join(outdir_fig, "scatter_n_vs_len.png"))

    if df["directed"].nunique() > 1:
        plt.figure()
        df["directed"].value_counts().sort_index().plot(kind="bar")
        plt.title("Directed vs Undirected count")
        plt.xlabel("directed flag")
        plt.ylabel("count")
        plt.tight_layout()
        plt.savefig(os.path.join(outdir_fig, "bar_directed.png"), dpi=150)
        plt.close()

    print("✅ Wrote all results to:", outdir_res, "and figures to:", outdir_fig)

# ---- Example usage in Colab
# Replace with your actual file path in Google Colab (after upload)
path = "/content/dataset0.jsonl"  # <-- change this to your file
main(path)


EDA: 5000it [00:02, 1722.83it/s]


✅ Wrote all results to: results and figures to: figures


In [11]:
import argparse
import os
import random
import json
import networkx as nx
import matplotlib.pyplot as plt


def build_graph(n, edges, directed=True):
    G = nx.DiGraph() if directed else nx.Graph()
    G.add_nodes_from(range(n))
    G.add_edges_from(edges)
    return G


def draw_graph_with_path(ex, outpath, seed=7):
    n = ex["n"]
    directed = bool(ex.get("directed", True))
    edges = ex["edges"]
    paths = ex["expected"].get("paths", [])
    G = build_graph(n, edges, directed)
    pos = nx.spring_layout(G, seed=seed)

    plt.figure(figsize=(4, 4))
    nx.draw_networkx_nodes(G, pos, node_size=250)
    nx.draw_networkx_labels(G, pos, font_size=8)
    nx.draw_networkx_edges(G, pos, alpha=0.5, arrows=directed)

    if paths:
        p = paths[0]
        pe = list(zip(p[:-1], p[1:]))
        nx.draw_networkx_edges(G, pos, edgelist=pe, width=3)  # overlay
        plt.title(f"id={ex.get('id','')}, n={n}, L*={ex['expected']['length']}")
    else:
        plt.title(f"id={ex.get('id','')}, n={n}, L*={ex['expected']['length']} (no path listed)")

    plt.axis("off")
    plt.tight_layout()
    plt.savefig(outpath, dpi=150)
    plt.close()


def main():
   # Example: run inside Colab without CLI args
  path = "/content/dataset0.jsonl"  # your uploaded file
  os.makedirs("figures", exist_ok=True)

  # Load and visualize
  by_n = {}
  for ex in iter_jsonl(path):
      by_n.setdefault(ex["n"], []).append(ex)

  keys = sorted(by_n.keys())
  picks = []
  random.seed(42)
  step = max(1, len(keys) // 4)
  for kn in keys[::step]:
      picks.append(random.choice(by_n[kn]))

  for i, ex in enumerate(picks):
      draw_graph_with_path(ex, f"figures/graph_example_{i+1}.png")



if __name__ == "__main__":
    main()


In [15]:
import os, json
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import joblib

# --- Your CSV file path ---
csv_path = "/content/results/per_graph_features.csv"  # change to your actual filename
target   = "expected_len"                      # or "len_ratio"

# --- Helpers ---
def load_table(path):
    df = pd.read_csv(path)
    for c in df.columns:
        if df[c].dtype == bool:
            df[c] = df[c].astype(int)
    return df

def pick_features(df, target, drop_extra=("id",)):
    num = df.select_dtypes(include=[np.number]).columns.tolist()
    feats = [c for c in num if c != target]
    for d in drop_extra:
        if d in feats:
            feats.remove(d)
    return feats

def save_scatter(y_true, y_pred, outpath, title, xlabel, ylabel):
    plt.figure()
    plt.scatter(y_true, y_pred, s=12, alpha=0.7)
    lims = [min(y_true.min(), y_pred.min()), max(y_true.max(), y_pred.max())]
    plt.plot(lims, lims)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.tight_layout(); plt.savefig(outpath, dpi=150); plt.close()

def save_hist(data, outpath, title, xlabel):
    plt.figure()
    plt.hist(data, bins=30)
    plt.title(title); plt.xlabel(xlabel); plt.ylabel("count")
    plt.tight_layout(); plt.savefig(outpath, dpi=150); plt.close()

# --- Run training directly ---
os.makedirs("results", exist_ok=True)
os.makedirs("figures", exist_ok=True)

df = load_table(csv_path)
feats = pick_features(df, target)
X = df[feats].values
y = df[target].values
ids = df["id"].values if "id" in df.columns else np.arange(len(df))

Xtr, Xte, ytr, yte, idtr, idte = train_test_split(
    X, y, ids, test_size=0.2, random_state=42, shuffle=True
)

rf = RandomForestRegressor(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=42,
)
rf.fit(Xtr, ytr)
yhat = rf.predict(Xte)

mae  = float(mean_absolute_error(yte, yhat))
rmse = float(mean_squared_error(yte, yhat) ** 0.5)
r2   = float(r2_score(yte, yhat))

print("\n=== Baseline (RandomForest) ===")
print(f"target          : {target}")
print(f"num_features    : {len(feats)}")
print(f"MAE             : {mae:.4f}")
print(f"RMSE            : {rmse:.4f}")
print(f"R^2             : {r2:.4f}")

metrics = {"target": target, "num_features": len(feats),
           "MAE": mae, "RMSE": rmse, "R2": r2}
with open(f"results/baseline_rf_metrics_{target}.json", "w") as f:
    json.dump(metrics, f, indent=2)

joblib.dump(rf, f"results/baseline_rf_{target}.joblib")

out_pred = pd.DataFrame({
    "id": idte,
    "y_true": yte,
    "y_pred": yhat,
    "residual": yhat - yte,
})
out_pred.to_csv(f"results/predictions_{target}.csv", index=False)

save_scatter(yte, yhat,
             f"figures/scatter_true_vs_pred_{target}.png",
             title=f"True vs Predicted ({target})",
             xlabel="true", ylabel="predicted")

save_hist(out_pred["residual"].values,
          f"figures/hist_residuals_{target}.png",
          title=f"Residuals ({target})",
          xlabel="pred - true")

# --- Optional: feature importances ---
try:
    importances = rf.feature_importances_
    idx = np.argsort(importances)[::-1][:15]
    plt.figure(figsize=(6,4))
    plt.barh(range(len(idx)), importances[idx][::-1])
    plt.yticks(range(len(idx)), [feats[i] for i in idx][::-1])
    plt.title("Top feature importances (RF)")
    plt.tight_layout()
    plt.savefig(f"figures/bar_feature_importances_{target}.png", dpi=150)
    plt.close()
except Exception as e:
    print("Could not plot feature importances:", e)


=== Baseline (RandomForest) ===
target          : expected_len
num_features    : 13
MAE             : 0.0564
RMSE            : 0.1750
R^2             : 0.9985


In [17]:
# === Colab: quick dataset statistics for Longest Simple Path graphs ===
!pip -q install orjson networkx tqdm pandas

from google.colab import files
import io, orjson, os, json
from collections import Counter, defaultdict
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm


path = "/content/dataset0.jsonl"
print("Using file:", path)

# ---------- Helpers ----------
def iter_jsonl(path):
    with io.open(path, "rb") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            yield orjson.loads(line)

def build_graph(n, edges, directed=True):
    G = nx.DiGraph() if directed else nx.Graph()
    G.add_nodes_from(range(n))
    G.add_edges_from(edges)
    return G

# ---------- Pass 1: compute per-graph features (fast) ----------
rows = []
type_counter = Counter()
directed_counter = Counter()

for ex in tqdm(iter_jsonl(path), desc="Reading"):
    n = int(ex["n"])
    edges = ex["edges"]
    m = len(edges)
    directed = bool(ex.get("directed", True))
    gtype = ex.get("type", "Unknown")
    Lstar = int(ex["expected"]["length"])

    # density (no self-loops assumed)
    if directed:
        density = (m / (n*(n-1))) if n > 1 else 0.0
    else:
        density = ((2*m) / (n*(n-1))) if n > 1 else 0.0

    # normalized target
    denom = max(1, n-1)
    Lratio = Lstar / denom

    # Structural (lightweight)
    G = build_graph(n, edges, directed)
    if directed:
        is_dag = nx.is_directed_acyclic_graph(G)
        scc_sizes = [len(c) for c in nx.strongly_connected_components(G)] if m else [1]*n
        largest_scc = max(scc_sizes) if scc_sizes else 0
        indeg = [d for _, d in G.in_degree()]
        outdeg = [d for _, d in G.out_degree()]
        indeg_mean = float(np.mean(indeg)) if indeg else 0.0
        outdeg_mean = float(np.mean(outdeg)) if outdeg else 0.0
    else:
        is_dag = False
        cc_sizes = [len(c) for c in nx.connected_components(G)] if m else [1]*n
        largest_scc = max(cc_sizes) if cc_sizes else 0
        deg = [d for _, d in G.degree()]
        indeg_mean = None
        outdeg_mean = float(np.mean(deg)) if deg else 0.0

    # clustering (use undirected view to keep it stable)
    try:
        clustering = nx.average_clustering(G.to_undirected() if directed else G)
    except Exception:
        clustering = np.nan

    rows.append(dict(
        id=ex.get("id",""),
        type=gtype,
        directed=directed,
        n=n, m=m, density=density,
        expected_len=Lstar, len_ratio=Lratio,
        dag=is_dag,
        largest_scc=largest_scc,
        indeg_mean=indeg_mean,
        outdeg_mean=outdeg_mean,
        clustering=clustering,
    ))
    type_counter[gtype] += 1
    directed_counter["directed" if directed else "undirected"] += 1

df = pd.DataFrame(rows)
print(f"\nLoaded {len(df):,} graphs.")

# ---------- Basic stats ----------
def stat_triplet(series):
    return float(series.min()), float(series.median()), float(series.max())

n_min, n_med, n_max = stat_triplet(df["n"])
m_min, m_med, m_max = stat_triplet(df["m"])
dens_min, dens_med, dens_max = stat_triplet(df["density"])
L_min, L_med, L_max = stat_triplet(df["expected_len"])
R_min, R_med, R_max = stat_triplet(df["len_ratio"])
clust_med = float(df["clustering"].median(skipna=True))
largest_scc_med = float(df["largest_scc"].median())

pct_directed = 100.0 * (df["directed"].mean())
pct_dag_among_directed = float(
    100.0 * df.loc[df["directed"], "dag"].mean()
) if (df["directed"].any()) else None

# Per-n frequency (nice for table)
n_freq = df["n"].value_counts().sort_index().to_dict()

# ---------- Print summary ----------
print("\n=== DATASET SUMMARY ===")
print(f"Total graphs                  : {len(df):,}")
print("By type                       :", dict(type_counter))
print("Directed vs Undirected        :", dict(directed_counter))
print(f"% Directed                    : {pct_directed:.2f}%")
if pct_dag_among_directed is not None:
    print(f"% DAG (among directed)        : {pct_dag_among_directed:.2f}%")

print("\nGraph size n                  : min/med/max = "
      f"{n_min:.0f} / {n_med:.0f} / {n_max:.0f}")
print("Edge count |E|                : min/med/max = "
      f"{m_min:.0f} / {m_med:.0f} / {m_max:.0f}")
print("Density                        : min/med/max = "
      f"{dens_min:.3f} / {dens_med:.3f} / {dens_max:.3f}")
print("Largest SCC size (median)     : "
      f"{largest_scc_med:.2f}")
print("Avg clustering (median)       : "
      f"{clust_med:.3f}")

print("\nTarget L* (longest path len)  : min/med/max = "
      f"{L_min:.0f} / {L_med:.0f} / {L_max:.0f}")
print("Target ratio L*/(n-1)         : min/med/max = "
      f"{R_min:.3f} / {R_med:.3f} / {R_max:.3f}")

print("\nCounts per n:")
for k in sorted(n_freq.keys()):
    print(f"  n={k:>2}: {n_freq[k]:>5}")

# ---------- Save a machine-readable summary (for your PDF/table) ----------
os.makedirs("results", exist_ok=True)
summary = {
    "dataset_size": int(len(df)),
    "by_type": dict(type_counter),
    "directed_vs_undirected": dict(directed_counter),
    "pct_directed": pct_directed,
    "pct_dag_among_directed": pct_dag_among_directed,
    "n": {"min": n_min, "med": n_med, "max": n_max},
    "E": {"min": m_min, "med": m_med, "max": m_max},
    "density": {"min": dens_min, "med": dens_med, "max": dens_max},
    "largest_scc_median": largest_scc_med,
    "clustering_median": clust_med,
    "Lstar": {"min": L_min, "med": L_med, "max": L_max},
    "Lratio": {"min": R_min, "med": R_med, "max": R_max},
    "n_frequency": n_freq,
}
with open("results/eda_stats_quick.json", "w") as f:
    json.dump(summary, f, indent=2)
print("\nSaved: results/eda_stats_quick.json")


Using file: /content/dataset0.jsonl


Reading: 5000it [00:02, 1910.40it/s]


Loaded 5,000 graphs.

=== DATASET SUMMARY ===
Total graphs                  : 5,000
By type                       : {'Mixed': 4500, 'PureRandom': 500}
Directed vs Undirected        : {'directed': 5000}
% Directed                    : 100.00%
% DAG (among directed)        : 11.46%

Graph size n                  : min/med/max = 5 / 12 / 22
Edge count |E|                : min/med/max = 0 / 27 / 130
Density                        : min/med/max = 0.000 / 0.213 / 1.000
Largest SCC size (median)     : 7.00
Avg clustering (median)       : 0.376

Target L* (longest path len)  : min/med/max = 0 / 8 / 21
Target ratio L*/(n-1)         : min/med/max = 0.000 / 0.833 / 1.000

Counts per n:
  n= 5:   385
  n= 6:   364
  n= 7:   338
  n= 8:   362
  n= 9:   329
  n=10:   364
  n=11:   350
  n=12:   322
  n=13:   356
  n=14:   353
  n=15:   345
  n=16:   349
  n=17:   349
  n=18:   317
  n=19:    26
  n=20:    25
  n=21:    40
  n=22:    26

Saved: results/eda_stats_quick.json



