Skip to content

Commit 7274f77

Browse files
committed
feat: add format parsers and utilities
Add format conversion and utilities:
- GML parser for graph file format
- Wayback Machine archive URL utilities
- ReadableGraph interface for generic graph access
- Graph expander interface for traversal
1 parent 8d84282 commit 7274f77

21 files changed

Lines changed: 3220 additions & 0 deletions

src/formats/gml/fetch.ts

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
#!/usr/bin/env npx tsx
2+
/**
3+
* Dataset Fetcher - Download, extract, and convert graph datasets in memory.
4+
*
5+
* Supports fetching GML files from zip archives (like Newman's network data)
6+
* and converting them to the normalized JSON format.
7+
*
8+
* @example CLI usage:
9+
* ```bash
10+
* npx tsx src/formats/gml/fetch.ts https://websites.umich.edu/~mejn/netdata/karate.zip output.json
11+
* ```
12+
*
13+
* @example Programmatic usage:
14+
* ```typescript
15+
* import { fetchDataset } from 'graphbox/formats/gml';
16+
*
17+
* const graph = await fetchDataset('https://websites.umich.edu/~mejn/netdata/karate.zip', {
18+
* meta: { name: 'Karate Club', ... }
19+
* });
20+
* ```
21+
*/
22+
23+
import { unzipSync } from "fflate";
24+
25+
import type { GmlToJsonOptions } from "./parse";
26+
import { gmlToJson, parseGml } from "./parse";
27+
import type { GraphJson } from "./types";
28+
29+
/**
 * Settings accepted by `fetchDataset`.
 *
 * Inherits every conversion option from `GmlToJsonOptions` (the whole object
 * is forwarded to `gmlToJson`) and adds two knobs that only influence the
 * download and extraction steps.
 */
export interface FetchDatasetOptions extends GmlToJsonOptions {
	/**
	 * Replacement for the global `fetch`, e.g. a stub in tests or a polyfill
	 * in runtimes that lack one. When omitted, `globalThis.fetch` is used.
	 */
	fetch?: typeof globalThis.fetch;
	/**
	 * File-name suffix used to pick the entry out of the archive. The
	 * comparison is case-insensitive. When omitted, `.gml` is used.
	 */
	extension?: string;
}
38+
39+
/**
 * What `fetchDataset` hands back: the converted graph plus a few facts
 * about the download it was produced from.
 */
export interface FetchDatasetResult {
	/** Graph produced by parsing the extracted file and converting it to JSON. */
	graph: GraphJson;
	/** Name of the archive entry the graph was read from. */
	filename: string;
	/** Byte length of the downloaded archive. */
	archiveSize: number;
	/** Byte length of the extracted entry, measured before text decoding. */
	contentSize: number;
}
52+
53+
/**
54+
* Fetch a dataset from a URL, extract GML from zip, and convert to JSON.
55+
*
56+
* @param url - URL to a zip archive containing a GML file
57+
* @param options - Conversion options including metadata
58+
* @returns The converted graph and fetch metadata
59+
*
60+
* @example
61+
* ```typescript
62+
* const result = await fetchDataset(
63+
* 'https://websites.umich.edu/~mejn/netdata/karate.zip',
64+
* {
65+
* meta: {
66+
* name: "Zachary's Karate Club",
67+
* description: "Social network of friendships...",
68+
* source: "https://websites.umich.edu/~mejn/netdata/",
69+
* url: "https://websites.umich.edu/~mejn/netdata/karate.zip",
70+
* citation: { authors: ["W. W. Zachary"], title: "...", year: 1977 },
71+
* retrieved: "2024-01-17"
72+
* }
73+
* }
74+
* );
75+
* console.log(result.graph.nodes.length); // 34
76+
* ```
77+
*/
78+
export const fetchDataset = async (
79+
url: string,
80+
options: FetchDatasetOptions
81+
): Promise<FetchDatasetResult> => {
82+
const fetchFunction = options.fetch ?? globalThis.fetch;
83+
const extension = options.extension ?? ".gml";
84+
85+
// Fetch the zip archive
86+
const response = await fetchFunction(url);
87+
if (!response.ok) {
88+
throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
89+
}
90+
91+
const arrayBuffer = await response.arrayBuffer();
92+
const archiveSize = arrayBuffer.byteLength;
93+
94+
// Extract the zip in memory
95+
const zipData = new Uint8Array(arrayBuffer);
96+
const unzipped = unzipSync(zipData);
97+
98+
// Find the GML file
99+
const gmlFiles = Object.keys(unzipped).filter(name =>
100+
name.toLowerCase().endsWith(extension.toLowerCase())
101+
);
102+
103+
if (gmlFiles.length === 0) {
104+
const files = Object.keys(unzipped).join(", ");
105+
throw new Error(`No ${extension} file found in archive. Files: ${files}`);
106+
}
107+
108+
// Use the first GML file found
109+
const filename = gmlFiles[0];
110+
const gmlData = unzipped[filename];
111+
const contentSize = gmlData.byteLength;
112+
113+
// Decode the GML content
114+
const decoder = new TextDecoder("utf-8");
115+
const gmlContent = decoder.decode(gmlData);
116+
117+
// Parse and convert
118+
const document = parseGml(gmlContent);
119+
const graph = gmlToJson(document, options);
120+
121+
return {
122+
graph,
123+
filename,
124+
archiveSize,
125+
contentSize,
126+
};
127+
};
128+
129+
/**
 * Fetch multiple datasets in parallel.
 *
 * Every download is started immediately and the results come back in the
 * same order as the input. Because the calls are combined with `Promise.all`,
 * the returned promise rejects as soon as any single fetch rejects.
 *
 * The parameter is typed as a read-only list of read-only tuples so that
 * `as const` or frozen job lists are accepted; the input is never mutated.
 *
 * @param datasets - Array of [url, options] tuples
 * @returns Array of fetch results, index-aligned with `datasets`
 */
export const fetchDatasets = async (
	datasets: ReadonlyArray<readonly [string, FetchDatasetOptions]>
): Promise<FetchDatasetResult[]> => {
	return Promise.all(datasets.map(([url, options]) => fetchDataset(url, options)));
};
140+
141+
/**
 * CLI entry point - fetch dataset from URL and output JSON.
 *
 * Reads `<url> [output.json]` from `process.argv`. Progress and statistics go
 * to stderr so that stdout carries nothing but the JSON document when no
 * output path is given. Without a URL, usage help is printed and the process
 * exits with status 1.
 *
 * The metadata passed to `fetchDataset` is synthesized from the URL: the
 * dataset name is the last path segment with a trailing `.zip` removed, or
 * `"graph"` when that leaves nothing (e.g. the URL ends in `/`).
 */
const main = async (): Promise<void> => {
	const [url, outputPath] = process.argv.slice(2);

	if (url === undefined) {
		console.error("Usage: npx tsx src/formats/gml/fetch.ts <url> [output.json]");
		console.error("\nFetches a zip archive containing GML and outputs JSON.");
		console.error("\nExample:");
		console.error("  npx tsx src/formats/gml/fetch.ts https://websites.umich.edu/~mejn/netdata/karate.zip");
		process.exit(1);
	}

	console.error(`Fetching ${url}...`);

	// Extract name from URL for basic metadata.
	// `||` (not `??`) on purpose: `pop()` on a split result is always a string,
	// so the case that needs the fallback is the EMPTY string.
	const lastSegment = new URL(url).pathname.split("/").pop() ?? "";
	const basename = lastSegment.replace(/\.zip$/i, "") || "graph";

	const now = new Date();

	const result = await fetchDataset(url, {
		meta: {
			name: basename,
			description: `Graph fetched from ${url}`,
			source: url,
			url,
			citation: {
				authors: [],
				title: basename,
				year: now.getFullYear(),
			},
			// First ten characters of the ISO timestamp are the YYYY-MM-DD date.
			retrieved: now.toISOString().slice(0, 10),
		},
	});

	console.error(`Extracted: ${result.filename}`);
	console.error(`Archive size: ${(result.archiveSize / 1024).toFixed(1)} KB`);
	console.error(`Content size: ${(result.contentSize / 1024).toFixed(1)} KB`);
	console.error(`Nodes: ${result.graph.nodes.length}, Edges: ${result.graph.edges.length}`);

	const output = JSON.stringify(result.graph, null, "\t");

	if (outputPath) {
		// Loaded lazily so that merely importing this module never pulls in node:fs.
		const fs = await import("node:fs");
		fs.writeFileSync(outputPath, output + "\n");
		console.error(`Written to ${outputPath}`);
	} else {
		console.log(output);
	}
};
193+
194+
// Run CLI if executed directly.
//
// `import.meta.url` is a percent-encoded file URL, so it cannot be compared
// with a hand-built `file://${path}` string: any path containing spaces,
// non-ASCII characters, or a Windows drive letter would never match and the
// CLI would silently do nothing. `pathToFileURL` produces the encoded form.
//
// `node:url` is imported dynamically, and only when `process` exists, so
// importing this module as a library never requires Node built-ins.
const entryScript = typeof process === "undefined" ? undefined : process.argv[1];

if (entryScript !== undefined) {
	void import("node:url")
		.then(
			({ pathToFileURL }) => import.meta.url === pathToFileURL(entryScript).href,
			// No node:url available means this is not a Node CLI invocation.
			() => false
		)
		.then(isEntryPoint => (isEntryPoint ? main() : undefined))
		.catch((error: unknown) => {
			console.error("Error:", error);
			process.exit(1);
		});
}

src/formats/gml/index.ts

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/**
2+
* GML (Graph Modelling Language) format support.
3+
*
4+
* Provides parsing and serialization for the GML graph format,
5+
* commonly used for network analysis datasets.
6+
*
7+
* @example
8+
* ```typescript
9+
* import { parseGml, gmlToJson, serializeGml } from 'graphbox';
10+
*
11+
* // Parse GML to JSON
12+
* const doc = parseGml(gmlContent);
13+
* const json = gmlToJson(doc, { meta: { ... } });
14+
*
15+
* // Serialize JSON to GML
16+
* const gml = serializeGml(json);
17+
* ```
18+
*/
19+
20+
// Data model shared by the parser, serializer, and fetcher
export type {
	Citation,
	GmlDocument,
	GmlEdge,
	GmlNode,
	GraphEdge,
	GraphJson,
	GraphMeta,
	GraphNode,
} from "./types";

// GML text -> document -> JSON
export type { GmlToJsonOptions } from "./parse";
export { gmlToJson, parseGml } from "./parse";

// JSON -> GML text
export type { SerializeGmlOptions } from "./serialize";
export { serializeGml } from "./serialize";

// Download a zipped dataset and convert it
export type { FetchDatasetOptions, FetchDatasetResult } from "./fetch";
export { fetchDataset, fetchDatasets } from "./fetch";

0 commit comments

Comments
 (0)