Skip to content

Commit ec1a26f

Browse files
committed
feat: add dataset management scripts
Add scripts for benchmark dataset management: download scripts for SNAP, Pajek, and external datasets; conversion utilities for benchmark formats; catalog generation and archive URL management.
1 parent 7220c5e commit ec1a26f

14 files changed

Lines changed: 5715 additions & 0 deletions

scripts/add-all-datasets.ts

Lines changed: 554 additions & 0 deletions
Large diffs are not rendered by default.

scripts/add-external-datasets.ts

Lines changed: 543 additions & 0 deletions
Large diffs are not rendered by default.

scripts/add-pajek-datasets.ts

Lines changed: 730 additions & 0 deletions
Large diffs are not rendered by default.

scripts/add-snap-datasets.ts

Lines changed: 1266 additions & 0 deletions
Large diffs are not rendered by default.

scripts/archive-urls.ts

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/env npx tsx
2+
/**
3+
* Submit all dataset URLs to Wayback Machine for archiving.
4+
*
5+
* Uses the Wayback Machine Save Page Now API to archive URLs,
6+
* then outputs the archived URLs in id_ format for direct access.
7+
*/
8+
9+
import * as fs from "node:fs";
10+
import * as path from "node:path";
11+
12+
import {
13+
type ArchivedUrl,
14+
checkArchived,
15+
submitToArchive,
16+
} from "../src/utils/wayback.js";
17+
18+
// Input: dataset catalog (sources and datasets, each carrying a `url`), resolved relative to this script.
const CATALOG_PATH = path.join(import.meta.dirname, "../src/data/catalog.json");
// Output: JSON mapping of original URL -> Wayback Machine archive record.
const OUTPUT_PATH = path.join(import.meta.dirname, "../src/data/archived-urls.json");
20+
21+
/** Shape of the archived-urls.json output file written by this script. */
interface ArchiveResult {
	/** ISO-8601 timestamp of when this file was generated. */
	generated: string;
	/** Original URL -> Wayback Machine archive record. */
	urls: Record<string, ArchivedUrl>;
}
25+
26+
const main = async (): Promise<void> => {
27+
const catalog = JSON.parse(fs.readFileSync(CATALOG_PATH, "utf-8")) as {
28+
sources: Record<string, { url: string }>;
29+
datasets: Record<string, { url: string }>;
30+
};
31+
32+
const urls = new Set<string>();
33+
34+
for (const source of Object.values(catalog.sources)) {
35+
urls.add(source.url);
36+
}
37+
38+
for (const dataset of Object.values(catalog.datasets)) {
39+
urls.add(dataset.url);
40+
}
41+
42+
console.log(`Found ${urls.size} unique URLs to archive`);
43+
44+
const result: ArchiveResult = {
45+
generated: new Date().toISOString(),
46+
urls: {},
47+
};
48+
49+
let archived = 0;
50+
let alreadyArchived = 0;
51+
let failed = 0;
52+
53+
for (const url of urls) {
54+
console.log(`Processing: ${url}`);
55+
56+
// First check if already archived
57+
const existing = await checkArchived(url);
58+
if (existing) {
59+
console.log(` Already archived`);
60+
result.urls[url] = existing;
61+
alreadyArchived++;
62+
continue;
63+
}
64+
65+
// Submit for archiving
66+
console.log(` Submitting to Wayback Machine...`);
67+
const archiveResult = await submitToArchive(url);
68+
69+
if (archiveResult) {
70+
console.log(` Archived`);
71+
result.urls[url] = archiveResult;
72+
archived++;
73+
} else {
74+
console.log(` Failed to archive`);
75+
failed++;
76+
}
77+
}
78+
79+
fs.writeFileSync(OUTPUT_PATH, JSON.stringify(result, null, "\t") + "\n");
80+
81+
console.log(`\nDone!`);
82+
console.log(` Newly archived: ${archived}`);
83+
console.log(` Already archived: ${alreadyArchived}`);
84+
console.log(` Failed: ${failed}`);
85+
console.log(`\nResults saved to: ${OUTPUT_PATH}`);
86+
};
87+
88+
// Entry point: log any unhandled rejection and exit non-zero so callers
// (CI, shell scripts) can detect failure.
main().catch((error: unknown) => {
	console.error("Error:", error);
	process.exit(1);
});

0 commit comments

Comments
 (0)