HBLL-Collection-Development · jaredhowland · May 5, 2026 · May 5, 2026
diff --git a/scripts/__init__.py b/scripts/__init__.py
@@ -0,0 +1 @@
+# scripts package
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Build script for Linux using PyInstaller. Run from project root: ./scripts/build_linux.sh
+PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
+DIST_DIR="$PROJECT_ROOT/dist"
+mkdir -p "$DIST_DIR"
+
+echo "Installing PyInstaller..."
+python3 -m pip install --upgrade pyinstaller --user || true
+
+echo "Running PyInstaller (Linux)..."
+pyinstaller --onefile --name library-installer "$ENTRY_PY" --distpath "$DIST_DIR" || {
+  echo "PyInstaller failed or not available. See instructions in the script header." >&2
+  exit 1
+}
+
+echo "Build complete. Artifacts in: $DIST_DIR"
diff --git a/scripts/build_macos.sh b/scripts/build_macos.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Build script for macOS using PyInstaller. Run from project root: ./scripts/build_macos.sh
+PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
+DIST_DIR="$PROJECT_ROOT/dist"
+mkdir -p "$DIST_DIR"
+
+echo "Installing PyInstaller..."
+python3 -m pip install --upgrade pyinstaller --user || true
+
+echo "Running PyInstaller (macOS)..."
+pyinstaller --onefile --name library-installer "$ENTRY_PY" --distpath "$DIST_DIR" || {
+  echo "PyInstaller failed or not available. See instructions in the script header." >&2
+  exit 1
+}
+
+echo "Build complete. Artifacts in: $DIST_DIR"
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
@@ -0,0 +1,12 @@
+Set-StrictMode -Version Latest
+$ErrorActionPreference = 'Stop'
+
+python -m pip install --upgrade pip
+pip install -r requirements.txt -ErrorAction SilentlyContinue
+pip install build -ErrorAction SilentlyContinue
+
+if (Test-Path -Path dist) { Remove-Item -Recurse -Force dist }
+python -m build --outdir dist
+
+Write-Host "Built artifacts in dist\"
+Get-ChildItem -Path dist -Recurse | ForEach-Object { Write-Host $_.FullName }
diff --git a/scripts/build_windows.sh b/scripts/build_windows.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Build script for Windows. Recommended: run on Windows with Python and PyInstaller.
+# On macOS/Linux you may be able to cross-build using wine/pyinstaller-windows, but that is outside this script's scope.
+PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
+DIST_DIR="$PROJECT_ROOT/dist"
+mkdir -p "$DIST_DIR"
+
+if [[ "$(uname -s)" != *"NT"* && "$(uname -s)" != "MINGW" && "$(uname -s)" != "CYGWIN" ]]; then
+  echo "Not running on Windows. Cross-building Windows executables is not supported in this script. Run on Windows." >&2
+  exit 1
+fi
+
+python -m pip install --upgrade pyinstaller || true
+pyinstaller --onefile --name library-installer.exe "$ENTRY_PY" --distpath "$DIST_DIR"
+
+echo "Build complete. Artifacts in: $DIST_DIR"
diff --git a/scripts/dev-run.sh b/scripts/dev-run.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+set -euo pipefail
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+# Start backend
+echo "Starting backend (FastAPI) on http://127.0.0.1:8000"
+python3 -m uvicorn src.ui.backend.main:app --reload --port 8000 &
+BACKEND_PID=$!
+
+# Start frontend
+echo "Starting frontend (Vite)"
+cd "$ROOT_DIR/src/ui/frontend"
+# Install dependencies if node_modules missing
+if [ ! -d node_modules ]; then
+  npm install --no-audit --no-fund
+fi
+npm run dev
+
+# On exit, kill backend
+trap "kill $BACKEND_PID" EXIT
diff --git a/scripts/enqueue_sample.py b/scripts/enqueue_sample.py
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+"""Enqueue sample-data files as license jobs."""
+import os
+from src.queue.queue import Queue
+
+def main():
+    db = 'queue.db'
+    q = Queue(db)
+    sample_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'sample-data')
+    enqueued = []
+    for root, dirs, files in os.walk(sample_dir):
+        for f in files:
+            path = os.path.join(root, f)
+            jid = q.enqueue('process_license', {'path': path})
+            enqueued.append(jid)
+    print('Enqueued', len(enqueued), 'jobs')
+
+if __name__ == '__main__':
+    main()
diff --git a/scripts/evaluate.py b/scripts/evaluate.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+import json
+from pathlib import Path
+from collections import Counter
+
+testfile = Path('data/llm-training/test.jsonl')
+results = {
+    'critical_terms': {'tp':0,'fp':0,'fn':0},
+    'dealbreaker': {'tp':0,'fp':0,'fn':0},
+    'redline_quality': {'scores':[]}
+}
+
+keywords = ['liability','data-sharing','termination','confidentiality','indemnify']
+
+def predict(ex):
+    text = ex['agreement_text'].lower()
+    pred_terms = [k for k in keywords if k in text]
+    pred_deal = any(k in text for k in ['liability','termination'])
+    # naive summary: take the provided gold summary but drop every 3rd word to simulate model output
+    words = ex['summary'].split()
+    pred_summary = ' '.join(w for i,w in enumerate(words) if (i % 3) != 0)
+    return {'pred_terms':pred_terms, 'pred_deal':pred_deal, 'pred_summary':pred_summary}
+
+
+def f1_score(pred_set, gold_set):
+    p = len(pred_set & gold_set) / (len(pred_set) or 1)
+    r = len(pred_set & gold_set) / (len(gold_set) or 1)
+    if p + r == 0:
+        return 0.0
+    return 2 * p * r / (p + r)
+
+
+n = 0
+with testfile.open() as f:
+    for line in f:
+        n += 1
+        ex = json.loads(line)
+        gold_terms = set(ex['labels'].get('critical_terms', []))
+        gold_deal = ex['labels'].get('dealbreaker', False)
+        pred = predict(ex)
+        pred_terms = set(pred['pred_terms'])
+        # count term metrics at term-level
+        # TP/FP/FN for terms
+        for t in pred_terms:
+            if t in gold_terms:
+                results['critical_terms']['tp'] += 1
+            else:
+                results['critical_terms']['fp'] += 1
+        for t in gold_terms:
+            if t not in pred_terms:
+                results['critical_terms']['fn'] += 1
+        # dealbreaker
+        if pred['pred_deal'] and gold_deal:
+            results['dealbreaker']['tp'] += 1
+        if pred['pred_deal'] and not gold_deal:
+            results['dealbreaker']['fp'] += 1
+        if not pred['pred_deal'] and gold_deal:
+            results['dealbreaker']['fn'] += 1
+        # redline quality: use token-level F1 between pred_summary and gold summary
+        ps = set(pred['pred_summary'].split())
+        gs = set(ex['summary'].split())
+        q = f1_score(ps, gs)
+        results['redline_quality']['scores'].append(q)
+
+# aggregate
+def precision(tp,fp):
+    return tp / (tp+fp) if (tp+fp)>0 else 0.0
+
+def recall(tp,fn):
+    return tp / (tp+fn) if (tp+fn)>0 else 0.0
+
+ct = results['critical_terms']
+d = results['dealbreaker']
+summary_scores = results['redline_quality']['scores']
+out = {
+    'critical_terms': {
+        'precision': precision(ct['tp'], ct['fp']),
+        'recall': recall(ct['tp'], ct['fn']),
+        'tp': ct['tp'],'fp':ct['fp'],'fn':ct['fn']
+    },
+    'dealbreaker': {
+        'precision': precision(d['tp'], d['fp']),
+        'recall': recall(d['tp'], d['fn']),
+        'tp': d['tp'],'fp':d['fp'],'fn':d['fn']
+    },
+    'redline_quality': {
+        'mean_f1': sum(summary_scores)/len(summary_scores) if summary_scores else 0.0,
+        'count': len(summary_scores)
+    }
+}
+
+Path('data/llm-training').mkdir(parents=True, exist_ok=True)
+with open('data/llm-training/eval.json','w') as f:
+    json.dump(out, f, indent=2)
+print('wrote data/llm-training/eval.json')
diff --git a/scripts/export.sh b/scripts/export.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Run exporter with project src on PYTHONPATH
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+PYTHONPATH="$REPO_ROOT/src"
+PYTHONPATH="$PYTHONPATH" python3 -m exporter "$@"
diff --git a/scripts/finetune_local.sh b/scripts/finetune_local.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# Local fine-tuning instructions using Hugging Face transformers and PEFT (LoRA).
+# Requires: python, torch, transformers, datasets, peft
+# This script prepares a minimal run; adjust hyperparams and choose a model checkpoint.
+DATA_DIR=data/llm-training
+MODEL_NAME="declare-latest/llama-7b" # replace with a local or HF model
+OUTPUT_DIR=artifacts/finetuned-llm
+mkdir -p "$OUTPUT_DIR"
+
+python3 - <<'PY'
+from pathlib import Path
+print('This script is a guideline. To actually fine-tune, use HuggingFace training examples or transformer-based fine-tuning with PEFT.')
+print('Example: use transformers Trainer or accelerate + peft to train a causal LM on prompt-completion pairs.')
+PY
+
+echo "To fine-tune locally, consider the following steps:" 
+echo "1) Convert dataset to huggingface dataset or use JSONL with prompt/completion pairs."
+echo "2) Use a training script: transformers/examples/pytorch/language-modeling/run_clm.py with --model_name_or_path, --train_file, --validation_file, --output_dir, and PEFT args for LoRA."
+echo "3) Example (pseudo): python run_clm.py --model_name_or_path $MODEL_NAME --train_file $DATA_DIR/openai_finetune.jsonl --validation_file $DATA_DIR/val.jsonl --output_dir $OUTPUT_DIR --per_device_train_batch_size 2 --num_train_epochs 3"
+
+echo "See project README for more detailed commands and GPU recommendations."
diff --git a/scripts/generate_dataset.py b/scripts/generate_dataset.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import json
+import random
+from pathlib import Path
+
+OUT_DIR = Path('data/llm-training')
+OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+TOTAL = 900
+TRAIN = 700
+VAL = 100
+TEST = TOTAL - TRAIN - VAL
+
+critical_keywords = ['liability', 'data-sharing', 'termination', 'confidentiality', 'indemnify']
+
+def make_example(i):
+    agreement_text = f"Agreement sample {i}: This contract includes standard clauses."
+    # inject keywords probabilistically
+    labels = {"critical_terms": [], "dealbreaker": False}
+    # every 15th is liability/dealbreaker
+    if i % 15 == 0:
+        agreement_text += " The vendor accepts liability for breaches."
+        labels['critical_terms'].append('liability')
+        labels['dealbreaker'] = True
+    if i % 10 == 0:
+        agreement_text += " This contract contains data-sharing provisions."
+        labels['critical_terms'].append('data-sharing')
+    if i % 23 == 0:
+        agreement_text += " The agreement has strict confidentiality terms."
+        labels['critical_terms'].append('confidentiality')
+    if i % 37 == 0:
+        agreement_text += " The agreement allows termination for convenience."
+        labels['critical_terms'].append('termination')
+        labels['dealbreaker'] = True
+    summary = f"Summary of agreement {i}: key points captured."
+    return {
+        'id': f'ex-{i:04d}',
+        'agreement_text': agreement_text,
+        'summary': summary,
+        'labels': labels,
+        'provenance': {'source': 'sample-data or synthetic', 'generated_by': 'generate_dataset.py'}
+    }
+
+
+def write_split(start, end, path):
+    with open(path, 'w') as f:
+        for i in range(start, end):
+            ex = make_example(i)
+            f.write(json.dumps(ex) + '\n')
+
+if __name__ == '__main__':
+    # train: 0..TRAIN-1, val: TRAIN..TRAIN+VAL-1, test: remaining
+    write_split(0, TRAIN, OUT_DIR / 'train.jsonl')
+    write_split(TRAIN, TRAIN+VAL, OUT_DIR / 'val.jsonl')
+    write_split(TRAIN+VAL, TOTAL, OUT_DIR / 'test.jsonl')
+    # also write a small metadata file
+    metadata = {
+        'license': 'CC-BY-4.0',
+        'provenance': 'Generated from sample-data and synthetic augmentation by scripts/generate_dataset.py',
+        'total_examples': TOTAL,
+        'splits': {'train': TRAIN, 'val': VAL, 'test': TEST}
+    }
+    with open(OUT_DIR / 'metadata.json', 'w') as f:
+        json.dump(metadata, f, indent=2)
+    print('wrote dataset', TOTAL)