Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# scripts package
18 changes: 18 additions & 0 deletions scripts/build_linux.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
# Build script for Linux using PyInstaller. Run from project root: ./scripts/build_linux.sh
#
# Produces a single-file executable named "library-installer" in <project>/dist
# from src/exporter/exporter.py.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
DIST_DIR="$PROJECT_ROOT/dist"
mkdir -p "$DIST_DIR"

echo "Installing PyInstaller..."
# The install stays best-effort (pip rejects --user inside a virtualenv), but a
# failure is reported instead of being silently discarded; the build step below
# still fails loudly if PyInstaller is genuinely unavailable.
python3 -m pip install --upgrade pyinstaller --user || \
  echo "Warning: could not install/upgrade PyInstaller; trying the existing installation." >&2

echo "Running PyInstaller (Linux)..."
# Run PyInstaller through the interpreter it was installed for, so the build
# does not depend on the user-level scripts directory being on PATH.
python3 -m PyInstaller --onefile --name library-installer "$ENTRY_PY" --distpath "$DIST_DIR" || {
  echo "PyInstaller failed or not available. See instructions in the script header." >&2
  exit 1
}

echo "Build complete. Artifacts in: $DIST_DIR"
18 changes: 18 additions & 0 deletions scripts/build_macos.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
# Build script for macOS using PyInstaller. Run from project root: ./scripts/build_macos.sh
#
# Produces a single-file executable named "library-installer" in <project>/dist
# from src/exporter/exporter.py.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
DIST_DIR="$PROJECT_ROOT/dist"
mkdir -p "$DIST_DIR"

echo "Installing PyInstaller..."
# The install stays best-effort (pip rejects --user inside a virtualenv), but a
# failure is reported instead of being silently discarded; the build step below
# still fails loudly if PyInstaller is genuinely unavailable.
python3 -m pip install --upgrade pyinstaller --user || \
  echo "Warning: could not install/upgrade PyInstaller; trying the existing installation." >&2

echo "Running PyInstaller (macOS)..."
# Run PyInstaller through the interpreter it was installed for, so the build
# does not depend on the user-level scripts directory being on PATH.
python3 -m PyInstaller --onefile --name library-installer "$ENTRY_PY" --distpath "$DIST_DIR" || {
  echo "PyInstaller failed or not available. See instructions in the script header." >&2
  exit 1
}

echo "Build complete. Artifacts in: $DIST_DIR"
12 changes: 12 additions & 0 deletions scripts/build_windows.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'

# Builds the Python distribution artifacts into .\dist using "python -m build".
#
# python/pip are native executables: PowerShell common parameters such as
# -ErrorAction are handed to them as literal arguments (which pip rejects as an
# unknown option), and a non-zero exit code does not raise a terminating error.
# Failures are therefore detected through $LASTEXITCODE.

python -m pip install --upgrade pip
if ($LASTEXITCODE -ne 0) { Write-Warning "pip self-upgrade failed (exit code $LASTEXITCODE); continuing." }

# Dependency installation is best-effort: a missing requirements.txt is fine,
# and an install failure is reported without aborting the build.
if (Test-Path -Path requirements.txt) {
    python -m pip install -r requirements.txt
    if ($LASTEXITCODE -ne 0) { Write-Warning "Installing requirements.txt failed (exit code $LASTEXITCODE); continuing." }
}
python -m pip install build
if ($LASTEXITCODE -ne 0) { Write-Warning "Installing the 'build' package failed (exit code $LASTEXITCODE); continuing." }

# Start from a clean output directory so stale artifacts are never listed.
if (Test-Path -Path dist) { Remove-Item -Recurse -Force dist }
python -m build --outdir dist
if ($LASTEXITCODE -ne 0) { throw "python -m build failed (exit code $LASTEXITCODE)" }

Write-Host "Built artifacts in dist\"
Get-ChildItem -Path dist -Recurse | ForEach-Object { Write-Host $_.FullName }
18 changes: 18 additions & 0 deletions scripts/build_windows.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
set -euo pipefail
# Build script for Windows. Recommended: run on Windows with Python and PyInstaller.
# On macOS/Linux you may be able to cross-build using wine/pyinstaller-windows, but that is outside this script's scope.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
ENTRY_PY="$PROJECT_ROOT/src/exporter/exporter.py"
DIST_DIR="$PROJECT_ROOT/dist"
mkdir -p "$DIST_DIR"

# Windows bash environments report kernel names with a version suffix
# (for example MINGW64_NT-10.0, MSYS_NT-10.0, CYGWIN_NT-10.0), so match by
# pattern rather than by exact string.
case "$(uname -s)" in
  *NT*|MINGW*|MSYS*|CYGWIN*)
    ;;
  *)
    echo "Not running on Windows. Cross-building Windows executables is not supported in this script. Run on Windows." >&2
    exit 1
    ;;
esac

# Installation is best-effort, but a failure is reported instead of being
# silently discarded; the build step below fails if PyInstaller is missing.
python -m pip install --upgrade pyinstaller || \
  echo "Warning: could not install/upgrade PyInstaller; trying the existing installation." >&2

# Run PyInstaller through the interpreter it was installed for, so the build
# does not depend on the Python scripts directory being on PATH.
python -m PyInstaller --onefile --name library-installer.exe "$ENTRY_PY" --distpath "$DIST_DIR"

echo "Build complete. Artifacts in: $DIST_DIR"
19 changes: 19 additions & 0 deletions scripts/dev-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
set -euo pipefail
# Start the FastAPI backend in the background and the Vite frontend in the
# foreground for local development. The backend is stopped when this script exits.
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"

# Start backend
echo "Starting backend (FastAPI) on http://127.0.0.1:8000"
# Launch from the project root so the "src.ui.backend.main" module path
# resolves no matter which directory the script was invoked from.
cd "$ROOT_DIR"
python3 -m uvicorn src.ui.backend.main:app --reload --port 8000 &
BACKEND_PID=$!

# On exit, kill backend. The trap must be registered before the blocking
# frontend command: otherwise an npm failure (set -e) or Ctrl-C would end the
# script before the trap exists and leave the backend running.
trap 'kill "$BACKEND_PID" 2>/dev/null || true' EXIT

# Start frontend
echo "Starting frontend (Vite)"
cd "$ROOT_DIR/src/ui/frontend"
# Install dependencies if node_modules missing
if [ ! -d node_modules ]; then
  npm install --no-audit --no-fund
fi
npm run dev
Comment on lines +4 to +19
19 changes: 19 additions & 0 deletions scripts/enqueue_sample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env python3
"""Enqueue sample-data files as license jobs."""
import os
from src.queue.queue import Queue

def main():
    """Enqueue one ``process_license`` job for every file under ``sample-data``.

    The ``sample-data`` directory is resolved relative to the parent of the
    directory containing this script. Jobs are stored through ``Queue`` backed
    by ``queue.db``, and the number of enqueued jobs is printed at the end.
    """
    job_queue = Queue('queue.db')
    scripts_dir = os.path.dirname(__file__)
    sample_dir = os.path.join(os.path.dirname(scripts_dir), 'sample-data')
    # One job per regular file, in the order os.walk yields them.
    job_ids = [
        job_queue.enqueue('process_license', {'path': os.path.join(folder, name)})
        for folder, _subdirs, names in os.walk(sample_dir)
        for name in names
    ]
    print('Enqueued', len(job_ids), 'jobs')

if __name__ == '__main__':
main()
95 changes: 95 additions & 0 deletions scripts/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python3
import json
from pathlib import Path
from collections import Counter

# JSONL file with the held-out examples to evaluate.
testfile = Path('data') / 'llm-training' / 'test.jsonl'

# Running tallies filled in while iterating over the test file:
# confusion counts for the two classification tasks and per-example
# F1 scores for the summary comparison.
results = {
    'critical_terms': dict(tp=0, fp=0, fn=0),
    'dealbreaker': dict(tp=0, fp=0, fn=0),
    'redline_quality': dict(scores=[]),
}

# Terms the keyword baseline looks for in the agreement text.
keywords = [
    'liability',
    'data-sharing',
    'termination',
    'confidentiality',
    'indemnify',
]


def predict(ex):
    """Produce baseline predictions for one example.

    ``ex`` must provide ``'agreement_text'`` and ``'summary'``. The result is a
    dict with:

    * ``'pred_terms'``: every entry of ``keywords`` that occurs in the
      lower-cased agreement text, in ``keywords`` order;
    * ``'pred_deal'``: True when the text mentions liability or termination;
    * ``'pred_summary'``: the gold summary with every third word (positions
      0, 3, 6, ...) removed, to simulate imperfect model output.
    """
    haystack = ex['agreement_text'].lower()

    found_terms = []
    for term in keywords:
        if term in haystack:
            found_terms.append(term)

    is_dealbreaker = 'liability' in haystack or 'termination' in haystack

    kept_words = []
    for position, token in enumerate(ex['summary'].split()):
        if position % 3:
            kept_words.append(token)

    return {
        'pred_terms': found_terms,
        'pred_deal': is_dealbreaker,
        'pred_summary': ' '.join(kept_words),
    }


def f1_score(pred_set, gold_set):
    """Return the set-overlap F1 between ``pred_set`` and ``gold_set``.

    Precision and recall are computed over set membership; an empty set is
    treated as having size one in the denominator so no division by zero
    occurs. Returns 0.0 when there is no overlap at all.
    """
    overlap = len(pred_set & gold_set)
    prec = overlap / max(len(pred_set), 1)
    rec = overlap / max(len(gold_set), 1)
    total = prec + rec
    if total == 0:
        return 0.0
    return 2 * prec * rec / total


# Number of examples evaluated.
n = 0
# Decode explicitly as UTF-8 so results do not depend on the platform locale.
with testfile.open(encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines (e.g. from hand-edited files) instead of
            # failing inside json.loads.
            continue
        n += 1
        ex = json.loads(line)
        gold_terms = set(ex['labels'].get('critical_terms', []))
        gold_deal = ex['labels'].get('dealbreaker', False)
        pred = predict(ex)
        pred_terms = set(pred['pred_terms'])
        # Term-level confusion counts: shared terms are true positives,
        # predicted-only terms false positives, gold-only terms false negatives.
        results['critical_terms']['tp'] += len(pred_terms & gold_terms)
        results['critical_terms']['fp'] += len(pred_terms - gold_terms)
        results['critical_terms']['fn'] += len(gold_terms - pred_terms)
        # dealbreaker: one binary decision per example (true negatives are not tracked)
        if pred['pred_deal'] and gold_deal:
            results['dealbreaker']['tp'] += 1
        elif pred['pred_deal']:
            results['dealbreaker']['fp'] += 1
        elif gold_deal:
            results['dealbreaker']['fn'] += 1
        # redline quality: F1 over the sets of distinct whitespace-separated
        # tokens in the predicted and gold summaries
        ps = set(pred['pred_summary'].split())
        gs = set(ex['summary'].split())
        q = f1_score(ps, gs)
        results['redline_quality']['scores'].append(q)

# aggregate
def precision(tp, fp):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0.0

def recall(tp, fn):
    """Return tp / (tp + fn), or 0.0 when there were no gold positives."""
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0.0

# Turn the raw tallies into the report that is written to eval.json.
ct = results['critical_terms']
d = results['dealbreaker']
summary_scores = results['redline_quality']['scores']

# Both classification tasks share the same report layout:
# precision, recall, then the raw confusion counts.
out = {
    task: {
        'precision': precision(counts['tp'], counts['fp']),
        'recall': recall(counts['tp'], counts['fn']),
        'tp': counts['tp'],
        'fp': counts['fp'],
        'fn': counts['fn'],
    }
    for task, counts in (('critical_terms', ct), ('dealbreaker', d))
}

# Mean of the per-example summary F1 scores (0.0 when nothing was scored).
if summary_scores:
    mean_f1 = sum(summary_scores) / len(summary_scores)
else:
    mean_f1 = 0.0
out['redline_quality'] = {'mean_f1': mean_f1, 'count': len(summary_scores)}

# Make sure the output directory exists, then write the report as JSON.
Path('data/llm-training').mkdir(parents=True, exist_ok=True)
with open('data/llm-training/eval.json', 'w') as f:
    f.write(json.dumps(out, indent=2))
print('wrote data/llm-training/eval.json')
6 changes: 6 additions & 0 deletions scripts/export.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/usr/bin/env bash
set -euo pipefail
# Launch the exporter module with the repository's src/ directory as
# PYTHONPATH; all command-line arguments are forwarded unchanged.
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
export PYTHONPATH="$REPO_ROOT/src"
python3 -m exporter "$@"
22 changes: 22 additions & 0 deletions scripts/finetune_local.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash
set -euo pipefail
# Guidance for fine-tuning locally with Hugging Face transformers and PEFT (LoRA).
# Needs: python, torch, transformers, datasets, peft
# Nothing is trained here: the script creates the output directory and prints
# the recommended steps. Adjust hyperparameters and pick a model checkpoint.
DATA_DIR=data/llm-training
MODEL_NAME="declare-latest/llama-7b" # replace with a local or HF model
OUTPUT_DIR=artifacts/finetuned-llm
mkdir -p "$OUTPUT_DIR"

python3 - <<'PY'
print(
    'This script is a guideline. To actually fine-tune, use HuggingFace training examples or transformer-based fine-tuning with PEFT.',
    'Example: use transformers Trainer or accelerate + peft to train a causal LM on prompt-completion pairs.',
    sep='\n',
)
PY

# One printf call emits each argument on its own line.
printf '%s\n' \
  "To fine-tune locally, consider the following steps:" \
  "1) Convert dataset to huggingface dataset or use JSONL with prompt/completion pairs." \
  "2) Use a training script: transformers/examples/pytorch/language-modeling/run_clm.py with --model_name_or_path, --train_file, --validation_file, --output_dir, and PEFT args for LoRA." \
  "3) Example (pseudo): python run_clm.py --model_name_or_path $MODEL_NAME --train_file $DATA_DIR/openai_finetune.jsonl --validation_file $DATA_DIR/val.jsonl --output_dir $OUTPUT_DIR --per_device_train_batch_size 2 --num_train_epochs 3" \
  "See project README for more detailed commands and GPU recommendations."
65 changes: 65 additions & 0 deletions scripts/generate_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
import json
import random
from pathlib import Path

# Destination directory for the generated splits; created on import.
OUT_DIR = Path('data') / 'llm-training'
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Split sizes: the test split receives whatever train and val leave over.
TOTAL, TRAIN, VAL = 900, 700, 100
TEST = TOTAL - TRAIN - VAL

critical_keywords = [
    'liability',
    'data-sharing',
    'termination',
    'confidentiality',
    'indemnify',
]

Comment on lines +2 to +15
def make_example(i):
    """Build the synthetic training example with index ``i``.

    Clauses are injected deterministically from ``i``: each rule below fires
    when ``i`` is a multiple of its divisor, appending its sentence to the
    agreement text and its term to ``labels['critical_terms']``. The liability
    and termination rules also mark the example as a dealbreaker.

    Returns a dict with the keys ``id``, ``agreement_text``, ``summary``,
    ``labels`` and ``provenance``.
    """
    # (divisor, sentence appended to the text, critical term, is dealbreaker)
    rules = (
        (15, " The vendor accepts liability for breaches.", 'liability', True),
        (10, " This contract contains data-sharing provisions.", 'data-sharing', False),
        (23, " The agreement has strict confidentiality terms.", 'confidentiality', False),
        (37, " The agreement allows termination for convenience.", 'termination', True),
    )
    fired = [rule for rule in rules if i % rule[0] == 0]

    base_text = f"Agreement sample {i}: This contract includes standard clauses."
    extra_text = ''.join(sentence for _, sentence, _, _ in fired)
    labels = {
        "critical_terms": [term for _, _, term, _ in fired],
        "dealbreaker": any(flag for _, _, _, flag in fired),
    }
    return {
        'id': f'ex-{i:04d}',
        'agreement_text': base_text + extra_text,
        'summary': f"Summary of agreement {i}: key points captured.",
        'labels': labels,
        'provenance': {'source': 'sample-data or synthetic', 'generated_by': 'generate_dataset.py'},
    }


def write_split(start, end, path):
    """Write examples ``start`` .. ``end - 1`` to ``path`` as JSON Lines.

    Each example comes from ``make_example`` and is serialised as one JSON
    object per line. An existing file at ``path`` is overwritten.
    """
    with open(path, 'w') as sink:
        sink.writelines(
            json.dumps(make_example(index)) + '\n'
            for index in range(start, end)
        )

if __name__ == '__main__':
    # Consecutive index ranges: train first, then val, then test with the rest.
    split_plan = (
        (0, TRAIN, 'train.jsonl'),
        (TRAIN, TRAIN + VAL, 'val.jsonl'),
        (TRAIN + VAL, TOTAL, 'test.jsonl'),
    )
    for first, stop, filename in split_plan:
        write_split(first, stop, OUT_DIR / filename)

    # Small metadata file describing licence, provenance and split sizes.
    metadata = {
        'license': 'CC-BY-4.0',
        'provenance': 'Generated from sample-data and synthetic augmentation by scripts/generate_dataset.py',
        'total_examples': TOTAL,
        'splits': {'train': TRAIN, 'val': VAL, 'test': TEST},
    }
    (OUT_DIR / 'metadata.json').write_text(json.dumps(metadata, indent=2))
    print('wrote dataset', TOTAL)
Loading
Loading