# Silent Eight — End-to-End Demo

This notebook can run the full project end-to-end.

Colab mode: you can upload **only this notebook** and it will download the full repo ZIP from Google Drive.

Pipeline:
1) Download repo (Colab only)
2) Install dependencies
3) Generate demo data + SQLite DB
4) Preprocess + build keys/LSH
5) Train model
6) Offline matching + evaluation
7) Start FastAPI + call `/resolve`


## 0) Setup

If `src/` and `requirements.txt` are not present in the current working directory, the next cell (on Colab) will download and unpack the repo from Google Drive.


In [1]:
from __future__ import annotations

import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import List, Optional


def running_in_colab() -> bool:
    """Report whether this notebook is executing inside Google Colab.

    Detection works by attempting to import ``google.colab``; any exception
    raised by that import is treated as "not on Colab".
    """
    try:
        import google.colab  # type: ignore
    except Exception:
        return False
    else:
        return True


def run_and_show(cmd: List[str], title: Optional[str] = None) -> None:
    """Run a command to completion, echo its captured output, then fail loudly.

    Args:
        cmd: Program and arguments, handed to ``subprocess.run`` without a shell.
        title: Optional banner. When given, the banner and the command line are
            printed before the command runs.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status. The captured stdout/stderr are printed before this is raised.
    """
    if title:
        print("\n" + "=" * 80)
        print(title)
        print("$", " ".join(cmd))
        print("=" * 80)
    # errors="replace": bytes that are invalid in the locale encoding become
    # U+FFFD instead of raising UnicodeDecodeError and hiding all of the output.
    result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print("[stderr]\n" + result.stderr)
    result.check_returncode()


def find_repo_root(base: Path) -> Optional[Path]:
    """Locate the project root somewhere beneath ``base``.

    The root is taken to be the directory that contains ``src/api.py``. The
    tree under ``base`` is searched recursively and the first hit wins.

    Returns:
        The directory holding ``src/``, or ``None`` when no ``src/api.py``
        is found.
    """
    first_hit = next(base.glob("**/src/api.py"), None)
    if first_hit is None:
        return None
    # <root>/src/api.py: parents[0] is src/, parents[1] is <root>.
    return first_hit.parents[1]


def download_drive_zip(file_id: str, dest: Path) -> None:
    """Download a shared Google Drive file to ``dest`` using ``gdown``.

    ``gdown`` is pip-installed into the current interpreter on every call
    before it is imported.

    Args:
        file_id: Google Drive file ID of the archive.
        dest: Target path; its parent directory is created if missing.

    Raises:
        RuntimeError: If no non-empty file exists at ``dest`` after the download.
        subprocess.CalledProcessError: If installing ``gdown`` fails
            (propagated from ``run_and_show``).
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    # Remove a leftover from an earlier attempt. Otherwise the existence/size
    # check below could pass on a stale file even though this download failed.
    if dest.is_file():
        dest.unlink()
    run_and_show([sys.executable, "-m", "pip", "install", "-q", "gdown"], title="Install gdown")
    import gdown  # type: ignore
    url = f"https://drive.google.com/uc?id={file_id}"
    print("Downloading:", url)
    gdown.download(url, str(dest), quiet=False, fuzzy=True)
    if not dest.exists() or dest.stat().st_size == 0:
        raise RuntimeError(
            "Download failed or produced an empty file. "
            "Check Drive sharing: 'Anyone with the link' and downloads allowed."
        )


# Google Drive file ID of the repo archive, and where the ZIP is stored on Colab.
DRIVE_FILE_ID = "12ZzXx72imAvG5wTrzwbJwDYji3Eyvzf-"
ZIP_PATH = Path("/content/silent-eight-assignment.zip")


if not running_in_colab():
    # Local run: when the kernel starts inside notebooks/, step up one level
    # so that relative paths and `python -m src.*` resolve from the parent.
    cwd = Path.cwd()
    if cwd.name == "notebooks":
        os.chdir(cwd.parent)
else:
    content_root = Path("/content")
    # Prefer a repo that is already present: look under the current directory
    # first, then under /content.
    repo_root = find_repo_root(Path.cwd())
    if repo_root is None:
        repo_root = find_repo_root(content_root)
    if repo_root is None:
        # Nothing on disk yet: fetch the archive and unpack it into /content.
        download_drive_zip(DRIVE_FILE_ID, ZIP_PATH)
        print("Unpacking ZIP to /content ...")
        shutil.unpack_archive(str(ZIP_PATH), str(content_root))
        repo_root = find_repo_root(content_root)
        if repo_root is None:
            raise RuntimeError("Could not locate repo root under /content after unpack.")
    os.chdir(repo_root)


# Whatever directory we ended up in is treated as the project root from here on.
project_root = Path.cwd()
print("Project root:", project_root)
print("Python:", sys.executable)
# Make `import src...` work from inside the notebook kernel as well.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))


Project root: c:\repos\silent-eight-assignment
Python: c:\Users\kugor\anaconda3\envs\silent-eight\python.exe


## 1) Install dependencies

On Colab this installs `requirements.txt` from the downloaded repo.


In [2]:
# Dependencies are installed only on Colab; a local run prints a skip message
# and installs nothing.
if not running_in_colab():
    print("Skipping install (not running on Colab).")
else:
    install_cmd = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    run_and_show(install_cmd, title="Install dependencies from requirements.txt")


Skipping install (not running on Colab).


## 2) Generate demo data + SQLite database

This writes outputs under `data/`.


In [3]:
# Run the data generator as a module, then preview the CSV it is expected to write.
run_and_show([sys.executable, "-m", "src.generation"], title="Generate demo data + SQLite DB")

import pandas as pd

messy_csv = project_root / "data" / "messy_data.csv"
if not messy_csv.exists():
    print("No data/messy_data.csv found")
else:
    # messy_df is reused by the /resolve example cells further down.
    messy_df = pd.read_csv(messy_csv)
    print("\nGenerated data preview (data/messy_data.csv):")
    display(messy_df.head(10))



Generate demo data + SQLite DB
$ c:\Users\kugor\anaconda3\envs\silent-eight\python.exe -m src.generation
Generating Ground Truth Entities (with 5% Doppelgangers)...
Generating Messy Observations...
------------------------------
Generated 2554 records.
Sample Data (First 10 rows):
  first_name last_name  ...           phone_number                         address
0       Alex   Duczmal  ...        +48 881 819 600           ulica Cyprysowa 08/38
1       Alex   Duczmal  ...        +48 881 819 600           ulica Cyprysowa 08/38
2       Alxe   Duczmal  ...            602 099 499           ulica Cyprysowa 08/38
3       Mark   Johnson  ...  001-218-519-6001x3389  79402 Peterson Drives Apt. 511
4       Mark   Johnson  ...                   None  79402 Peterson Drives Apt. 511
5     Marcus    Dowerg  ...       +49(0)1338908386                 Albersallee 565
6     Marcus    Dowerg  ...       +49(0)1338908386                 Albersallee 565
7         M.    Dowerg  ...       +49(0)1338908386   

Unnamed: 0,entity_id,first_name,last_name,dob,email,country,national_id,phone_number,address,city,record_id
0,6702910e-c2d9-4780-a278-cc7680c1cc67,Alex,Duczmal,1951-05-22,alex.duczmal@yahoo.com,PL,10320843322,+48 881 819 600,ulica Cyprysowa 08/38,Szczecinek,fab58e7f-1685-4873-a346-4ead834b11c2
1,6702910e-c2d9-4780-a278-cc7680c1cc67,Alex,Duczmal,22/05/1951,alex.duczmal@yahoo.com,PL,10320843322,+48 881 819 600,ulica Cyprysowa 08/38,Szczecinek,c616d850-0e37-4d1f-8724-109c78bfb765
2,6702910e-c2d9-4780-a278-cc7680c1cc67,Alxe,Duczmal,1951-05-22,alex.duczmal@yahoo.com,PL,10320843322,602 099 499,ulica Cyprysowa 08/38,Szczecinek,64aeacd1-23e0-4f83-8ace-d1ef32d4f9b5
3,d2c390a6-d7c7-4e38-8715-fa1f1e14834b,Mark,Johnson,1993-09-26,mark.johnson@gmail.com,US,251-29-2287,001-218-519-6001x3389,79402 Peterson Drives Apt. 511,Davisstad,7e85831a-97ae-463b-b3a3-28dfc8c84d8a
4,d2c390a6-d7c7-4e38-8715-fa1f1e14834b,Mark,Johnson,1993-09-26,,US,251-29-2287,,79402 Peterson Drives Apt. 511,Davisstad,9e489768-9f0b-4bd6-b0ab-fb30c33fbde5
5,9ebc3460-3b29-40e9-b9f0-2aef74c217cc,Marcus,Dowerg,1960-01-06,ioannis32@example.net,DE,81221276B960,+49(0)1338908386,Albersallee 565,Fürstenfeldbruck,7f8f56bc-4dff-4312-9fca-8f40a9ed8eef
6,9ebc3460-3b29-40e9-b9f0-2aef74c217cc,Marcus,Dowerg,1960-01-06,ioannis32@example.net,DE,81221276B960,+49(0)1338908386,Albersallee 565,Fürstenfeldbruck,d66baeaf-1343-4fec-bc69-0cbd69191ae7
7,9ebc3460-3b29-40e9-b9f0-2aef74c217cc,M.,Dowerg,1960-01-06,,DE,81221276B960,+49(0)1338908386,Albersallee 565,Fürsaenfeldbruck,80f4b67f-a168-4a0d-83e2-6faf3b11a10a
8,a21cabd0-411f-4599-a9ab-8aefb83fff1e,Larry,Roman,1975-11-13,yherrera@example.org,US,566-38-5926,001-831-603-4131,55341 Amanda Gardens Apt. 764,Lake Mark,13c4306d-4f96-4bf4-b902-7fa73d9abc16
9,9564a5c2-de15-4a77-80e4-4a1a96cbe3c6,Kamil,Szkopek,1969-06-10,kamil.szkopek@outlook.com,PL,47111502654,732 351 161,pl. Makowa 78,Wałbrzych,856d6a35-42cb-4015-91c4-ee5efed84f81


## 3) Preprocess + build LSH artifacts

Creates normalized columns, blocking keys, DB indexes, and (optionally) LSH artifacts under `models/`.


In [4]:
# Run preprocessing as a module, then preview a few rows of the processed table.
run_and_show([sys.executable, "-m", "src.preprocessing"], title="Preprocess + build blocking keys/LSH")

import sqlite3
from contextlib import closing

import pandas as pd

db_path = project_root / "data" / "clients.db"
if db_path.exists():
    # A sqlite3 connection used directly as a context manager only commits or
    # rolls back; it does not close. closing() releases the handle on exit, so
    # the DB file is not left open by the kernel.
    with closing(sqlite3.connect(db_path)) as conn:
        try:
            df = pd.read_sql_query("SELECT * FROM clients_processed LIMIT 5", conn)
            print("\nProcessed table preview (clients_processed):")
            display(df)
        except Exception as e:
            # Best-effort preview: report the problem without stopping the demo.
            print("Could not read clients_processed preview:", e)
else:
    print("No data/clients.db found after preprocessing.")



Preprocess + build blocking keys/LSH
$ c:\Users\kugor\anaconda3\envs\silent-eight\python.exe -m src.preprocessing
Loading raw data...
Normalizing and generating keys...
  > Generating MinHash signatures...
  > Generating ID keys...
  > Generating Phone keys...
  > Generating Email keys...
  > Generating Initial+DOB keys...

--- Date Parsing Check ---
          dob norm_dob_year
0  1951-05-22          1951
1  22/05/1951          1951
2  1951-05-22          1951
3  1993-09-26          1993
4  1993-09-26          1993
5  1960-01-06          1960
6  1960-01-06          1960
7  1960-01-06          1960
8  1975-11-13          1975
9  1969-06-10          1969

--- Blocking Keys Check ---
        bk_nid  ...               bk_email
0  10320843322  ...  alexduczmal@yahoo.com
1  10320843322  ...  alexduczmal@yahoo.com
2  10320843322  ...  alexduczmal@yahoo.com
3    251292287  ...  markjohnson@gmail.com
4    251292287  ...                   None

[5 rows x 4 columns]

--- Blocking Key Coverage --

Unnamed: 0,entity_id,first_name,last_name,dob,email,country,national_id,phone_number,address,city,...,norm_address,norm_city,norm_dob,norm_dob_year,bk_minhash,bk_nid,bk_phone,bk_email,bk_initial_dob,key_count
0,6702910e-c2d9-4780-a278-cc7680c1cc67,Alex,Duczmal,1951-05-22,alex.duczmal@yahoo.com,PL,10320843322,+48 881 819 600,ulica Cyprysowa 08/38,Szczecinek,...,ulica cyprysowa 0838,szczecinek,1951-05-22,1951,"[86265554, 883542472, 315088925, 71200896, 117...",10320843322,819600.0,alexduczmal@yahoo.com,a|duczmal|1951-05-22,4
1,6702910e-c2d9-4780-a278-cc7680c1cc67,Alex,Duczmal,22/05/1951,alex.duczmal@yahoo.com,PL,10320843322,+48 881 819 600,ulica Cyprysowa 08/38,Szczecinek,...,ulica cyprysowa 0838,szczecinek,1951-05-22,1951,"[86265554, 883542472, 315088925, 71200896, 117...",10320843322,819600.0,alexduczmal@yahoo.com,a|duczmal|1951-05-22,4
2,6702910e-c2d9-4780-a278-cc7680c1cc67,Alxe,Duczmal,1951-05-22,alex.duczmal@yahoo.com,PL,10320843322,602 099 499,ulica Cyprysowa 08/38,Szczecinek,...,ulica cyprysowa 0838,szczecinek,1951-05-22,1951,"[86265554, 194400707, 137792012, 1327957820, 9...",10320843322,99499.0,alexduczmal@yahoo.com,a|duczmal|1951-05-22,4
3,d2c390a6-d7c7-4e38-8715-fa1f1e14834b,Mark,Johnson,1993-09-26,mark.johnson@gmail.com,US,251-29-2287,001-218-519-6001x3389,79402 Peterson Drives Apt. 511,Davisstad,...,79402 peterson drives apartment 511,davisstad,1993-09-26,1993,"[234886539, 180801429, 387971711, 747788293, 3...",251292287,13389.0,markjohnson@gmail.com,m|johnson|1993-09-26,4
4,d2c390a6-d7c7-4e38-8715-fa1f1e14834b,Mark,Johnson,1993-09-26,,US,251-29-2287,,79402 Peterson Drives Apt. 511,Davisstad,...,79402 peterson drives apartment 511,davisstad,1993-09-26,1993,"[234886539, 180801429, 387971711, 747788293, 3...",251292287,,,m|johnson|1993-09-26,2


## 4) Train the model

Trains the classifier and writes the model artifact under `models/`.


In [5]:
# Run training as a module, then report whether the expected model file exists.
run_and_show([sys.executable, "-m", "src.train_model"], title="Train model")

model_path = project_root.joinpath("models", "entity_resolution_model.pkl")
print("\nModel exists:", model_path.exists(), "->", model_path)



Train model
$ c:\Users\kugor\anaconda3\envs\silent-eight\python.exe -m src.train_model
Connecting to database...
Loading processed data...
Loading ground truth...
Generating candidates via LSH & Blocking Keys...
  > Loading LSH Index from disk (Fast)...
  > Loaded LSH Index with 2554 records.
  > Querying LSH Index...
  > Found 1507 pairs via LSH.
  > Querying Exact Blocking Keys (Optimized with UNION)...
  > Found 2804 pairs via Exact Keys.
Total Unique Candidate Pairs: 3044

--- Blocking Performance Report ---
Total Records: 2,554
Candidate Pairs: 3,044
Reduction Ratio: 99.90663095%
Pairs per Record: 1.19

[Block Size Analysis - Top 3 Largest Blocks per Key]
Key: bk_nid
  '019475061': 4 records
  '027371999': 4 records
  '04240376Y753': 4 records
Key: bk_phone
  '333945': 10 records
  '800384': 8 records
  '606474': 7 records
Key: bk_email
  'sheliajohnson@outlook.com': 5 records
  'aaronkelly@example.net': 4 records
  'adacitko@hotmail.com': 4 records
------------------------------

## 5) Offline matching + evaluation

Runs candidate generation + scoring + clustering and outputs evaluation CSVs under `data/`.


In [6]:
# Run offline matching as a module, list the CSVs under data/, and preview
# the manual-review cases if that file exists.
run_and_show([sys.executable, "-m", "src.matching"], title="Offline matching + evaluation")

import pandas as pd

data_dir = project_root / "data"
if data_dir.exists():
    csvs = sorted(data_dir.glob("*.csv"))
else:
    csvs = []

print("\nData CSV outputs:")
for p in csvs:
    print("-", p.name)

preview = data_dir / "manual_review_cases.csv"
if preview.exists():
    df = pd.read_csv(preview)
    print("\nPreview of manual_review_cases.csv:")
    display(df.head(5))



Offline matching + evaluation
$ c:\Users\kugor\anaconda3\envs\silent-eight\python.exe -m src.matching
Loading processed data for lookup...
Generating candidates via LSH & Blocking Keys...
  > Loading LSH Index from disk (Fast)...
  > Loaded LSH Index with 2554 records.
  > Querying LSH Index...
  > Found 1507 pairs via LSH.
  > Querying Exact Blocking Keys (Optimized with UNION)...
  > Found 2804 pairs via Exact Keys.
Total Unique Candidate Pairs: 3044

--- Blocking Performance Report ---
Total Records: 2,554
Candidate Pairs: 3,044
Reduction Ratio: 99.90663095%
Pairs per Record: 1.19

[Block Size Analysis - Top 3 Largest Blocks per Key]
Key: bk_nid
  '019475061': 4 records
  '027371999': 4 records
  '04240376Y753': 4 records
Key: bk_phone
  '333945': 10 records
  '800384': 8 records
  '606474': 7 records
Key: bk_email
  'sheliajohnson@outlook.com': 5 records
  'aaronkelly@example.net': 4 records
  'adacitko@hotmail.com': 4 records
------------------------------

Calculating comparison

Unnamed: 0,id_a,id_b,nid_score,nid_both_present,email_score,phone_match,first_name_score,last_name_score,addr_score,dob_match,dob_both_present,year_match,ml_prob,match_type,confidence_score,explanation,is_match
0,31feeb09-eac7-4a32-a03c-f3cece151373,86ae1d06-5a86-4ed1-98ba-505d7b65180f,0.333333,1,0.944778,0,1.0,1.0,0.086957,0,1,0,0.012801,review,0.5,Strong Name Match Only,0
1,2b245530-3994-4329-9c55-2a2b7df15a45,31feeb09-eac7-4a32-a03c-f3cece151373,0.333333,1,0.944778,0,1.0,1.0,0.086957,0,1,0,0.012801,review,0.5,Strong Name Match Only,0
2,1130c788-13da-4fe9-8c10-25988b3d84f1,86ae1d06-5a86-4ed1-98ba-505d7b65180f,0.333333,1,0.944778,0,1.0,1.0,0.086957,0,1,0,0.012801,review,0.5,Strong Name Match Only,0
3,c568bc84-4405-4780-afa5-4f20137ce8f3,ea8ac1ad-6ff8-4ef2-96b4-72e40f54be1b,0.0,0,1.0,0,1.0,1.0,0.0,0,1,0,0.40681,review,0.55,Moderate ML Probability (0.41); Strong Name Ma...,0
4,1130c788-13da-4fe9-8c10-25988b3d84f1,2b245530-3994-4329-9c55-2a2b7df15a45,0.333333,1,0.944778,0,1.0,1.0,0.086957,0,1,0,0.012801,review,0.5,Strong Name Match Only,0


## 6) Start the FastAPI service

In [7]:
import socket
import tempfile
import time

db_path = project_root / "data" / "clients.db"
model_path = project_root / "models" / "entity_resolution_model.pkl"
lsh_index_path = project_root / "models" / "lsh_index.pkl"

# Artifact locations are handed to the API process through environment variables.
env = os.environ.copy()
env["ER_DB_PATH"] = str(db_path)
env["ER_MODEL_PATH"] = str(model_path)
env["ER_LSH_INDEX_PATH"] = str(lsh_index_path)

host = "0.0.0.0" if running_in_colab() else "127.0.0.1"
port = 8000
startup_timeout_s = 30.0

# Re-running this cell: stop the server started by a previous run so the port
# is free and the old process is not leaked.
if "proc" in globals() and proc is not None and proc.poll() is None:
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.wait()

# Send server output to a log file rather than an unread PIPE: a pipe nobody
# drains eventually fills up and blocks the child process.
server_log_path = Path(tempfile.gettempdir()) / "silent_eight_uvicorn.log"
with open(server_log_path, "wb") as server_log:
    proc = subprocess.Popen(
        [sys.executable, "-m", "uvicorn", "src.api:app", "--host", host, "--port", str(port)],
        env=env,
        stdout=server_log,
        stderr=subprocess.STDOUT,
    )

# Wait until the port accepts connections instead of sleeping a fixed time,
# and fail with the server log if the process dies during startup.
deadline = time.monotonic() + startup_timeout_s
while True:
    if proc.poll() is not None:
        raise RuntimeError(
            f"Server exited during startup (exit code {proc.returncode}). "
            f"Log ({server_log_path}):\n"
            + server_log_path.read_text(encoding="utf-8", errors="replace")
        )
    try:
        socket.create_connection(("127.0.0.1", port), timeout=0.5).close()
        break
    except OSError:
        if time.monotonic() >= deadline:
            proc.terminate()
            raise RuntimeError(
                f"Server did not accept connections on port {port} within "
                f"{startup_timeout_s:.0f}s. See log: {server_log_path}"
            ) from None
        time.sleep(0.2)

print("Server started (pid):", proc.pid)
print("Server log:", server_log_path)

if running_in_colab():
    from google.colab import output  # type: ignore
    output.serve_kernel_port_as_iframe(8000, path="docs")
else:
    print("Docs:    http://127.0.0.1:8000/docs")
    print("Health:  http://127.0.0.1:8000/health")
    print("Metrics: http://127.0.0.1:8000/metrics")


Server started (pid): 3592
Docs:    http://127.0.0.1:8000/docs
Health:  http://127.0.0.1:8000/health
Metrics: http://127.0.0.1:8000/metrics


## 7) Call the API (4 illustrative examples)

This section pulls a couple of real rows from the generated data (`messy_df`, loaded from `data/messy_data.csv`) and then calls `/resolve` with:
- **Likely match**: consistent name + national ID from the same entity.
- **Harder case**: some fields omitted, different ID, different spelling.
- **Conflict (review)**: **national ID from one entity** combined with a **strong name match for a different entity**.
- **No match**: a new entity that merely happens to share a name with an existing one.

In [13]:
import requests

# Base address of the locally running API; reused by the /resolve cells.
base_url = "http://127.0.0.1:8000"

print("GET /health")
health_response = requests.get(f"{base_url}/health", timeout=10)
print(health_response.json())

GET /health
{'status': 'ok', 'model_loaded': True}


In [9]:
print("POST /resolve (easy case)")
# Easy case: send every field of the first generated record unchanged.
payload = {
    "first_name": messy_df.iloc[0]["first_name"],
    "last_name": messy_df.iloc[0]["last_name"],
    "dob": messy_df.iloc[0]["dob"],
    "national_id": messy_df.iloc[0]["national_id"],
    "email": messy_df.iloc[0]["email"],
    "address": messy_df.iloc[0]["address"],
    "phone_number": messy_df.iloc[0]["phone_number"],
}
# Empty CSV fields load as NaN, which is not valid JSON; send them as null.
# NOTE(review): assumes the API accepts null for optional fields — confirm.
payload = {key: (None if pd.isna(value) else value) for key, value in payload.items()}
resp = requests.post(f"{base_url}/resolve", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print("Status:", result.get("status"))
print("Candidates checked:", result.get("candidates_checked"))
print("Processing time (ms):", result.get("processing_time_ms"))
print("Best match:", result.get("best_match"))

POST /resolve (easy case)
Status: match
Candidates checked: 7
Processing time (ms): 53.99441719055176
Best match: {'candidate_id': 'fab58e7f-1685-4873-a346-4ead834b11c2', 'match_type': 'match', 'confidence_score': 0.9992415904998779, 'ml_probability': 0.9992415904998779, 'scores': {'name': 1.0, 'national_id': 1.0, 'email': 1.0, 'phone': 1.0, 'address': 1.0}, 'explanation': ['Rule 1: Strong National ID & Name Match', 'Rule 2: Strong Contact Info & Name Match (Verified)', 'Rule 3: Exact DOB & Strong Name Match', 'Rule 4: Address & Strong Name Match', 'Rule 5: High ML Probability (1.00)', 'Reason: Exact Email Match', 'Reason: Exact Phone Match', 'Reason: Strong National ID Match', 'Reason: Exact Date of Birth Match', 'Rule 6: Strong ID + Initials Match']}


In [10]:
print("\nPOST /resolve (harder case, some fields omitted, different ID, different spelling)")

# Harder case: no DOB, no address, and a national ID unrelated to the record.
# NOTE(review): labelled "different spelling", but in the recorded run row 0's
# first name is also "Alex" (name score 1.0) — confirm the intended variant.
payload = {
    "first_name": "Alex", # different spelling
    "last_name": messy_df.iloc[0]["last_name"],
    "dob": None,
    "national_id": "032415893581", # different random ID
    "email": messy_df.iloc[0]["email"],
    "phone_number": messy_df.iloc[0]["phone_number"],
}
# Empty CSV fields load as NaN, which is not valid JSON; send them as null.
payload = {key: (None if pd.isna(value) else value) for key, value in payload.items()}

print("POST /resolve")
resp = requests.post(f"{base_url}/resolve", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print("Status:", result.get("status"))
print("Candidates checked:", result.get("candidates_checked"))
print("Processing time (ms):", result.get("processing_time_ms"))
print("Best match:", result.get("best_match"))


POST /resolve (harder case, some fields omitted, different ID, different spelling)
POST /resolve
Status: review
Candidates checked: 7
Processing time (ms): 57.000160217285156
Best match: {'candidate_id': 'fab58e7f-1685-4873-a346-4ead834b11c2', 'match_type': 'review', 'confidence_score': 0.7857090830802917, 'ml_probability': 0.7857090830802917, 'scores': {'name': 1.0, 'national_id': 0.33333333333333337, 'email': 1.0, 'phone': 1.0, 'address': 0.0}, 'explanation': ['Moderate ML Probability (0.79)', 'Strong Name Match Only', 'Exact Email Match Only']}


In [11]:
print("\nPOST /resolve (harder case, Name match to one entity, ID match to a different entity)")

# Conflict case: every field comes from row 0 except the national ID, which
# is taken from row 100.
# NOTE(review): assumes row 100 belongs to a different entity — confirm.
payload = {
    "first_name": messy_df.iloc[0]['first_name'],
    "last_name": messy_df.iloc[0]["last_name"],
    "dob": messy_df.iloc[0]['dob'],
    "national_id": messy_df.iloc[100]['national_id'],
    "email": messy_df.iloc[0]["email"],
    "phone_number": messy_df.iloc[0]["phone_number"],
}
# Empty CSV fields load as NaN, which is not valid JSON; send them as null.
payload = {key: (None if pd.isna(value) else value) for key, value in payload.items()}

print("POST /resolve")
resp = requests.post(f"{base_url}/resolve", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
print("Status:", result.get("status"))
print("Candidates checked:", result.get("candidates_checked"))
print("Processing time (ms):", result.get("processing_time_ms"))
print("Best match:", result.get("best_match"))


POST /resolve (harder case, Name match to one entity, ID match to a different entity)
POST /resolve
Status: review
Candidates checked: 11
Processing time (ms): 58.995962142944336


In [12]:
print("\nPOST /resolve (harder case, new person that happens to have the same name)")

# Only the name is borrowed from row 0; DOB, national ID, email and phone
# are made-up literals.
same_name_row = messy_df.iloc[0]
payload = dict(
    first_name=same_name_row["first_name"],
    last_name=same_name_row["last_name"],
    dob="11-12-1999",
    national_id="02349872645",
    email="email.email@gmail.com",
    phone_number="111 111 111",
)

print("POST /resolve")
resp = requests.post(f"{base_url}/resolve", json=payload, timeout=30)
resp.raise_for_status()
result = resp.json()
for label, field in (
    ("Status:", "status"),
    ("Candidates checked:", "candidates_checked"),
    ("Processing time (ms):", "processing_time_ms"),
    ("Best match:", "best_match"),
):
    print(label, result.get(field))


POST /resolve (harder case, new person that happens to have the same name)
POST /resolve


Status: no_match
Candidates checked: 0
Processing time (ms): 44.00277137756348
Best match: None


## 8) Stop the server

Run when you're done.


In [None]:
# Stop the API server if it is still running. Safe to run when the start cell
# was never executed or the server has already exited.
if "proc" in globals() and proc and proc.poll() is None:
    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        # Graceful shutdown timed out: force-kill, then wait so the process
        # is reaped instead of lingering as a zombie.
        proc.kill()
        proc.wait()
    print("Server stopped.")
else:
    print("Server is not running.")
