In [None]:
#!/usr/bin/env python3

import csv
import os
import re
import difflib

# ------------------ CONFIG - adjust paths ------------------
VOA_INPUT = r""
EPC_INPUT = r""
OUTPUT_CSV = r""
# -----------------------------------------------------------

# Safety: create the output directory if it is missing.
# os.path.dirname() returns "" for an empty path or a bare filename, and
# os.makedirs("") raises FileNotFoundError, so only create a directory when
# the configured path actually names one.
_OUTPUT_DIR = os.path.dirname(OUTPUT_CSV)
if _OUTPUT_DIR:
    os.makedirs(_OUTPUT_DIR, exist_ok=True)

# Regexes
# UK-postcode-like token: 1-2 letters, 1-2 digits, an optional letter,
# optional whitespace, then a digit and two letters (case-insensitive).
POSTCODE_RE = re.compile(r'\b([A-Z]{1,2}\d{1,2}[A-Z]?\s*\d[A-Z]{2})\b', re.IGNORECASE)
# Any character that is not a lowercase ASCII letter or a digit. Uppercase
# letters match too, so apply this to text that is already lower-cased.
NON_ALNUM = re.compile(r'[^a-z0-9]')
# One or more consecutive whitespace characters.
MULTISP = re.compile(r'\s+')

# Matching params
FUZZY_RATIO_THRESHOLD = 0.70   # if fuzzy similarity >= this, consider same (tuneable)
# Intended meaning: if postcodes match exactly, use a lower similarity threshold.
# NOTE(review): no code visible in this file reads this flag - confirm before relying on it.
POSTCODE_EXACT_PRIORITY = True

def normalize_postcode(pc):
    """Return a canonical form of a UK-postcode-like string.

    Every character other than an ASCII letter or digit is dropped and the
    remainder is upper-cased. When at least five characters remain, a single
    space is inserted before the final three; shorter results are returned
    unchanged. Falsy input (None, "") yields "".
    """
    if not pc:
        return ""
    compact = re.sub(r'[^A-Za-z0-9]', '', pc).upper()
    # Fewer than five characters is too short to split into two parts.
    if len(compact) < 5:
        return compact
    head, tail = compact[:-3], compact[-3:]
    return f"{head} {tail}"

def postcode_equal_loose(pc1, pc2):
    """Return True if two postcodes agree on everything except their last character.

    Both inputs are passed through normalize_postcode() and have spaces
    removed before comparison, so case, spacing and punctuation differences
    are ignored.

    Returns False when either input is falsy, or when either input has fewer
    than two characters left after normalisation.
    """
    if not pc1 or not pc2:
        return False
    left = normalize_postcode(pc1).replace(" ", "")
    right = normalize_postcode(pc2).replace(" ", "")
    # Punctuation-only input normalises to "", and a one-character string has
    # an empty prefix; without this guard "" == "" would be reported as a match.
    if len(left) < 2 or len(right) < 2:
        return False
    # Compare all but the final character.
    return left[:-1] == right[:-1]

def normalize_text(s):
    """Return *s* lower-cased with punctuation removed and whitespace collapsed.

    "&" is spelled out as "and". Every character matched by NON_ALNUM is
    replaced by a space (no diacritic folding is attempted), then runs of
    whitespace are collapsed to a single space and the ends are trimmed.
    Falsy input yields "".
    """
    if not s:
        return ""
    lowered = s.lower().replace("&", " and ")
    spaced = NON_ALNUM.sub(" ", lowered)
    return MULTISP.sub(" ", spaced).strip()

def make_match_key(address_text, postcode):
    """Build a compact, separator-free key from an address and postcode.

    The address is normalised with normalize_text() and a handful of common
    noise words are removed; the postcode is normalised with
    normalize_postcode(). The two are concatenated, lower-cased, and reduced
    to lowercase letters and digits only. For example "10 Downing Street"
    with "SW1A 2AA" gives "10downingstreetsw1a2aa".

    Args:
        address_text: free-text address (may be empty or None).
        postcode: postcode string (may be empty or None).

    Returns:
        The key as a string; "" when nothing usable remains.
    """
    pc = normalize_postcode(postcode) if postcode else ""
    addr = normalize_text(address_text)
    # strip a few very common words that add noise (but not 'data'/'server' etc.)
    addr = re.sub(r'\b(the|ltd|limited|co|company|building|unit|floor)\b', '', addr)
    # normalize_postcode() upper-cases its result, while NON_ALNUM keeps only
    # lowercase a-z and 0-9. Lower-case first, otherwise the postcode's
    # letters are stripped and only its digits reach the key.
    key = (addr + " " + pc).lower()
    # Drop every remaining separator (this also removes all spaces).
    return NON_ALNUM.sub('', key)

def extract_postcode_from_text(s):
    """Return the first POSTCODE_RE match in *s*, upper-cased and trimmed.

    Returns "" when *s* is falsy or contains no postcode-like token.
    """
    found = POSTCODE_RE.search(s) if s else None
    if found is None:
        return ""
    return found.group(1).upper().strip()

# ------------------ Read VOA file (asterisk-delimited) ------------------
def read_voa_file(path):
    """Parse an asterisk-delimited VOA file into a list of record dicts.

    Each non-blank line is split on "*". The postcode is taken from the last
    field that contains a postcode-like token. The address guess is the first
    six fields containing at least one ASCII letter, joined with ", "; when
    no field contains a letter, the whole stripped line is used.

    Args:
        path: location of the VOA file.

    Returns:
        A list of dicts with keys "source", "voa_raw", "voa_address_guess",
        "postcode", "lmkey" (always "") and "match_key". The list is empty,
        and a message is printed, when *path* does not exist.
    """
    out = []
    if not os.path.exists(path):
        print("VOA file not found:", path)
        return out
    with open(path, "r", encoding="utf-8", errors="replace") as fh:
        for raw in fh:
            line = raw.strip()
            if not line:
                continue
            parts = line.split("*")
            # Find a postcode anywhere in the parts; scan from the end because
            # the last occurrences tend to be the address/postcode fields.
            postcode = ""
            for p in reversed(parts):
                pc = extract_postcode_from_text(p)
                if pc:
                    postcode = pc
                    break
            # Keep text-like parts only. Requiring a letter already excludes
            # empty fields and purely numeric codes/IDs, so no separate
            # numeric filters are needed.
            addr_candidates = [p.strip() for p in parts if re.search(r'[A-Za-z]', p)]
            address_guess = ", ".join(addr_candidates[:6]) if addr_candidates else line
            record = {
                "source": "VOA",
                "voa_raw": line,
                "voa_address_guess": address_guess,
                "postcode": postcode,
                "lmkey": "",  # VOA doesn't use LMK_KEY naming; leaving empty
            }
            record["match_key"] = make_match_key(record["voa_address_guess"], record["postcode"])
            out.append(record)
    return out

# ------------------ Read EPC file (auto-detect delimiter) ------------------
def detect_delimiter_sample(path, sample_bytes=8192):
    """Guess the field delimiter of the text file at *path*.

    Reads up to *sample_bytes* characters from the start of the file (the
    file is opened in text mode, so the limit counts characters). If the
    sample contains a tab anywhere, tab is returned. Otherwise csv.Sniffer
    chooses between "," and ";", and "," is returned if sniffing fails.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        head = handle.read(sample_bytes)
    if "\t" not in head:
        # No tab seen: let the sniffer decide between comma and semicolon.
        try:
            return csv.Sniffer().sniff(head, delimiters=",;").delimiter
        except Exception:
            return ","
    # A tab anywhere in the sample wins outright.
    return "\t"

def read_epc_file(path):
    """Read a delimited EPC file (with header row) into a list of record dicts.

    The delimiter is chosen by detect_delimiter_sample(). The address is the
    non-blank values of the ADDRESS, ADDRESS1, ADDRESS2 and ADDRESS3 columns
    joined with ", ". When all of those are blank, it falls back to
    PROPERTY_TYPE, then to the first 200 characters of all non-empty row
    values joined by spaces.

    Args:
        path: location of the EPC file.

    Returns:
        A list of dicts with keys "source", "epc_raw_fields" (the row dict as
        read), "epc_address", "postcode", "lmkey" and "match_key". The list
        is empty, and a message is printed, when *path* does not exist.
    """
    out = []
    if not os.path.exists(path):
        print("EPC file not found:", path)
        return out
    delim = detect_delimiter_sample(path)
    with open(path, newline="", encoding="utf-8", errors="replace") as fh:
        reader = csv.DictReader(fh, delimiter=delim)
        # guard: if no header row could be read, rewind and retry as tab-separated
        if not reader.fieldnames:
            fh.seek(0)
            reader = csv.DictReader(fh, delimiter="\t")
        for row in reader:
            # Strip before testing, so whitespace-only columns do not add
            # empty fragments (e.g. "1 High St, ") to the address.
            addr_parts = []
            for k in ("ADDRESS", "ADDRESS1", "ADDRESS2", "ADDRESS3"):
                v = (row.get(k) or "").strip()
                if v:
                    addr_parts.append(v)
            address = ", ".join(addr_parts)
            # No usable address columns: fall back to PROPERTY_TYPE, then to a
            # truncated dump of every non-empty value in the row.
            if not address:
                address = row.get("PROPERTY_TYPE", "") or " ".join([str(x) for x in row.values() if x])[:200]
            postcode = (row.get("POSTCODE") or row.get("Postcode") or "").strip()
            lmkey = row.get("LMK_KEY") or row.get("LMKKEY") or row.get("LMK Key") or ""
            record = {
                "source": "EPC",
                "epc_raw_fields": row,     # keep full dict for inspection if you want
                "epc_address": address,
                "postcode": postcode,
                "lmkey": lmkey,
            }
            record["match_key"] = make_match_key(record["epc_address"], record["postcode"])
            out.append(record)
    return out

# ------------------ Merge with deduplication ------------------
def _merge_epc_into(target, epc):
    """Fold one EPC record into an existing merged record, in place."""
    # Append rather than rebuild through a set, so the order of "sources"
    # is the same on every run.
    if "EPC" not in target["sources"]:
        target["sources"].append("EPC")
    target["epc_address"] = epc.get("epc_address") or target["epc_address"]
    raw = str(epc.get("epc_raw_fields") or "")
    # Join only non-empty pieces, so the first EPC row merged into a record
    # does not leave a leading " || " separator.
    target["epc_raw_repr"] = " || ".join(
        piece for piece in (target["epc_raw_repr"], raw) if piece
    )
    target["epc_lmkey"] = epc.get("lmkey") or target.get("epc_lmkey", "")


def merge_records(voa_list, epc_list):
    """Combine VOA and EPC records into one de-duplicated list.

    Every VOA record becomes one merged record. Each EPC record is then
    either folded into an existing merged record or appended as a new one:

    1. If the EPC postcode is non-empty, it is merged into the first existing
       record whose postcode is loosely equal (postcode_equal_loose).
       NOTE(review): this step ignores the address entirely - confirm that
       merging on postcode alone is intended.
    2. Otherwise it is merged into the existing record with the most similar
       match key, provided the difflib ratio is >= FUZZY_RATIO_THRESHOLD.
    3. Otherwise it is appended as an EPC-only record.

    Args:
        voa_list: records with "match_key", "postcode", "voa_address_guess"
            and "voa_raw" keys.
        epc_list: records with "match_key", "postcode", "epc_address",
            "lmkey" and "epc_raw_fields" keys.

    Returns:
        A list of dicts with keys "match_key", "postcode", "voa_address",
        "voa_raw", "epc_address", "epc_lmkey", "epc_raw_repr" and "sources"
        (a list of source names).
    """
    merged = []
    # First add all VOA entries
    for r in voa_list:
        merged.append({
            "match_key": r["match_key"],
            "postcode": normalize_postcode(r["postcode"]),
            "voa_address": r["voa_address_guess"],
            "voa_raw": r["voa_raw"],
            "epc_address": "",
            "epc_lmkey": "",
            "epc_raw_repr": "",
            "sources": ["VOA"],
        })

    # Now incorporate EPC entries
    matcher = difflib.SequenceMatcher(None)
    for e in epc_list:
        key = e["match_key"]
        pc = normalize_postcode(e["postcode"])
        target = None

        # ---- 1. Exact or loose postcode match (strongest merge condition) ----
        if pc:
            target = next(
                (
                    existing
                    for existing in merged
                    if existing["postcode"]
                    and postcode_equal_loose(existing["postcode"], pc)
                ),
                None,
            )

        # ---- 2. Fuzzy address match fallback ----
        # Skipped for an empty key: two empty strings score 1.0 and would
        # merge records that carry no address information at all.
        if target is None and key:
            # SequenceMatcher caches its analysis of the second sequence, so
            # set the EPC key once and vary only the first sequence.
            matcher.set_seq2(key)
            best = None
            best_score = 0.0
            for existing in merged:
                matcher.set_seq1(existing["match_key"])
                score = matcher.ratio()
                if score > best_score:
                    best_score = score
                    best = existing
            if best is not None and best_score >= FUZZY_RATIO_THRESHOLD:
                target = best

        if target is not None:
            _merge_epc_into(target, e)
            continue

        # ---- 3. Otherwise add as new record ----
        merged.append({
            "match_key": key,
            "postcode": pc,
            "voa_address": "",
            "voa_raw": "",
            "epc_address": e.get("epc_address"),
            "epc_lmkey": e.get("lmkey"),
            "epc_raw_repr": str(e.get("epc_raw_fields") or ""),
            "sources": ["EPC"],
        })

    return merged

# ------------------ Write output CSV ------------------
def write_output(path, records):
    """Write merged records to *path* as a UTF-8 CSV with a fixed column order.

    Only the columns listed below are written; a missing key becomes "" and
    any other key in a record is ignored. A "sources" value that is a list is
    flattened into a semicolon-separated string. Prints a confirmation line
    when done.
    """
    columns = [
        "match_key",
        "postcode",
        "voa_address",
        "epc_address",
        "epc_lmkey",
        "sources",
        "voa_raw",
        "epc_raw_repr"
    ]

    def as_row(rec):
        # Project the record onto the output columns, flattening list-valued sources.
        row = {col: rec.get(col, "") for col in columns}
        srcs = row.get("sources")
        if isinstance(srcs, list):
            row["sources"] = ";".join(srcs)
        return row

    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=columns)
        writer.writeheader()
        writer.writerows(as_row(rec) for rec in records)
    print("Wrote merged output:", path)

# ------------------ Main ------------------
def main():
    """Run the pipeline: load both inputs, merge them, and write the CSV.

    Paths come from the module-level VOA_INPUT, EPC_INPUT and OUTPUT_CSV
    settings; progress counts are printed along the way.
    """
    print("Reading VOA input:", VOA_INPUT)
    voa_records = read_voa_file(VOA_INPUT)
    print("VOA records loaded:", len(voa_records))

    print("Reading EPC input:", EPC_INPUT)
    epc_records = read_epc_file(EPC_INPUT)
    print("EPC records loaded:", len(epc_records))

    combined = merge_records(voa_records, epc_records)
    print("Merged unique records:", len(combined))

    write_output(OUTPUT_CSV, combined)


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()
