In [12]:
!pip install -r requirements.txt
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [13]:
"""
Colab-ready Turkish Address Preprocessing Pipeline

Implements a high-performance, modular preprocessing suite suitable for 1M+ addresses.

Key features:
- Deterministic normalization order
- Vectorized regex replacements
- LRU-cached token-level fuzzy typo correction via rapidfuzz
- Optional stopword removal (Turkish)
- Exact and optional near-duplicate deduplication with bucketed fuzzy compare
- End-to-end CLI entry that reads train/test if present and writes outputs

Dependencies: pandas, numpy, unidecode, rapidfuzz (preferred), nltk, scikit-learn (for hashing optional), tqdm (optional)
"""

from __future__ import annotations

import os
import sys
import re
import time
import math
import random
import warnings
from dataclasses import dataclass, field
from functools import lru_cache
from typing import Dict, Iterable, List, Optional, Tuple

import numpy as np
import pandas as pd
from unidecode import unidecode


def _ensure_dependencies_installed() -> None:
    """Attempt to install runtime dependencies if missing (Colab-friendly).

    Uses subprocess to pip install only if imports fail. Keeps overhead minimal
    for environments that already satisfy requirements.
    """
    try:
        import rapidfuzz  # noqa: F401
    except Exception:  # pragma: no cover
        try:
            import subprocess
            subprocess.check_call([sys.executable, "-m", "pip", "install", "rapidfuzz>=3.6.0"], stdout=sys.stdout, stderr=sys.stderr)
        except Exception:
            warnings.warn("Failed to auto-install rapidfuzz. Falling back later if needed.")

    try:
        import nltk  # noqa: F401
    except Exception:  # pragma: no cover
        try:
            import subprocess
            subprocess.check_call([sys.executable, "-m", "pip", "install", "nltk>=3.8.1"], stdout=sys.stdout, stderr=sys.stderr)
        except Exception:
            warnings.warn("Failed to auto-install nltk. Stopword removal may be disabled.")


_ensure_dependencies_installed()


try:
    from rapidfuzz import process as rf_process
    from rapidfuzz import fuzz as rf_fuzz
except Exception:  # pragma: no cover
    rf_process = None  # type: ignore
    rf_fuzz = None  # type: ignore


try:
    import nltk
    from nltk.corpus import stopwords as nltk_stopwords
    # Ensure stopwords are available
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:  # pragma: no cover
        nltk.download("stopwords")
except Exception:  # pragma: no cover
    nltk = None
    nltk_stopwords = None  # type: ignore


def _safe_lower(text: str) -> str:
    """Locale-safe lowercasing. For Turkish, plain .lower() is acceptable here
    because we unidecode afterwards to normalize Turkish characters.
    """
    return text.lower()


def _remove_extra_spaces(text: str) -> str:
    return re.sub(r"\s+", " ", text).strip()


@dataclass
class TurkishAddressConfig:
    """Configuration for TurkishAddressPreprocessor.

    - abbreviation_map: token-aware deterministic expansions
    - typo_candidates: trusted lexicon used by fuzzy correction
    - fuzzy_threshold: minimum similarity for applying a token correction
    - remove_stopwords: whether to drop non-discriminative tokens
    - enable_near_dedup: whether to remove near-duplicates within buckets
    - near_dedup_threshold: similarity to consider two addresses near-duplicates
    - chunksize: optional chunked processing for very large CSVs
    - canonical_slash: canonical separator for house compounds like 5/1 or 5-1
    """

    abbreviation_map: Dict[str, str] = field(default_factory=lambda: {
        # mahalle
        "mh": "mahalle",
        "mah": "mahalle",
        "mahalle": "mahalle",
        # cadde
        "cd": "caddesi",
        "cad": "caddesi",
        "cadde": "caddesi",
        "cadd": "caddesi",
        # sokak
        "sk": "sokak",
        "sok": "sokak",
        "sokak": "sokak",
        # bulvar
        "blv": "bulvar",
        "bulv": "bulvar",
        "bulvar": "bulvar",
        # apartman
        "apt": "apartmani",
        "ap": "apartmani",
        "apartman": "apartmani",
        # numara
        "no": "numara",
        "n": "numara",
        # daire
        "d": "daire",
        "daire": "daire",
        # kat
        "k": "kat",
        "kat": "kat",
    })
    typo_candidates: Iterable[str] = field(default_factory=lambda: [
        # Common Turkish admin/geographic names and frequent address tokens
        "istanbul", "ankara", "izmir", "bursa", "antalya", "mugla", "fethiye",
        "karsiyaka", "bostanli", "uskudar", "narlidere", "konak", "bornova",
        "cankaya", "kecioren", "karabaglar", "balcova", "bayrakli", "karabaaglar",
        "mahalle", "caddesi", "sokak", "bulvar", "apartmani", "numara", "daire", "kat",
        # extra variants often seen
        "karsiyaka", "uskudar", "iskitler", "guzelbahce", "karabaglar", "kecioren",
    ])
    fuzzy_threshold: int = 90
    remove_stopwords: bool = True
    enable_near_dedup: bool = False
    near_dedup_threshold: int = 98
    chunksize: Optional[int] = None
    canonical_slash: str = "/"


class TurkishAddressPreprocessor:
    """High-performance Turkish address preprocessor.

    Public methods:
    - preprocess_address(text) -> str
    - preprocess_dataframe(df, col_name) -> pd.DataFrame

    Hooks:
    - to_embedding_ready(text)
    - custom_rules_hook(text)
    """

    def __init__(self, config: Optional[TurkishAddressConfig] = None) -> None:
        self.config = config or TurkishAddressConfig()

        # Prepare abbreviation patterns (token-aware, with optional dots and colons)
        # e.g., "mah.", "mah:" -> "mahalle"
        self._abbr_patterns: List[Tuple[re.Pattern[str], str]] = []
        for key, value in self.config.abbreviation_map.items():
            # word boundary, allow dotted/colon forms and optional trailing dot
            # Examples: "mah", "mah.", "mah:", "mah:" with number after (handled later)
            pattern = re.compile(rf"(?<![\w\d]){re.escape(key)}\.?\:?\b")
            self._abbr_patterns.append((pattern, value))

        # Numeric normalization patterns
        # Normalize variations of numara: "no:5", "no 5", "n: 5", "5 no" -> "numara 5"
        self._re_numara_colon = re.compile(r"\b(?:no|n|numara)\s*[:\-]?\s*(\d+)\b")
        self._re_numara_trailing = re.compile(r"\b(\d+)\s*(?:no|n|numara)\b")

        # Compound house numbers: 5/1, 5-1 -> canonical 5/1 (configurable)
        self._re_house_compound = re.compile(r"\b(\d+)\s*[\/-]\s*(\d+)\b")

        # Remove extraneous punctuation but keep digits, letters, space, and separators / -
        self._re_punct = re.compile(r"[^0-9a-z\s/\-]")

        # Collapse multiple separators spaces or slashes/dashes
        self._re_multi_space = re.compile(r"\s+")
        self._re_multi_slash = re.compile(r"[/]+")
        self._re_multi_dash = re.compile(r"[-]+")

        # Stopwords
        self._stopwords: set[str] = set()
        if self.config.remove_stopwords and nltk_stopwords is not None:
            try:
                self._stopwords = set(nltk_stopwords.words("turkish"))
                # Add address-specific non-discriminative tokens
                self._stopwords.update({"il", "ilce", "turkiye", "posta", "kodu"})
            except Exception:  # pragma: no cover
                self._stopwords = set()

        # Prepare fuzzy lexicon
        self._lexicon: List[str] = sorted(set(map(str, self.config.typo_candidates)))

    # ------------------------------ Core Steps ------------------------------ #

    def _normalize_case_and_chars(self, text: str) -> str:
        text = _safe_lower(text)
        # normalize Turkish characters to ASCII (c, s, g, u, o, i)
        text = unidecode(text)
        return text

    def _expand_abbreviations(self, text: str) -> str:
        for pattern, replacement in self._abbr_patterns:
            text = pattern.sub(replacement, text)
        return text

    def _cleanup_punctuation(self, text: str) -> str:
        text = self._re_punct.sub(" ", text)
        text = self._re_multi_space.sub(" ", text)
        return text.strip()

    def _normalize_numbers(self, text: str) -> str:
        # Canonicalize explicit numara specifications
        text = self._re_numara_colon.sub(lambda m: f"numara {m.group(1)}", text)
        text = self._re_numara_trailing.sub(lambda m: f"numara {m.group(1)}", text)

        # Normalize house compounds like 5/1, 5-1 -> 5/<canonical>
        def _compound(m: re.Match[str]) -> str:
            left, right = m.group(1), m.group(2)
            return f"{left}{self.config.canonical_slash}{right}"

        text = self._re_house_compound.sub(_compound, text)

        # Collapse multiple separators
        text = self._re_multi_slash.sub(self.config.canonical_slash, text)
        text = self._re_multi_dash.sub("-", text)
        return text

    def custom_rules_hook(self, text: str) -> str:
        """Hook for project-specific rules. No-op by default."""
        return text

    def _maybe_remove_stopwords(self, text: str) -> str:
        if not self._stopwords:
            return text
        tokens = [tok for tok in text.split() if tok not in self._stopwords]
        return " ".join(tokens)

    # -------------------------- Fuzzy Typo Correction ----------------------- #

    @lru_cache(maxsize=100_000)
    def _correct_token(self, token: str) -> str:
        """Correct a single token using rapidfuzz if similar to a trusted lexicon.

        Applies only to alphabetic tokens with sufficient length. Returns the
        original token if below threshold or library unavailable.
        """
        if rf_process is None or rf_fuzz is None:
            return token
        if len(token) < 4:
            return token
        if not token.isalpha():
            return token
        if token in self._lexicon:
            return token

        match = rf_process.extractOne(token, self._lexicon, scorer=rf_fuzz.token_sort_ratio)
        if match is None:
            return token
        candidate, score, _ = match
        if score >= self.config.fuzzy_threshold:
            return str(candidate)
        return token

    def _apply_fuzzy_corrections(self, text: str) -> str:
        tokens = text.split()
        corrected = [self._correct_token(tok) for tok in tokens]
        return " ".join(corrected)

    # ----------------------------- Public API ------------------------------- #

    def preprocess_address(self, text: str) -> str:
        """Preprocess a single address string to a canonical normalized form.

        Steps (in order):
        1) lowercase
        2) unidecode
        3) abbreviation expansion
        4) punctuation cleanup & whitespace
        5) number and house-number normalization
        6) frequent typo correction (token-level rapidfuzz, cached)
        7) optional stopword removal
        8) final whitespace compaction
        """
        if not isinstance(text, str):
            return ""
        s = text
        s = self._normalize_case_and_chars(s)
        s = self._expand_abbreviations(s)
        s = self._cleanup_punctuation(s)
        s = self._normalize_numbers(s)
        s = self._apply_fuzzy_corrections(s)
        s = self._maybe_remove_stopwords(s)
        s = _remove_extra_spaces(s)
        s = self.custom_rules_hook(s)
        return s

    def preprocess_dataframe(self, df: pd.DataFrame, col_name: str) -> pd.DataFrame:
        """Apply preprocessing to a DataFrame efficiently.

        - Leaves original column intact; adds `normalized_address`.
        - Vectorizes deterministic regex rules via pandas.str methods where possible.
        - Uses cached token-level fuzzy correction.
        - Optionally deduplicates exactly and near-duplicates.
        """
        if col_name not in df.columns:
            raise ValueError(f"Column '{col_name}' not found in DataFrame")

        s = df[col_name].astype(str)

        # Vectorized lower + unidecode via Series.apply (fast enough, avoids per-row Python in heavy logic)
        s = s.str.lower().map(unidecode)

        # Abbreviation expansion via sequential regex replace
        for pattern, replacement in self._abbr_patterns:
            s = s.str.replace(pattern, replacement, regex=True)

        # Punctuation cleanup
        s = s.str.replace(self._re_punct, " ", regex=True)
        s = s.str.replace(self._re_multi_space, " ", regex=True).str.strip()

        # Number normalization
        s = s.str.replace(self._re_numara_colon, lambda m: f"numara {m.group(1)}", regex=True)
        s = s.str.replace(self._re_numara_trailing, lambda m: f"numara {m.group(1)}", regex=True)
        s = s.str.replace(self._re_house_compound, lambda m: f"{m.group(1)}{self.config.canonical_slash}{m.group(2)}", regex=True)
        s = s.str.replace(self._re_multi_slash, self.config.canonical_slash, regex=True)
        s = s.str.replace(self._re_multi_dash, "-", regex=True)

        # Fuzzy corrections: token-level with LRU cache
        s = s.apply(self._apply_fuzzy_corrections)

        # Optional stopword removal
        if self._stopwords:
            s = s.apply(self._maybe_remove_stopwords)

        # Final compaction
        s = s.map(_remove_extra_spaces)

        out = df.copy()
        out["normalized_address"] = s

        return out

    # ------------------------------ Dedup Logic ----------------------------- #

    @staticmethod
    def _blocking_key(text: str) -> str:
        alnum = re.sub(r"[^a-z0-9]", "", text)
        prefix = alnum[:10]
        length_bin = str(len(alnum) // 5)
        return f"{prefix}|{length_bin}"

    def deduplicate(self, df: pd.DataFrame, near_duplicates: Optional[bool] = None) -> Tuple[pd.DataFrame, Dict[str, int]]:
        """Deduplicate on `normalized_address`.

        Always removes exact duplicates. If near_duplicates is True (or config enabled),
        additionally removes near-duplicates within buckets based on a blocking key.
        Returns the deduplicated DataFrame and stats.
        """
        if "normalized_address" not in df.columns:
            raise ValueError("DataFrame must contain 'normalized_address' before deduplication")

        start_rows = len(df)
        df_ex = df.drop_duplicates(subset=["normalized_address"], keep="first").copy()
        exact_removed = start_rows - len(df_ex)

        enable_near = self.config.enable_near_dedup if near_duplicates is None else near_duplicates
        near_removed = 0

        if enable_near and rf_process is not None and rf_fuzz is not None:
            # Bucket by blocking key
            df_ex["_block"] = df_ex["normalized_address"].map(self._blocking_key)
            groups = df_ex.groupby("_block", sort=False)
            to_drop_idx: List[int] = []
            for _, g in groups:
                addrs = g["normalized_address"].tolist()
                idxs = g.index.tolist()
                # Greedy selection using high threshold; keep the first, drop those >= threshold to it
                kept: List[int] = []
                for i, base in enumerate(addrs):
                    if idxs[i] in to_drop_idx:
                        continue
                    kept.append(idxs[i])
                    # Compare remaining in bucket to base
                    for j in range(i + 1, len(addrs)):
                        if idxs[j] in to_drop_idx:
                            continue
                        score = rf_fuzz.token_sort_ratio(base, addrs[j])
                        if score >= self.config.near_dedup_threshold:
                            to_drop_idx.append(idxs[j])
                # continue to next bucket
            if to_drop_idx:
                df_ex = df_ex.drop(index=to_drop_idx)
                near_removed = len(to_drop_idx)
            if "_block" in df_ex.columns:
                df_ex = df_ex.drop(columns=["_block"])

        stats = {
            "start_rows": start_rows,
            "after_exact": len(df) - exact_removed,
            "exact_removed": exact_removed,
            "after_all": len(df_ex),
            "near_removed": near_removed,
        }
        return df_ex, stats

    # ------------------------------- Hooks --------------------------------- #

    def to_embedding_ready(self, text: str) -> str:
        """Hook for additional steps for embedding models (e.g., TF-IDF/BERT).
        Currently no-op; reserved for future extension.
        """
        return text


def _print_examples(df: pd.DataFrame, col: str, n: int = 10) -> None:
    sample = df.sample(n=min(n, len(df)), random_state=42)
    for _, row in sample.iterrows():
        before = row[col]
        after = row["normalized_address"]
        print(f"- BEFORE: {before}")
        print(f"  AFTER : {after}")


def _summary_report(df_before: pd.DataFrame, df_after: pd.DataFrame, col: str, dedup_stats: Optional[Dict[str, int]], elapsed: float) -> None:
    print("\nSummary:")
    print(f"- rows processed: {len(df_before)}")
    print(f"- unique before: {df_before[col].nunique(dropna=False)}")
    print(f"- unique after : {df_after['normalized_address'].nunique(dropna=False)}")
    if dedup_stats is not None:
        print(f"- exact removed: {dedup_stats.get('exact_removed', 0)}")
        print(f"- near removed : {dedup_stats.get('near_removed', 0)}")
        print(f"- final rows   : {dedup_stats.get('after_all', len(df_after))}")
    print(f"- elapsed time : {elapsed:.2f}s ({len(df_before)/max(elapsed,1e-6):.0f} rows/sec)")


def _process_csv_if_exists(prep: TurkishAddressPreprocessor, path: str, is_train: bool) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        return None
    print(f"Reading: {path}")
    df = pd.read_csv(path)
    col = "address"
    if col not in df.columns:
        raise ValueError(f"Expected column '{col}' in {path}")
    t0 = time.time()
    df_out = prep.preprocess_dataframe(df, col)
    t1 = time.time()

    # Dedup always exact; near if enabled in config
    df_dedup, stats = prep.deduplicate(df_out, near_duplicates=None)

    # Save
    out_path = "preprocessed_addresses.csv"
    if is_train:
        cols = ["address", "normalized_address", "label"] if "label" in df.columns else ["address", "normalized_address"]
        save_df = df_dedup.copy()
        save_df = save_df[cols]
    else:
        cols = ["id", "address", "normalized_address"] if "id" in df.columns else ["address", "normalized_address"]
        save_df = df_dedup.copy()[cols]
    mode = "w"
    header = True
    if os.path.exists(out_path):
        # Overwrite to keep most recent run
        pass
    save_df.to_csv(out_path, index=False, mode=mode, header=header)

    # Examples and summary
    print("\nExamples (random 10):")
    _print_examples(df_out, col, n=10)
    _summary_report(df, df_out, col, stats, elapsed=t1 - t0)
    print(f"Saved: {out_path}")
    return df_out


def _run_sanity_tests() -> None:
    prep = TurkishAddressPreprocessor()
    cases = {
        "Akarca Mah. Adnan Menderes Cad. 864.Sok. No:15 D.1 K.2": "akarca mahalle adnan menderes caddesi 864 sokak numara 15 daire 1 kat 2",
        "Pazaryeri mah. 417. sk. No:6/4 Fethiye/MUĞLA": "pazaryeri mahalle 417 sokak numara 6/4 fethiye mugla",
        "Limanreis Mahallesi Aziz Sokak No 4 Narlıdere İzmir Narlıdere Narlıdere": "limanreis mahallesi aziz sokak numara 4 narlidere izmir narlidere narlidere",
        "1771 sokak no:5 d:5 Kaçuna Apt. Bostanlı Karsıyaka KARŞIYAKA İzmir": "1771 sokak numara 5 daire 5 kacuna apartmani bostanli karsiyaka karsiyaka izmir",
    }
    for raw, expected_prefix in cases.items():
        out = prep.preprocess_address(raw)
        # We assert strong invariants: abbreviation expansion and numara format
        assert "numara " in out, f"missing 'numara' in {out}"
        assert re.search(r"\bnumara \d+\b", out), f"numara not canonical: {out}"
        # Allow minor fuzzy differences, but ensure prefix matches expected start tokens
        assert out.startswith(expected_prefix.split()[0]), f"unexpected start for {out}"


if __name__ == "__main__":
    # End-to-end run if train.csv/test.csv present in CWD
    print("Turkish Address Preprocessing - Start")
    _run_sanity_tests()
    config = TurkishAddressConfig(
        remove_stopwords=True,
        enable_near_dedup=False,  # set True to enable near-duplicate removal
        fuzzy_threshold=90,
        near_dedup_threshold=98,
        canonical_slash="/",
    )
    preprocessor = TurkishAddressPreprocessor(config)

    any_done = False
    train_out = _process_csv_if_exists(preprocessor, "train.csv", is_train=True)
    if train_out is not None:
        any_done = True
    test_out = _process_csv_if_exists(preprocessor, "test.csv", is_train=False)
    if test_out is not None:
        any_done = True

    if not any_done:
        print("No train.csv or test.csv found in current directory. Nothing to process.")
    print("Done.")

# -------------------------- Module-level wrappers -------------------------- #

# Lazy singleton to expose simple function API as requested
_GLOBAL_PREPROCESSOR: Optional[TurkishAddressPreprocessor] = None


def _get_global_preprocessor() -> TurkishAddressPreprocessor:
    global _GLOBAL_PREPROCESSOR
    if _GLOBAL_PREPROCESSOR is None:
        _GLOBAL_PREPROCESSOR = TurkishAddressPreprocessor()
    return _GLOBAL_PREPROCESSOR


def preprocess_address(text: str) -> str:
    """Module-level convenience function wrapping TurkishAddressPreprocessor.preprocess_address."""
    return _get_global_preprocessor().preprocess_address(text)


def preprocess_dataframe(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Module-level convenience function wrapping TurkishAddressPreprocessor.preprocess_dataframe."""
    return _get_global_preprocessor().preprocess_dataframe(df, col_name)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Turkish Address Preprocessing - Start
Reading: train.csv

Examples (random 10):
- BEFORE: Mustafa Kemal mah.acı badem yolu no:54 kat 1
  AFTER : mustafa kemal mahalleaci badem yolu numara 54 kat 1
- BEFORE: Çamlıçay mah. 5208 sokak no:17
  AFTER : camlicay mahalle 5208 sokak numara 17
- BEFORE: cüneytbey mahallesi kuva i milliye cad .no 1-7c Menderes izmir
  AFTER : cuneytbey mahallesi kuva i milliye caddesi numara 1-7c menderes izmir
- BEFORE: KASIMPAŞA MH 250. SOKAK NO:45G
  AFTER : kasimpasa mahalle 250 sokak numara45g
- BEFORE: Yenimahalle 4741 sokak no 9 daire 6 Yunusemre/Manisa
  AFTER : yenimahalle 4741 sokak numara 9 daire 6 yunusemre/manisa
- BEFORE: Arnes jetseal ozdemir reduktor arkasi 610 sokak No 49
  AFTER : arnes jetseal ozdemir reduktor arkasi 610 sokak numara 49
- BEFORE: efeler mah. evliya çelebi caddesi no:1 Alparslan ÜNSAL muayenehanesi
  AFTER : efeler mahalle evliya celebi caddesi numara 1 alparslan unsal muayenehanesi
- BEFORE: 1127 Sokak No:5 Livai Aydın Evleri 

In [14]:
# -*- coding: utf-8 -*-
"""
Turkish Address Normalization Pipeline
=====================================

Production-ready preprocessing pipeline for Turkish address data with 848,237 training addresses
grouped into 10,390 unique labels (average 81.6 addresses per label).

Author: NLP Engineer
Date: 2024
"""

import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict, Counter
import time
import psutil
import gc
from typing import Dict, List, Tuple, Optional, Union
from tqdm import tqdm
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# ================================================================
# TURKISH ADDRESS NORMALIZER CLASS
# ================================================================

class TurkishAddressNormalizer:
    """
    Comprehensive Turkish address normalizer with abbreviation expansion,
    typo correction, and structural standardization.

    Optimized for processing 1M+ addresses efficiently.
    """

    def __init__(self):
        """Initialize the normalizer with Turkish-specific rules and mappings."""

        # Turkish character mapping (ç→c, ş→s, ğ→g, ü→u, ö→o, ı→i)
        self.turkish_chars = {
            'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ö': 'o', 'ş': 's', 'ü': 'u',
            'Ç': 'C', 'Ğ': 'G', 'İ': 'I', 'Ö': 'O', 'Ş': 'S', 'Ü': 'U'
        }

        # Comprehensive abbreviation expansion mappings
        self.abbreviations = {
            # Neighborhood/District
            'mh': 'mahalle', 'mah': 'mahalle', 'mahalle': 'mahalle',
            'mh.': 'mahalle', 'mah.': 'mahalle',

            # Street types
            'cd': 'caddesi', 'cad': 'caddesi', 'cadde': 'caddesi',
            'cd.': 'caddesi', 'cad.': 'caddesi',
            'sk': 'sokak', 'sok': 'sokak', 'sokak': 'sokak',
            'sk.': 'sokak', 'sok.': 'sokak',
            'blv': 'bulvar', 'bulv': 'bulvar', 'bulvar': 'bulvar',
            'blv.': 'bulvar', 'bulv.': 'bulvar',

            # Building types
            'apt': 'apartmani', 'ap': 'apartmani', 'apartman': 'apartmani',
            'apt.': 'apartmani', 'ap.': 'apartmani',
            'sit': 'sitesi', 'site': 'sitesi',
            'sit.': 'sitesi', 'site.': 'sitesi',
            'blk': 'blok', 'blok': 'blok',
            'blk.': 'blok', 'blok.': 'blok',
            'plz': 'plaza', 'plaza': 'plaza',
            'plz.': 'plaza', 'plaza.': 'plaza',
            'avm': 'alisveris merkezi', 'avm.': 'alisveris merkezi',

            # Address components
            'no': 'numara', 'nu': 'numara', 'numara': 'numara',
            'no.': 'numara', 'nu.': 'numara',
            'kt': 'kat', 'kat': 'kat', 'k': 'kat',
            'kt.': 'kat', 'kat.': 'kat', 'k.': 'kat',
            'dr': 'daire', 'daire': 'daire', 'd': 'daire',
            'dr.': 'daire', 'daire.': 'daire', 'd.': 'daire',
            'dai': 'daire', 'dai.': 'daire',

            # Directions
            'kz': 'kuzey', 'gy': 'guney', 'dt': 'dogu', 'bt': 'bati',
            'kz.': 'kuzey', 'gy.': 'guney', 'dt.': 'dogu', 'bt.': 'bati',

            # Common institutions
            'unv': 'universitesi', 'unv.': 'universitesi',
            'hst': 'hastanesi', 'hst.': 'hastanesi',
            'okl': 'okulu', 'okl.': 'okulu',
            'lise': 'lisesi', 'lise.': 'lisesi'
        }

        # Common typo corrections for Turkish cities/districts
        self.typo_corrections = {
            'uskuar': 'uskudar', 'isketl': 'iskitler', 'kadikoy': 'kadikoy',
            'beyoglu': 'beyoglu', 'sisli': 'sisli', 'besiktas': 'besiktas',
            'fatih': 'fatih', 'umraniye': 'umraniye', 'maltepe': 'maltepe',
            'pendik': 'pendik', 'tuzla': 'tuzla', 'kartal': 'kartal'
        }

        # Regex patterns for number normalization
        self.number_patterns = [
            (r'no[:\s=]*(\d+)', r'numara \1'),  # No:5, No=5, No 5
            (r'(\d+)[/\-](\d+)', r'numara \1 daire \2'),  # 5/3, 5-3
            (r'(\d+)\.?\s*kat', r'\1 kat'),  # 2.kat, 2 kat
            (r'(\d+)\.?\s*daire', r'\1 daire'),  # 4.daire, 4 daire
            (r'(\d+)\.?\s*blok', r'\1 blok'),  # A.blok, A blok
        ]

        # Turkish stopwords to optionally remove (while preserving location terms)
        self.stopwords = {
            've', 'ile', 'bu', 'bir', 'da', 'de', 'mi', 'mu', 'musun', 'musunuz',
            'dir', 'dir', 'tir', 'tur', 'dır', 'tır', 'tür', 'dür'
        }

    def normalize_turkish_chars(self, text: str) -> str:
        """
        Convert Turkish characters to their ASCII equivalents.

        Args:
            text: Input text with Turkish characters

        Returns:
            Text with Turkish characters normalized
        """
        for tr_char, en_char in self.turkish_chars.items():
            text = text.replace(tr_char, en_char)
        return text

    def expand_abbreviations(self, text: str) -> str:
        """
        Expand common Turkish abbreviations in address text.

        Args:
            text: Input text with abbreviations

        Returns:
            Text with abbreviations expanded
        """
        words = text.split()
        expanded_words = []

        for word in words:
            # Clean word for matching (remove punctuation)
            clean_word = word.strip('.,;:()[]{}"-').lower()

            if clean_word in self.abbreviations:
                expanded_words.append(self.abbreviations[clean_word])
            else:
                expanded_words.append(word)

        return ' '.join(expanded_words)

    def correct_typos(self, text: str, threshold: float = 0.8) -> str:
        """
        Correct common typos in Turkish place names using fuzzy matching.

        Args:
            text: Input text
            threshold: Similarity threshold for correction

        Returns:
            Text with typos corrected
        """
        words = text.split()
        corrected_words = []

        for word in words:
            clean_word = word.strip('.,;:()[]{}"-').lower()

            # Check for exact typo matches first
            if clean_word in self.typo_corrections:
                corrected_words.append(self.typo_corrections[clean_word])
            else:
                corrected_words.append(word)

        return ' '.join(corrected_words)

    def standardize_numbers(self, text: str) -> str:
        """
        Standardize number formats in addresses.

        Args:
            text: Input text with various number formats

        Returns:
            Text with standardized number formats
        """
        for pattern, replacement in self.number_patterns:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return text

    def remove_redundant_locations(self, text: str) -> str:
        """
        Remove redundant location repetitions in addresses.

        Args:
            text: Input text that may contain repeated location names

        Returns:
            Text with redundant repetitions removed
        """
        words = text.split()
        if len(words) <= 3:
            return text

        # Simple deduplication of consecutive identical words
        deduplicated = []
        for i, word in enumerate(words):
            if i == 0 or word != words[i-1]:
                deduplicated.append(word)

        return ' '.join(deduplicated)

    def clean_punctuation_and_spacing(self, text: str) -> str:
        """
        Clean punctuation and normalize spacing.

        Args:
            text: Input text with various punctuation

        Returns:
            Cleaned text with normalized spacing
        """
        # Remove special characters but preserve alphanumeric and Turkish chars
        text = re.sub(r'[^\w\s]', ' ', text)

        # Normalize whitespace (multiple spaces to single space)
        text = re.sub(r'\s+', ' ', text)

        # Remove leading/trailing whitespace
        return text.strip()

    def normalize(self, address: str) -> str:
        """
        Apply full normalization pipeline to a single address.

        Args:
            address: Raw address string

        Returns:
            Normalized address string
        """
        if pd.isna(address) or not isinstance(address, str):
            return ""

        # Convert to lowercase
        address = address.lower()

        # Normalize Turkish characters
        address = self.normalize_turkish_chars(address)

        # Expand abbreviations
        address = self.expand_abbreviations(address)

        # Correct typos
        address = self.correct_typos(address)

        # Standardize numbers
        address = self.standardize_numbers(address)

        # Remove redundant locations
        address = self.remove_redundant_locations(address)

        # Clean punctuation and spacing
        address = self.clean_punctuation_and_spacing(address)

        return address

# ================================================================
# MAIN PREPROCESSING FUNCTIONS
# ================================================================

def preprocess_address(text: str) -> str:
    """
    Apply full preprocessing pipeline to a single address.

    Args:
        text: Raw address string

    Returns:
        Normalized address string
    """
    normalizer = TurkishAddressNormalizer()
    return normalizer.normalize(text)

def preprocess_dataframe(df: pd.DataFrame,
                        address_col: str = 'address',
                        label_col: str = 'label',
                        batch_size: int = 10000) -> pd.DataFrame:
    """
    Apply preprocessing to entire dataset with progress tracking and memory optimization.

    Args:
        df: Input dataframe
        address_col: Column name containing addresses
        label_col: Column name containing labels (optional)
        batch_size: Number of rows to process in each batch

    Returns:
        DataFrame with additional 'processed_address' column
    """
    print(f"Starting preprocessing of {len(df):,} addresses...")

    # Create a copy to avoid modifying original
    result_df = df.copy()

    # Initialize normalizer
    normalizer = TurkishAddressNormalizer()

    # Process in batches for memory efficiency
    processed_addresses = []

    for i in tqdm(range(0, len(df), batch_size), desc="Processing addresses"):
        batch = df.iloc[i:i+batch_size]

        # Process batch
        batch_addresses = batch[address_col].apply(normalizer.normalize)
        processed_addresses.extend(batch_addresses)

        # Clear memory
        if i % (batch_size * 5) == 0:
            gc.collect()

    # Add processed column
    result_df['processed_address'] = processed_addresses

    # Remove empty processed addresses
    initial_count = len(result_df)
    result_df = result_df[result_df['processed_address'].str.len() > 0].reset_index(drop=True)
    final_count = len(result_df)

    print(f"Preprocessing completed. Removed {initial_count - final_count:,} empty addresses.")
    print(f"Final dataset size: {final_count:,} addresses")

    return result_df

def analyze_preprocessing_impact(df_original: pd.DataFrame,
                               df_processed: pd.DataFrame,
                               address_col: str = 'address') -> Dict:
    """
    Generate comprehensive statistics on preprocessing effectiveness.

    Args:
        df_original: Original dataframe
        df_processed: Processed dataframe
        address_col: Column name containing addresses

    Returns:
        Dictionary with preprocessing metrics
    """
    print("Analyzing preprocessing impact...")

    # Calculate basic statistics
    original_lengths = df_original[address_col].str.len()
    processed_lengths = df_processed['processed_address'].str.len()

    original_word_counts = df_original[address_col].str.split().str.len()
    processed_word_counts = df_processed['processed_address'].str.split().str.len()

    # Unique address analysis
    original_unique = df_original[address_col].nunique()
    processed_unique = df_processed['processed_address'].nunique()

    # Deduplication statistics
    reduction_rate = (original_unique - processed_unique) / original_unique

    # Length statistics
    avg_length_change = processed_lengths.mean() - original_lengths.mean()
    avg_word_change = processed_word_counts.mean() - original_word_counts.mean()

    # Create analysis results
    analysis = {
        'total_addresses': len(df_original),
        'processed_addresses': len(df_processed),
        'original_unique_addresses': original_unique,
        'processed_unique_addresses': processed_unique,
        'reduction_rate': reduction_rate,
        'avg_original_length': original_lengths.mean(),
        'avg_processed_length': processed_lengths.mean(),
        'avg_length_change': avg_length_change,
        'avg_original_words': original_word_counts.mean(),
        'avg_processed_words': processed_word_counts.mean(),
        'avg_word_change': avg_word_change,
        'length_reduction_percent': (avg_length_change / original_lengths.mean()) * 100,
        'word_reduction_percent': (avg_word_change / original_word_counts.mean()) * 100
    }

    return analysis

def display_transformation_examples(df_original: pd.DataFrame,
                                  df_processed: pd.DataFrame,
                                  address_col: str = 'address',
                                  n_examples: int = 10) -> None:
    """
    Display random before/after examples of address transformations.

    Args:
        df_original: Original dataframe
        df_processed: Processed dataframe
        address_col: Column name containing addresses
        n_examples: Number of examples to display
    """
    print(f"\n{'='*80}")
    print(f"RANDOM TRANSFORMATION EXAMPLES (showing {n_examples} examples)")
    print(f"{'='*80}")

    # Get random indices
    random_indices = np.random.choice(len(df_processed),
                                    size=min(n_examples, len(df_processed)),
                                    replace=False)

    for i, idx in enumerate(random_indices, 1):
        original = df_original.iloc[idx][address_col]
        processed = df_processed.iloc[idx]['processed_address']

        print(f"\nExample {i}:")
        print(f"  Original:  {original}")
        print(f"  Processed: {processed}")
        print(f"  Length:    {len(original)} → {len(processed)} chars")
        print(f"  Words:     {len(original.split())} → {len(processed.split())} words")

# ================================================================
# UTILITY FUNCTIONS
# ================================================================

def get_memory_usage() -> float:
    """Get current memory usage in MB."""
    process = psutil.Process()
    return process.memory_info().rss / 1024 / 1024

def save_processed_data(df: pd.DataFrame, filename: str = 'preprocessed_addresses.csv') -> None:
    """
    Save processed data to CSV file.

    Args:
        df: Processed dataframe
        filename: Output filename
    """
    print(f"Saving processed data to {filename}...")
    df.to_csv(filename, index=False, encoding='utf-8')
    print(f"Data saved successfully! File size: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

# ================================================================
# MAIN EXECUTION
# ================================================================

def main():
    """Main execution function with example usage."""

    print("Turkish Address Normalization Pipeline")
    print("=" * 50)

    # Example usage with sample data
    print("\nCreating sample data for demonstration...")

    # Create sample Turkish addresses
    sample_addresses = [
        "Narlıdere İzmir Narlıdere Narlıdere",
        "Kadıköy Mah. Cadde No:5 K:2 D:4",
        "Beşiktaş Mh. Cd. Apt No5/3",
        "Üsküdar Mahallesi Sokak No:12 Kat 3",
        "Şişli Mah. Bulvar No:25 Blok A",
        "Fatih Mah. Caddesi No:8 Daire 5",
        "Maltepe Mah. Sokak No:15 Kat 2",
        "Pendik Mah. Cadde No:30 Blok B",
        "Tuzla Mah. Bulvar No:42 Daire 8",
        "Kartal Mah. Sokak No:18 Kat 4"
    ]

    sample_labels = [f"label_{i}" for i in range(len(sample_addresses))]

    # Create sample dataframe
    sample_df = pd.DataFrame({
        'id': range(len(sample_addresses)),
        'address': sample_addresses,
        'label': sample_labels
    })

    print(f"Sample data created: {len(sample_df)} addresses")

    # Start preprocessing
    start_time = time.time()
    start_memory = get_memory_usage()

    print(f"\nStarting preprocessing at {start_time:.2f}s, Memory: {start_memory:.2f} MB")

    # Preprocess the data
    processed_df = preprocess_dataframe(sample_df, address_col='address', label_col='label')

    # Calculate processing time and memory
    end_time = time.time()
    end_memory = get_memory_usage()
    processing_time = end_time - start_time
    memory_change = end_memory - start_memory

    print(f"\nPreprocessing completed in {processing_time:.2f} seconds")
    print(f"Memory usage: {start_memory:.2f} → {end_memory:.2f} MB (change: {memory_change:+.2f} MB)")

    # Analyze impact
    analysis = analyze_preprocessing_impact(sample_df, processed_df, 'address')

    print(f"\n{'='*60}")
    print("PREPROCESSING ANALYSIS RESULTS")
    print(f"{'='*60}")

    for key, value in analysis.items():
        if isinstance(value, float):
            if 'rate' in key or 'percent' in key:
                print(f"{key.replace('_', ' ').title()}: {value:.2%}")
            else:
                print(f"{key.replace('_', ' ').title()}: {value:.2f}")
        else:
            print(f"{key.replace('_', ' ').title()}: {value:,}")

    # Display examples
    display_transformation_examples(sample_df, processed_df, 'address', n_examples=5)

    # Save results
    save_processed_data(processed_df, 'sample_preprocessed_addresses.csv')

    print(f"\n{'='*60}")
    print("PIPELINE EXECUTION COMPLETED SUCCESSFULLY!")
    print(f"{'='*60}")

    return processed_df, analysis

if __name__ == "__main__":
    # Prefer Kaggle-style orchestration if data present; otherwise run demo main()
    try:
        # Added: Kaggle-style preprocessing + submission orchestration
        import os as _os
        def run_preprocessing_and_submission(data_dir: str = ".", batch_size: int = 10000):
            """
            Run preprocessing for train/test and build submission.csv by exact match on processed_address.

            - Reads train.csv (address,label) and test.csv (id,address) with dtype=str
            - Uses TurkishAddressNormalizer().normalize in batches (no normalization logic changes)
            - Saves train_preprocessed.csv (address,processed_address,label)
            - Saves test_preprocessed.csv (id,address,processed_address)
            - Builds processed_address -> mode(label) mapping from train and assigns to test
            - Fallback for unmatched: global most frequent label from train
            - Writes submission.csv with columns id,label preserving original order
            - Prints 10 random examples per split and coverage/time stats
            """
            start = time.time()
            train_path = _os.path.join(data_dir, "train.csv")
            test_path = _os.path.join(data_dir, "test.csv")
            ss_path = _os.path.join(data_dir, "sample_submission.csv")

            # Load CSVs
            print("Loading CSVs...")
            train_df = pd.read_csv(train_path, dtype=str, keep_default_na=False)
            test_df = pd.read_csv(test_path, dtype=str, keep_default_na=False)

            assert 'address' in train_df.columns and 'label' in train_df.columns, "train.csv must have columns address,label"
            assert 'id' in test_df.columns and 'address' in test_df.columns, "test.csv must have columns id,address"

            # Initialize normalizer
            normalizer = TurkishAddressNormalizer()

            def _normalize_series_in_batches(series: pd.Series, batch_size: int) -> List[str]:
                out: List[str] = []
                for i in tqdm(range(0, len(series), batch_size), desc="Normalizing", unit="rows"):
                    batch = series.iloc[i:i+batch_size]
                    out.extend(batch.apply(normalizer.normalize))
                    if (i // batch_size) % 5 == 0:
                        gc.collect()
                return out

            # Process train
            print(f"Processing train: {len(train_df):,} rows (batch_size={batch_size})")
            train_proc = train_df.copy()
            train_proc['processed_address'] = _normalize_series_in_batches(train_proc['address'], batch_size)
            # Save train preprocessed
            train_out_cols = ['address', 'processed_address', 'label']
            train_proc[train_out_cols].to_csv(_os.path.join(data_dir, 'train_preprocessed.csv'), index=False, encoding='utf-8')

            # Process test (do not drop any rows; preserve order)
            print(f"Processing test: {len(test_df):,} rows (batch_size={batch_size})")
            test_proc = test_df.copy()
            test_proc['processed_address'] = _normalize_series_in_batches(test_proc['address'], batch_size)
            # Save test preprocessed
            test_out_cols = ['id', 'address', 'processed_address']
            test_proc[test_out_cols].to_csv(_os.path.join(data_dir, 'test_preprocessed.csv'), index=False, encoding='utf-8')

            # Print examples
            def _print_examples(df: pd.DataFrame, addr_col: str, proc_col: str, name: str):
                print(f"\nExamples ({name}) - 10 random:")
                sample = df.sample(n=min(10, len(df)), random_state=42)
                for _, r in sample.iterrows():
                    print(f"- BEFORE: {r[addr_col]}")
                    print(f"  AFTER : {r[proc_col]}")
            _print_examples(train_proc, 'address', 'processed_address', 'train')
            _print_examples(test_proc, 'address', 'processed_address', 'test')

            # Build mapping: processed_address -> mode(label)
            print("\nBuilding label mapping (processed_address -> mode(label))...")
            # global most frequent label
            global_mode_label = train_proc['label'].mode(dropna=False)[0]
            # mode per processed address
            vc = train_proc.groupby('processed_address')['label'].agg(lambda x: x.value_counts(dropna=False).idxmax()).reset_index()
            vc.columns = ['processed_address', 'label_mode']

            # Join to test by exact processed_address
            test_labeled = test_proc.merge(vc, on='processed_address', how='left')
            matched = test_labeled['label_mode'].notna().sum()
            coverage = matched / len(test_labeled) if len(test_labeled) else 0.0
            # Fallback fill
            test_labeled['label'] = test_labeled['label_mode'].fillna(global_mode_label)
            submission = test_labeled[['id', 'label']].copy()

            # Validate against sample_submission if present
            if _os.path.exists(ss_path):
                ss = pd.read_csv(ss_path, dtype=str, keep_default_na=False)
                assert list(ss.columns) == ['id', 'label'], "sample_submission.csv must have columns id,label"
                assert len(ss) == len(submission), "submission row count must match sample_submission"

            # Save submission
            sub_path = _os.path.join(data_dir, 'submission.csv')
            submission.to_csv(sub_path, index=False, encoding='utf-8')

            # Reporting
            elapsed = time.time() - start
            print("\nReporting:")
            print(f"- Train rows processed: {len(train_df):,}")
            print(f"- Test rows processed : {len(test_df):,}")
            print(f"- Train unique before : {train_df['address'].nunique():,}")
            print(f"- Train unique after  : {train_proc['processed_address'].nunique():,}")
            print(f"- Submission coverage : {coverage*100:.2f}% matched, {(1-coverage)*100:.2f}% fallback")
            print(f"- Elapsed time        : {elapsed:.2f}s ({(len(train_df)+len(test_df))/max(elapsed,1e-6):.0f} rows/sec)")

            return train_proc, test_proc, submission

        data_present = _os.path.exists("train.csv") and _os.path.exists("test.csv")
        if data_present:
            print("Detected train.csv and test.csv. Running preprocessing and submission...")
            _ = run_preprocessing_and_submission(data_dir=".", batch_size=10000)
        else:
            # Fallback to original demo
            processed_data, analysis_results = main()
            print("\nPipeline completed successfully!")
    except Exception as e:
        print(f"\nError during execution: {str(e)}")
        import traceback
        traceback.print_exc()

Detected train.csv and test.csv. Running preprocessing and submission...
Loading CSVs...
Processing train: 848,237 rows (batch_size=10000)


Normalizing: 100%|██████████| 85/85 [00:32<00:00,  2.60rows/s]


Processing test: 217,241 rows (batch_size=10000)


Normalizing: 100%|██████████| 22/22 [00:08<00:00,  2.62rows/s]



Examples (train) - 10 random:
- BEFORE: Mustafa Kemal mah.acı badem yolu no:54 kat 1
  AFTER : mustafa kemal mah aci badem yolu numara 54 kat 1
- BEFORE: Çamlıçay mah. 5208 sokak no:17
  AFTER : camlicay mahalle 5208 sokak numara 17
- BEFORE: cüneytbey mahallesi kuva i milliye cad .no 1-7c Menderes izmir
  AFTER : cuneytbey mahallesi kuva i milliye caddesi numara 1 daire 7c menderes izmir
- BEFORE: KASIMPAŞA MH 250. SOKAK NO:45G
  AFTER : kasimpasa mahalle 250 sokak numara 45g
- BEFORE: Yenimahalle 4741 sokak no 9 daire 6 Yunusemre/Manisa
  AFTER : yenimahalle 4741 sokak numara 9 daire 6 yunusemre manisa
- BEFORE: Arnes jetseal ozdemir reduktor arkasi 610 sokak No 49
  AFTER : arnes jetseal ozdemir reduktor arkasi 610 sokak numara 49
- BEFORE: efeler mah. evliya çelebi caddesi no:1 Alparslan ÜNSAL muayenehanesi
  AFTER : efeler mahalle evliya celebi caddesi numara 1 alparslan unsal muayenehanesi
- BEFORE: 1127 Sokak No:5 Livai Aydın Evleri 5 B Blok K:2 D:5 
  AFTER : 1127 sokak numara

In [7]:
# ================================================================
# SUBMISSION ÜRETİCİ (mevcut normalleştirici mantığına dokunmadan)
# ================================================================
def make_submission(train_csv: str = "train.csv",
                    test_csv: str = "test.csv",
                    out_csv: str = "submission.csv",
                    batch_size: int = 10000):
    """
    Build submission.csv (id,label) using exact match on processed_address.
    - Uses the existing TurkishAddressNormalizer logic (no changes).
    - For ties/multiple labels per address, picks the most frequent (mode).
    - For unmatched test rows, fills with the global most frequent train label.
    """
    import os
    if not (os.path.exists(train_csv) and os.path.exists(test_csv)):
        raise FileNotFoundError("train.csv veya test.csv bulunamadı.")

    # 1) Veri yükle
    train_df = pd.read_csv(train_csv, dtype={"address": str})
    test_df  = pd.read_csv(test_csv,  dtype={"address": str})
    if "label" not in train_df.columns or "address" not in train_df.columns:
        raise ValueError("train.csv 'address' ve 'label' kolonlarını içermeli.")
    if "address" not in test_df.columns:
        raise ValueError("test.csv 'address' kolonu içermeli.")
    if "id" not in test_df.columns:
        test_df["id"] = np.arange(len(test_df))

    # 2) Normalizasyon (mevcut sınıfı kullanarak, mantığı değiştirmeden)
    norm = TurkishAddressNormalizer()

    # train processed_address
    train_proc_list = []
    for i in range(0, len(train_df), batch_size):
        part = train_df.iloc[i:i+batch_size].copy()
        part["processed_address"] = part["address"].apply(norm.normalize)
        train_proc_list.append(part[["processed_address", "label"]])
    train_proc = pd.concat(train_proc_list, axis=0, ignore_index=True)

    # test processed_address (satır düşürmeden, bire bir aynı uzunlukta)
    test_proc_pa = []
    for i in range(0, len(test_df), batch_size):
        part = test_df.iloc[i:i+batch_size].copy()
        part["processed_address"] = part["address"].apply(norm.normalize)
        test_proc_pa.append(part[["id", "processed_address"]])
    test_proc_pa = pd.concat(test_proc_pa, axis=0, ignore_index=True)

    # 3) processed_address -> mode(label) haritası
    mode_per_addr = (train_proc
                     .groupby("processed_address")["label"]
                     .agg(lambda s: s.value_counts().index[0]))

    # 4) Test'e label yaz
    test_labeled = test_df[["id"]].merge(
        test_proc_pa.merge(mode_per_addr.rename("label"),
                           how="left",
                           left_on="processed_address",
                           right_index=True)[["id", "label"]],
        on="id", how="left"
    )

    # 5) Boş kalanlar için fallback = train’de en sık görülen label
    fallback_label = train_df["label"].value_counts().index[0]
    test_labeled["label"] = test_labeled["label"].fillna(fallback_label)

    # 6) Çıktı: submission.csv (id,label)
    submission = test_labeled[["id", "label"]].copy()
    submission.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"Saved: {out_csv}  (rows={len(submission):,})")


In [15]:
# -*- coding: utf-8 -*-
"""
Exact-Match Labeling (No Normalization)
=======================================

Objective:
- Learn address -> label mapping from train.csv using exact string equality.
- Apply to test.csv by exact join on 'address'.
- Produce submission.csv in (id,label) format.
- No normalization, no lowercasing, no punctuation edits. Pure exact match.

Files written:
- address_label_map.csv       (address,label,count)
- unmatched_test_preview.csv  (first 100 unmatched rows for quick inspection)
- submission.csv              (id,label)
"""

import os
import time
import gc
from typing import Tuple

import numpy as np
import pandas as pd


def load_train_test(
    train_path: str = "train.csv",
    test_path: str = "test.csv"
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load train/test with safe dtypes and minimal assumptions."""
    if not os.path.exists(train_path):
        raise FileNotFoundError(f"Missing {train_path}")
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"Missing {test_path}")

    train = pd.read_csv(train_path, dtype=str, keep_default_na=False)
    test  = pd.read_csv(test_path,  dtype=str, keep_default_na=False)

    if "address" not in train.columns or "label" not in train.columns:
        raise ValueError("train.csv must contain 'address' and 'label' columns.")
    if "address" not in test.columns:
        raise ValueError("test.csv must contain 'address' column.")
    if "id" not in test.columns:
        # Create sequential ids if missing (common in some setups)
        test = test.copy()
        test["id"] = np.arange(len(test), dtype=int)

    return train, test


def build_exact_map(train: pd.DataFrame) -> pd.DataFrame:
    """
    Build address -> mode(label) mapping using exact string equality.
    If an address appears with multiple labels, pick the most frequent (mode).
    """
    # Count occurrences per (address, label)
    counts = (
        train.groupby(["address", "label"], as_index=False)
             .size()
             .rename(columns={"size": "count"})
    )

    # For each address, select the label with max count
    idx = counts.groupby("address")["count"].idxmax()
    mapping = counts.loc[idx, ["address", "label", "count"]].reset_index(drop=True)
    return mapping


def apply_map_and_build_submission(
    test: pd.DataFrame,
    mapping: pd.DataFrame,
    fallback_label: str,
    submission_path: str = "submission.csv",
    unmatched_preview_path: str = "unmatched_test_preview.csv",
) -> pd.DataFrame:
    """
    Left-join test on address -> label map, fill NAs with fallback label,
    and write submission.csv (id,label). Also save a small preview of unmatched rows.
    """
    # Join (exact match)
    merged = test.merge(mapping[["address", "label"]], on="address", how="left", suffixes=("", "_from_map"))

    # Report coverage before fill
    matched = merged["label"].notna().sum()
    total = len(merged)
    unmatched = total - matched

    # Save a quick preview of unmatched rows (first 100) for debugging
    if unmatched > 0:
        preview = merged[merged["label"].isna()][["id", "address"]].head(100)
        preview.to_csv(unmatched_preview_path, index=False, encoding="utf-8")

    # Fallback: fill missing labels with global most frequent label from train
    merged["label"] = merged["label"].fillna(fallback_label)

    # Build submission
    submission = merged[["id", "label"]].copy()

    # Write
    submission.to_csv(submission_path, index=False, encoding="utf-8")

    # Print stats
    print("\n=== Submission Stats ===")
    print(f"Test rows              : {total:,}")
    print(f"Matched by exact address: {matched:,} ({matched/total:.2%})")
    print(f"Filled by fallback      : {unmatched:,} ({unmatched/total:.2%})")
    print(f"Saved: {submission_path}")
    if unmatched > 0:
        print(f"Unmatched preview saved: {unmatched_preview_path}")

    return submission


def main(
    train_path: str = "train.csv",
    test_path: str = "test.csv",
    map_out: str = "address_label_map.csv",
    submission_path: str = "submission.csv",
    unmatched_preview_path: str = "unmatched_test_preview.csv",
):
    t0 = time.time()
    print("Exact-Match Labeling (no normalization) starting...")

    # 1) Load data
    train, test = load_train_test(train_path, test_path)
    print(f"Loaded train: {len(train):,} rows, test: {len(test):,} rows")

    # 2) Global most frequent label (fallback)
    fallback_label = train["label"].value_counts().index[0]
    print(f"Global most frequent label (fallback): {fallback_label}")

    # 3) Build exact address -> label mapping
    mapping = build_exact_map(train)
    print(f"Unique train addresses: {train['address'].nunique():,}")
    print(f"Address->label map size: {len(mapping):,}")
    # Save mapping
    mapping.to_csv(map_out, index=False, encoding="utf-8")
    print(f"Saved: {map_out}")

    gc.collect()

    # 4) Apply map to test and produce submission.csv
    submission = apply_map_and_build_submission(
        test=test,
        mapping=mapping,
        fallback_label=fallback_label,
        submission_path=submission_path,
        unmatched_preview_path=unmatched_preview_path,
    )

    # 5) Optional: basic sanity checks vs sample_submission.csv if present
    if os.path.exists("sample_submission.csv"):
        sample = pd.read_csv("sample_submission.csv", dtype=str, keep_default_na=False)
        if list(sample.columns) == ["id", "label"] and len(sample) == len(submission):
            print("sample_submission.csv shape matches generated submission ✅")
        else:
            print("Warning: sample_submission.csv shape/columns differ from generated submission.")

    print(f"\nDone in {time.time() - t0:.2f}s. submission.csv ready.")


if __name__ == "__main__":
    main()


Exact-Match Labeling (no normalization) starting...
Loaded train: 848,237 rows, test: 217,241 rows
Global most frequent label (fallback): 5414
Unique train addresses: 847,995
Address->label map size: 847,995
Saved: address_label_map.csv

=== Submission Stats ===
Test rows              : 217,241
Matched by exact address: 118 (0.05%)
Filled by fallback      : 217,123 (99.95%)
Saved: submission.csv
Unmatched preview saved: unmatched_test_preview.csv

Done in 9.61s. submission.csv ready.


In [16]:
# -*- coding: utf-8 -*-
"""
Tiered Exact-Match Submission (No Text Normalization on Stored Data)
- We DO NOT modify or save normalized text anywhere.
- We only create transient "keys" for comparison to improve exact-match coverage.

Outputs:
- submission.csv (id,label)
- unmatched_test_preview.csv (first 100 unmatched after all tiers)
- coverage report per tier
"""

import os
import re
import time
import numpy as np
import pandas as pd


# -----------------------------
# Key builders (comparison-only)
# -----------------------------
def key_raw(s: pd.Series) -> pd.Series:
    # pure exact string
    return s

def key_lower_ws(s: pd.Series) -> pd.Series:
    # case/whitespace-insensitive: strip, collapse spaces, lower, strip NBSP
    if s.dtype != "object":
        s = s.astype(str)
    s = (s.str.replace("\u00A0", " ", regex=False)  # NBSP -> space
           .str.strip()
           .str.replace(r"\s+", " ", regex=True)
           .str.lower())
    return s

def key_lower_ws_fmtlite(s: pd.Series) -> pd.Series:
    # same as key_lower_ws + drop only lightweight formatting punctuation
    # keep digits, letters (including Turkish), dot, dash, slash, underscore
    if s.dtype != "object":
        s = s.astype(str)
    s = key_lower_ws(s)
    # remove only these: quotes, commas, semicolons, colon, parens, brackets, braces
    s = s.str.replace(r"[\"',;:\(\)\[\]\{\}]", " ", regex=True)
    s = s.str.replace(r"\s+", " ", regex=True).str.strip()
    return s


# -----------------------------
# Pipeline
# -----------------------------
def load_train_test(train_path="train.csv", test_path="test.csv"):
    if not os.path.exists(train_path):
        raise FileNotFoundError(f"Missing {train_path}")
    if not os.path.exists(test_path):
        raise FileNotFoundError(f"Missing {test_path}")

    train = pd.read_csv(train_path, dtype=str, keep_default_na=False)
    test  = pd.read_csv(test_path,  dtype=str, keep_default_na=False)

    if "address" not in train.columns or "label" not in train.columns:
        raise ValueError("train.csv must contain 'address' and 'label'.")
    if "address" not in test.columns:
        raise ValueError("test.csv must contain 'address'.")
    if "id" not in test.columns:
        test = test.copy()
        test["id"] = np.arange(len(test), dtype=int)

    return train, test


def build_map(train: pd.DataFrame, keyname: str) -> pd.Series:
    """
    Build key -> mode(label) map for a given key column in train.
    Returns a Series indexed by key with label as values.
    """
    grp = (train.groupby([keyname, "label"], dropna=False)
                 .size()
                 .rename("cnt")
                 .reset_index())
    idx = grp.groupby(keyname)["cnt"].idxmax()
    mode_map = grp.loc[idx].set_index(keyname)["label"]
    return mode_map


def tiered_match_submission(
    train_csv="train.csv",
    test_csv="test.csv",
    submission_csv="submission.csv",
    unmatched_preview_csv="unmatched_test_preview.csv",
    use_tier2=True,
    use_tier3=True,
):
    """
    3-tier matching without normalizing stored text:
      Tier1: raw exact
      Tier2: case/whitespace-insensitive
      Tier3: Tier2 + lightweight formatting punctuation ignored
    """

    t0 = time.time()
    print("Loading data…")
    train, test = load_train_test(train_csv, test_csv)
    n_test = len(test)

    # Global fallback label
    fallback_label = train["label"].value_counts().index[0]
    print(f"Fallback (global most frequent label): {fallback_label}")

    # -----------------------------
    # Create comparison keys
    # -----------------------------
    print("Building comparison keys…")
    train = train.copy()
    test  = test.copy()

    # Tier 1 keys
    train["k_raw"] = key_raw(train["address"])
    test["k_raw"]  = key_raw(test["address"])

    # Tier 2 keys
    if use_tier2:
        train["k_lws"] = key_lower_ws(train["address"])
        test["k_lws"]  = key_lower_ws(test["address"])

    # Tier 3 keys
    if use_tier3:
        train["k_lwsp"] = key_lower_ws_fmtlite(train["address"])
        test["k_lwsp"]  = key_lower_ws_fmtlite(test["address"])

    # -----------------------------
    # Build maps per tier
    # -----------------------------
    print("Building maps…")
    map_raw  = build_map(train, "k_raw")
    map_lws  = build_map(train, "k_lws")  if use_tier2 else None
    map_lwsp = build_map(train, "k_lwsp") if use_tier3 else None

    # -----------------------------
    # Tiered labeling
    # -----------------------------
    res = pd.DataFrame({"id": test["id"].values, "label": pd.Series([pd.NA]*n_test)})

    # Tier 1
    lab1 = test["k_raw"].map(map_raw)
    mask1 = lab1.notna()
    res.loc[mask1, "label"] = lab1[mask1].values

    # Tier 2
    if use_tier2:
        need = res["label"].isna()
        if need.any():
            lab2 = test.loc[need, "k_lws"].map(map_lws)
            m2 = lab2.notna()
            res.loc[need[m2].index, "label"] = lab2[m2].values

    # Tier 3
    if use_tier3:
        need = res["label"].isna()
        if need.any():
            lab3 = test.loc[need, "k_lwsp"].map(map_lwsp)
            m3 = lab3.notna()
            res.loc[need[m3].index, "label"] = lab3[m3].values

    # Coverage stats BEFORE fallback
    matched = res["label"].notna().sum()
    unmatched = n_test - matched

    # Fallback for remaining
    res["label"] = res["label"].fillna(fallback_label)

    # Write submission
    subm = res[["id", "label"]].copy()
    subm.to_csv(submission_csv, index=False, encoding="utf-8")

    # Save unmatched preview
    if unmatched > 0:
        prev = test.loc[res["label"].isna() == False]  # This would be none after fill; so keep before fill:
        # recompute unmatched indices before fill:
        # (We saved unmatched count above; get those rows)
        need = (pd.isna(lab1) if 'lab1' in locals() else pd.Series([True]*n_test))
        if use_tier2:
            need = need & test["k_lws"].map(map_lws).isna()
        if use_tier3:
            need = need & test["k_lwsp"].map(map_lwsp).isna()
        preview = test.loc[need, ["id", "address"]].head(100)
        preview.to_csv(unmatched_preview_csv, index=False, encoding="utf-8")

    # Reporting
    tier1_cov = mask1.sum()
    tier2_cov = 0
    tier3_cov = 0
    if use_tier2:
        # coverage exclusively gained at tier 2
        tier2_cov = (res["label"].notna().sum() - tier1_cov) if not use_tier3 else None
    if use_tier3:
        # recompute tier-by-tier coverage explicitly
        # For clear numbers:
        cov1 = mask1.sum()
        need_after1 = ~mask1
        cov2 = 0
        if use_tier2:
            cov2 = test.loc[need_after1, "k_lws"].map(map_lws).notna().sum()
        need_after2 = need_after1 & ~(test.loc[need_after1, "k_lws"].map(map_lws).notna() if use_tier2 else False)
        cov3 = test.loc[need_after2, "k_lwsp"].map(map_lwsp).notna().sum() if use_tier3 else 0
        tier1_cov, tier2_cov, tier3_cov = cov1, cov2, cov3

    print("\n=== Coverage Report (before fallback) ===")
    print(f"Test rows                         : {n_test:,}")
    print(f"Tier1 (raw exact) matched         : {tier1_cov:,} ({tier1_cov/n_test:.2%})")
    if use_tier2:
        print(f"Tier2 (+lower/whitespace) matched : {tier2_cov:,} ({tier2_cov/n_test:.2%})")
    if use_tier3:
        print(f"Tier3 (+fmt-lite) matched         : {tier3_cov:,} ({tier3_cov/n_test:.2%})")
    print(f"Total matched before fallback     : {matched:,} ({matched/n_test:.2%})")
    print(f"Filled by fallback                : {unmatched:,} ({unmatched/n_test:.2%})")
    print(f"Saved submission: {submission_csv}")
    if unmatched > 0:
        print(f"Unmatched sample written to: {unmatched_preview_csv}")

    # Optional sanity vs sample_submission
    if os.path.exists("sample_submission.csv"):
        sample = pd.read_csv("sample_submission.csv", dtype=str, keep_default_na=False)
        if list(sample.columns) == ["id", "label"] and len(sample) == len(subm):
            print("sample_submission.csv shape matches ✅")
        else:
            print("Note: sample_submission.csv shape/columns differ.")

    print(f"Done in {time.time() - t0:.2f}s.")


if __name__ == "__main__":
    # Default: enable tier2 and tier3 matching for better coverage.
    # If you want STRICT 100% raw exact (no case/space tolerance), set both to False.
    tiered_match_submission(
        train_csv="train.csv",
        test_csv="test.csv",
        submission_csv="submission.csv",
        unmatched_preview_csv="unmatched_test_preview.csv",
        use_tier2=True,
        use_tier3=True,
    )


Loading data…
Fallback (global most frequent label): 5414
Building comparison keys…
Building maps…


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).