In [1]:
# Teknofest x Hepsiburada Address Matching Pipeline
# Complete ML solution for Turkish address deduplication and matching

# ================================================================
# STEP 0: SETUP AND INSTALLATIONS
# ================================================================
!pip install sentence-transformers faiss-cpu lightgbm scikit-learn pandas numpy
!pip install unidecode regex tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [2]:
import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict, Counter
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

from sentence_transformers import SentenceTransformer
import faiss
from unidecode import unidecode
from tqdm import tqdm
import gc

# Set random seed for reproducibility
np.random.seed(42)


In [3]:
# ================================================================
# STEP 1: DATA PREPROCESSING & NORMALIZATION
# ================================================================

class TurkishAddressNormalizer:
    """Rule-based normalizer for free-text Turkish addresses.

    ``normalize`` folds Turkish letters to ASCII, lowercases, expands known
    abbreviations, rewrites a few number notations into a canonical wording
    and finally strips punctuation. ``extract_components`` is an independent
    regex-based helper that is not used by ``normalize``.
    """

    def __init__(self):
        # Abbreviation -> expansion table, looked up per whitespace-separated
        # token by expand_abbreviations(). Keys are lowercase and ASCII-folded.
        self.abbreviations = {
            # Neighborhood/District
            'mh': 'mahallesi', 'mah': 'mahallesi', 'mahalle': 'mahallesi',

            # Street types
            'cd': 'caddesi', 'cad': 'caddesi', 'cadde': 'caddesi',
            'sk': 'sokagi', 'sok': 'sokagi', 'sokak': 'sokagi',
            'blv': 'bulvari', 'bulv': 'bulvari', 'bulvar': 'bulvari',
            'osb': 'organize sanayi bolge', 'km': 'kilometre',

            # Building types
            'apt': 'apartmani', 'ap': 'apartmani', 'apartman': 'apartmani',
            'sit': 'sitesi', 'site': 'sitesi',
            'blk': 'blok', 'blok': 'blok',
            'plz': 'plaza', 'plaza': 'plaza',
            'avm': 'alisveris merkezi',

            # Address components
            'no': 'numara', 'nu': 'numara',
            'kt': 'kat', 'kat': 'kat',
            'dr': 'daire', 'daire': 'daire', 'da': 'daire',
            'pst': 'posta kodu',

            # Directions
            'kz': 'kuzey', 'gy': 'guney', 'dt': 'dogu', 'bt': 'bati',

            # Common words
            'yrm': 'yurdu', 'otel': 'oteli', 'hst': 'hastanesi',
            'unv': 'universitesi', 'lise': 'lisesi', 'okl': 'okulu'
        }

        # Regex patterns used by extract_components().
        # NOTE(review): every keyword prefix/suffix below is optional, so the
        # 'floor' and 'apartment' patterns match the first run of digits in
        # the text and the 'block' pattern can match a single leading letter
        # or the empty string. Treat the extracted values as rough hints.
        self.patterns = {
            'number': r'(?:no[:\s]*|numara[:\s]*|n[:\s]*)?(\d+)(?:[/\-](\d+))?',
            'floor': r'(?:kat[:\s]*|kt[:\s]*|k[:\s]*)?(\d+)(?:\s*\.?\s*kat)?',
            'apartment': r'(?:daire[:\s]*|dr[:\s]*|d[:\s]*)?(\d+)(?:\s*\.?\s*daire)?',
            'block': r'(?:blok[:\s]*|blk[:\s]*|b[:\s]*)?([a-zA-Z]?\d*)(?:\s*\.?\s*blok)?',
            'postal_code': r'\b(\d{5})\b'
        }

    def normalize_turkish_chars(self, text):
        """Replace the Turkish-specific letters with their ASCII look-alikes.

        Only the twelve letters in the table are touched; case is preserved
        (e.g. 'İ' -> 'I', 'ı' -> 'i').
        """
        turkish_chars = {
            'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ö': 'o', 'ş': 's', 'ü': 'u',
            'Ç': 'C', 'Ğ': 'G', 'İ': 'I', 'Ö': 'O', 'Ş': 'S', 'Ü': 'U'
        }
        for tr_char, en_char in turkish_chars.items():
            text = text.replace(tr_char, en_char)
        return text

    def expand_abbreviations(self, text):
        """Expand known abbreviations token by token.

        A token is looked up after stripping leading/trailing ``.,;:()-`` and
        lowercasing; tokens that are not in the table are kept verbatim
        (including their punctuation).
        """
        words = text.split()
        expanded_words = []

        for word in words:
            # Remove surrounding punctuation for matching ("mah." -> "mah")
            clean_word = word.strip('.,;:()-').lower()
            if clean_word in self.abbreviations:
                expanded_words.append(self.abbreviations[clean_word])
            else:
                expanded_words.append(word)

        return ' '.join(expanded_words)

    def standardize_numbers(self, text):
        """Rewrite common number notations into a canonical wording.

        * ``No:12`` / ``No=12`` / ``no 12``  -> ``numara 12``
        * ``12/3`` / ``12-3``                -> ``numara 12 daire 3``
        * ``3.kat`` / ``5. daire``           -> ``3 kat`` / ``5 daire``
        """
        # \b keeps words that merely end in "no" (e.g. "casino 12") intact.
        text = re.sub(r'\bno[:\s=]*(\d+)', r'numara \1', text, flags=re.IGNORECASE)

        # An already present "numara" in front of "12/3" is absorbed so the
        # output never contains "numara numara 12 daire 3".
        text = re.sub(
            r'(?:\bnumara\s+)?(\d+)[/\-](\d+)',
            r'numara \1 daire \2',
            text,
            flags=re.IGNORECASE,
        )

        # Handle floor patterns
        text = re.sub(r'(\d+)\.?\s*kat', r'\1 kat', text, flags=re.IGNORECASE)

        # Handle apartment patterns
        text = re.sub(r'(\d+)\.?\s*daire', r'\1 daire', text, flags=re.IGNORECASE)

        return text

    def clean_punctuation(self, text):
        """Replace punctuation with spaces and collapse whitespace."""
        # Everything that is not a word character, whitespace or a Turkish
        # letter becomes a space.
        text = re.sub(r'[^\w\sçğıöşüÇĞİÖŞÜ]', ' ', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def extract_components(self, text):
        """Extract address components with the regexes in ``self.patterns``.

        Returns a dict with a subset of the keys ``number``, ``floor``,
        ``apartment``, ``block`` and ``postal_code``; each value is the first
        capture group of the first match. A ``number`` written as ``12/3``
        sets ``number='12'`` and ``apartment='3'``.
        """
        components = {}

        for component, pattern in self.patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if not match:
                continue
            if component == 'number' and match.group(2):
                components['number'] = match.group(1)
                components['apartment'] = match.group(2)
            else:
                # setdefault: do not let the generic 'apartment' pattern
                # overwrite the apartment already taken from "12/3".
                components.setdefault(component, match.group(1))

        return components

    def normalize(self, address):
        """Apply the full normalization pipeline.

        Returns an empty string for missing or non-string input.
        """
        if pd.isna(address) or not isinstance(address, str):
            return ""

        # Fold Turkish letters to ASCII *before* lowercasing: str.lower()
        # turns 'İ' into 'i' + U+0307 (combining dot above), which the fold
        # table does not cover and which clean_punctuation() would replace
        # with a space, splitting e.g. "İstanbul" into "i stanbul".
        address = self.normalize_turkish_chars(address)

        # Convert to lowercase (plain ASCII letters at this point)
        address = address.lower()

        # Expand abbreviations
        address = self.expand_abbreviations(address)

        # Standardize numbers
        address = self.standardize_numbers(address)

        # Clean punctuation
        address = self.clean_punctuation(address)

        return address

# Module-level normalizer instance; load_and_preprocess_data() reads this
# global when it normalizes the train and test addresses.
normalizer = TurkishAddressNormalizer()

In [4]:
# ================================================================
# STEP 2: DATA LOADING AND PREPROCESSING
# ================================================================

def load_and_preprocess_data(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSV files and add a ``normalized_address`` column.

    Args:
        train_path: CSV with at least the columns ``address`` and ``label``.
        test_path: CSV with at least the column ``address``.

    Returns:
        ``(train_df, test_df)``. Training rows whose normalized address is
        empty are dropped (index reset). Test rows are never dropped, so that
        every test row still receives a prediction downstream; their
        normalized address is simply the empty string.

    Relies on the module-level ``normalizer`` instance.
    """
    print("Loading datasets...")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Unique labels in train: {train_df['label'].nunique()}")

    # Normalize addresses
    print("Normalizing addresses...")
    train_df['normalized_address'] = train_df['address'].apply(normalizer.normalize)
    test_df['normalized_address'] = test_df['address'].apply(normalizer.normalize)

    # Empty training examples carry no signal for any label -> drop them.
    train_df = train_df[train_df['normalized_address'].str.len() > 0].reset_index(drop=True)

    # Keep every test row: filtering here would silently remove ids from the
    # submission file built from test_df later on.
    n_empty_test = int((test_df['normalized_address'].str.len() == 0).sum())
    if n_empty_test:
        print(f"Warning: {n_empty_test} test addresses are empty after normalization (kept)")

    print(f"After cleaning - Train: {len(train_df)}, Test: {len(test_df)}")

    return train_df, test_df

# Load both splits and attach the 'normalized_address' column; rebinding the
# globals train_df / test_df here makes this cell safe to re-run.
train_df, test_df = load_and_preprocess_data()

Loading datasets...
Train shape: (848237, 2)
Test shape: (217241, 2)
Unique labels in train: 10390
Normalizing addresses...
After cleaning - Train: 848234, Test: 217241


In [5]:
# ================================================================
# STEP 3: BASELINE MODEL - TF-IDF CENTROID
# ================================================================

class TFIDFCentroidModel:
    """Nearest-centroid classifier on character n-gram TF-IDF vectors.

    ``fit`` averages the TF-IDF rows of every label into one centroid;
    ``predict`` assigns each query to the label whose centroid has the
    highest cosine similarity.
    """

    def __init__(self, ngram_range=(2, 11), max_features=None, min_df=2, max_df=1.0):
        """
        Args:
            ngram_range: (min_n, max_n) character n-gram sizes.
            max_features: optional cap on the vocabulary size.
            min_df: minimum document frequency of an n-gram (int = absolute
                count, float = proportion).
            max_df: maximum document frequency of an n-gram. A float is a
                proportion (1.0 = no upper limit); an int is an absolute
                document count. The previous hard-coded ``max_df=1`` was an
                int, i.e. "at most one document", which contradicts
                ``min_df=2``.
        """
        self.vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=ngram_range,
            max_features=max_features,
            lowercase=True,
            strip_accents='unicode',
            sublinear_tf=True,      # tf -> 1 + log(tf): dampens large counts
            min_df=min_df,          # drop n-grams seen in too few documents
            max_df=max_df,          # drop n-grams seen in too many documents
            norm='l2',              # L2-normalize every row
            use_idf=True,           # IDF weighting
            smooth_idf=True,        # IDF smoothing
            dtype=np.float32        # halve memory compared to float64
        )
        self.label_encoder = LabelEncoder()
        self.centroids = None   # sparse CSR matrix [n_labels, n_features] after fit()
        self.labels = None      # label_encoder.classes_ after fit()

    def fit(self, addresses, labels):
        """Fit the vectorizer and compute one mean TF-IDF vector per label.

        The centroids are kept as a SciPy CSR matrix: a dense
        ``n_labels x n_features`` array does not fit in memory for realistic
        character n-gram vocabularies.
        """
        # Local import: scipy is not imported in this notebook cell's scope.
        from scipy import sparse

        print("Fitting TF-IDF vectorizer...")

        # Encode labels as 0..n_labels-1
        encoded_labels = np.asarray(self.label_encoder.fit_transform(labels))
        self.labels = self.label_encoder.classes_

        # Vectorize addresses
        X = self.vectorizer.fit_transform(addresses)

        # Compute centroids for each label
        print("Computing centroids...")
        n_docs = X.shape[0]
        n_labels = len(self.labels)

        # averaging[l, d] = 1 / (#documents of label l) if document d has
        # label l, else 0 -> (averaging @ X)[l] is the mean row of label l.
        counts = np.bincount(encoded_labels, minlength=n_labels)
        weights = (1.0 / counts[encoded_labels]).astype(X.dtype)
        averaging = sparse.csr_matrix(
            (weights, (encoded_labels, np.arange(n_docs))),
            shape=(n_labels, n_docs),
        )
        self.centroids = (averaging @ X).tocsr()

        print(f"Model fitted with {len(self.labels)} unique labels")

    def predict(self, addresses, top_k=1, batch_size=4096):
        """Predict labels for new addresses.

        Args:
            addresses: iterable of address strings.
            top_k: number of labels to return per address.
            batch_size: rows scored at once when ``top_k == 1``; bounds the
                size of the dense similarity block held in memory.

        Returns:
            ``top_k == 1``: array of predicted labels.
            ``top_k > 1``: ``(top_labels, similarities)`` where ``top_labels``
            is a list of label arrays (best first) and ``similarities`` is the
            full dense ``[n_addresses, n_labels]`` similarity matrix.
        """
        X = self.vectorizer.transform(addresses)

        if top_k == 1:
            n_rows = X.shape[0]
            step = max(1, int(batch_size))
            best = np.empty(n_rows, dtype=np.int64)
            # Score in batches so the full similarity matrix is never built.
            for start in range(0, n_rows, step):
                stop = min(start + step, n_rows)
                block = cosine_similarity(X[start:stop], self.centroids)
                best[start:stop] = block.argmax(axis=1)
            return self.label_encoder.inverse_transform(best)

        # top-k path: the caller receives the whole similarity matrix.
        similarities = cosine_similarity(X, self.centroids)
        top_indices = np.argsort(similarities, axis=1)[:, -top_k:][:, ::-1]
        top_labels = [self.label_encoder.inverse_transform(indices) for indices in top_indices]
        return top_labels, similarities

# Train the TF-IDF centroid baseline on the normalized training addresses.
print("Training TF-IDF baseline model...")
tfidf_model = TFIDFCentroidModel()
tfidf_model.fit(train_df['normalized_address'], train_df['label'])

# Predict one label per test address (top_k=1 returns a flat array of labels).
print("Making TF-IDF predictions...")
tfidf_predictions = tfidf_model.predict(test_df['normalized_address'])

# Write the baseline submission: one (id, label) row per row of test_df.
baseline_submission = pd.DataFrame({
    'id': test_df['id'],
    'label': tfidf_predictions
})
baseline_submission.to_csv('baseline_submission.csv', index=False)
print("Baseline submission saved!")


Training TF-IDF baseline model...
Fitting TF-IDF vectorizer...
Computing centroids...
Model fitted with 10390 unique labels
Making TF-IDF predictions...
Baseline submission saved!


In [8]:
# -*- coding: utf-8 -*-
"""
address_matcher_geo_centroid.py

Gelistirilmis Adres Cozumleme (Turkiye)
- Char + Word TF-IDF (birlesik) + L2 normalize
- Label basina "centroid" (prototip) vektorleri (seyrek csr)
- GeoDatabase tabanli aday daraltma (il/ilce/posta kodu)
- Esnek parametreler (n-gram, min_df, max_features, agirliklar vs.)
- Kolay kullanim: fit(train_df), predict(list[str]), evaluate(dev_df)

Girdi veri formati:
- train_df: columns = ['address', 'label']  (label: int veya str)
- dev_df  : columns = ['id','address','label'] veya ['address','label']

Not: GeoDatabase icin 'geo_database.py' dosyasini ayni klasorde bulundurun.
"""

from __future__ import annotations

import re
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Optional, Union

from scipy.sparse import hstack, csr_matrix, issparse, vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score

# Local module: GeoDatabase (geo_database.py must be importable).
# Only a failed import is translated into the hint below; any other error
# raised while geo_database.py executes propagates unchanged instead of being
# misreported as a missing file.
try:
    from geo_database import GeoDatabase
except ImportError as e:
    raise ImportError("geo_database.py ayni klasorde olmali: from geo_database import GeoDatabase") from e


# -----------------------------
# Yardimci: metin standardizasyonu
# -----------------------------
def _normalize_whitespace(s: str) -> str:
    s = re.sub(r"[,\.;:]+", " ", s)            # yogun noktalamayi bosluk yap
    s = re.sub(r"[\/|]+", " / ", s)            # bolu isaretleri ayrik kalsin
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Ordered (regex, replacement) pairs that canon_address() applies one after
# another with re.sub() to the lowercased address. Each pair collapses the
# spelling variants of one address keyword (full word, abbreviation, optional
# trailing '.'/':') into a single space-padded token.
# NOTE(review): the character class "[ii]" in the 'mevki' pattern lists the
# same letter twice; it was presumably meant to be "[iı]" (dotted/dotless i)
# before an ASCII transliteration of the source - confirm.
# NOTE(review): the last entry rewrites the stand-alone word "il" to " il ",
# i.e. it only adds padding spaces.
_CANON_MAP = [
    (r"\bmahalles(i|i\.|i:|i,)?\b|\bmah(\.|:)?\b", " mah "),
    (r"\bcaddes(i|i\.|i:|i,)?\b|\bcad(\.|:)?\b", " cad "),
    (r"\bsoka(\u011f\u0131|gi|k|k\.|k:|k,)?\b|\bsok(\.|:)?\b", " sok "),
    (r"\bblv(t|d)?(\.|:)?\b|\bbulvar(i|\u0131)?\b", " bulvar "),
    (r"\bno(\.|:)?\b", " no "),
    (r"\bkat(\.|:)?\b", " kat "),
    (r"\bdair(e|e\.|e:)?\b|\bd\.\b|\bd:(?=\d)", " daire "),
    (r"\bapt(\.|:)?\b|\bapartman(i|\u0131)?\b", " apartman "),
    (r"\bmevk[ii](i|i\.|:)?\b", " mevki "),
    (r"\b(ilce|il\u00e7e)\b", " ilce "),
    (r"\b(il)\b", " il "),
]

def canon_address(text: str) -> str:
    """Lightly standardize an address string (no aggressive deletion).

    ``None`` becomes the empty string and any other non-string value is
    converted with ``str()``. The text is lowercased, every ``_CANON_MAP``
    rule is applied in order, ``:``/``=`` directly after ``no``/``kat``/
    ``daire`` is dropped, 5-digit numbers are padded with spaces, and
    punctuation/whitespace is cleaned up by ``_normalize_whitespace``.
    """
    if text is None:
        raw = ""
    elif isinstance(text, str):
        raw = text
    else:
        raw = str(text)

    canonical = raw.lower()
    for pattern, replacement in _CANON_MAP:
        canonical = re.sub(pattern, replacement, canonical)

    # "no: 12" / "kat=3" -> "no 12" / "kat 3"
    canonical = re.sub(r"(no|kat|daire)\s*[:=]\s*", r"\1 ", canonical)
    # Isolate 5-digit numbers as their own token
    canonical = re.sub(r"\b(\d{5})\b", r" \1 ", canonical)

    return _normalize_whitespace(canonical)


# -----------------------------
# Ana Model
# -----------------------------
class AddressMatcherGeoCentroid:
    """Nearest-centroid address classifier with optional geographic filtering.

    Addresses are canonicalized with ``canon_address``, embedded as weighted
    character (and optionally word) TF-IDF vectors, L2-normalized and scored
    by dot product against one L2-normalized centroid per training label.
    With ``use_geo_filter`` enabled, only labels whose training addresses
    shared the province/district that ``GeoDatabase`` finds in the raw query
    text are scored; if nothing is found, all labels are scored.
    """

    def __init__(
        self,
        # Char TF-IDF
        char_ngram: Tuple[int,int] = (3,8),
        char_max_features: Optional[int] = None,
        char_min_df: int = 1,
        char_max_df: float = 1.0,
        char_sublinear_tf: bool = True,
        # Word TF-IDF
        use_word: bool = True,
        word_ngram: Tuple[int,int] = (1,2),
        word_max_features: Optional[int] = 100_000,
        word_min_df: int = 1,
        word_max_df: float = 1.0,
        word_sublinear_tf: bool = True,
        # Weights applied to the two feature blocks before stacking
        char_weight: float = 1.0,
        word_weight: float = 0.7,
        # Geo restriction
        use_geo_filter: bool = True,
        geo_intersection_first: bool = True,   # province AND district found: try the intersection first, else the union
        # Stored only; not read by any method of this class
        fallback_top_labels: Optional[int] = None,
        # Stored only; not read by any method of this class
        random_state: int = 42
    ):
        self.char_params = dict(
            analyzer='char',
            ngram_range=char_ngram,
            max_features=char_max_features,
            lowercase=True,
            strip_accents='unicode',
            sublinear_tf=char_sublinear_tf,
            min_df=char_min_df,
            max_df=char_max_df,
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            dtype=np.float32
        )
        self.word_params = dict(
            analyzer='word',
            ngram_range=word_ngram,
            max_features=word_max_features,
            lowercase=True,
            strip_accents='unicode',
            sublinear_tf=word_sublinear_tf,
            min_df=word_min_df,
            max_df=word_max_df,
            token_pattern=r"(?u)\b\w+\b",
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            dtype=np.float32
        )
        self.use_word = use_word
        self.char_weight = float(char_weight)
        self.word_weight = float(word_weight)
        self.use_geo_filter = use_geo_filter
        self.geo_intersection_first = geo_intersection_first
        self.fallback_top_labels = fallback_top_labels
        self.random_state = random_state

        # Filled in by fit()
        self.char_vec: Optional[TfidfVectorizer] = None
        self.word_vec: Optional[TfidfVectorizer] = None
        self.C: Optional[csr_matrix] = None    # centroid matrix [n_labels, n_feats], rows L2-normalized
        self.labels_: List[Union[int,str]] = []
        self.label_to_idx: Dict[Union[int,str], int] = {}
        self.idx_to_label: Dict[int, Union[int,str]] = {}

        # Geo lookup structures (label indices refer to positions in labels_)
        self.geo = GeoDatabase()
        self.label_provinces: List[set] = []
        self.label_districts: List[set] = []
        self.province2labels: Dict[str, set] = {}
        self.district2labels: Dict[str, set] = {}


    # ---------- vectorization ----------
    @staticmethod
    def _as_text_list(values) -> List[str]:
        """Return ``values`` as a list of str; non-string items become ""."""
        return [v if isinstance(v, str) else "" for v in values]

    def _combine_blocks(self, X_char, X_word=None):
        """Scale the char/word TF-IDF blocks by their weights and stack them column-wise (CSR)."""
        if self.char_weight != 1.0:
            X_char = X_char * self.char_weight
        if X_word is None:
            return X_char.tocsr()
        if self.word_weight != 1.0:
            X_word = X_word * self.word_weight
        return hstack([X_char, X_word]).tocsr()

    def _fit_vectorizers(self, texts: List[str]):
        """Fit the vectorizer(s) on ``texts`` and return the weighted, stacked matrix (not yet re-normalized)."""
        self.char_vec = TfidfVectorizer(**self.char_params)
        X_char = self.char_vec.fit_transform(texts)
        X_word = None
        if self.use_word:
            self.word_vec = TfidfVectorizer(**self.word_params)
            X_word = self.word_vec.fit_transform(texts)
        return self._combine_blocks(X_char, X_word)

    def _transform(self, texts: List[str]):
        """Embed ``texts`` with the fitted vectorizer(s); rows are L2-normalized."""
        X_char = self.char_vec.transform(texts)
        X_word = None
        if self.use_word and self.word_vec is not None:
            X_word = self.word_vec.transform(texts)
        X = self._combine_blocks(X_char, X_word)
        return normalize(X, norm='l2', axis=1, copy=False)


    # ---------- centroid computation (sparse) ----------
    @staticmethod
    def _label_centroids_sparse(X: csr_matrix, y: np.ndarray, labels: List[Union[int,str]]) -> csr_matrix:
        """Return the L2-normalized mean row of ``X`` for every entry of ``labels``.

        Row ``i`` of the result belongs to ``labels[i]``; a label without any
        row in ``y`` gets an all-zero centroid.
        """
        rows = []
        for lab in labels:
            mask = (y == lab)
            Xi = X[mask]
            if Xi.shape[0] == 0:
                rows.append(csr_matrix((1, X.shape[1]), dtype=X.dtype))
                continue
            summed = Xi.sum(axis=0)
            if not issparse(summed):
                summed = csr_matrix(summed)
            centroid = summed.multiply(1.0 / Xi.shape[0])
            rows.append(centroid)
        C = vstack(rows).tocsr()
        C = normalize(C, norm='l2', axis=1, copy=False)
        return C


    # ---------- geo metadata ----------
    def _build_geo_metadata(self, texts: List[str], labels: List[Union[int,str]]):
        """Record which provinces/districts occur in the training texts of each label.

        Fills ``label_provinces`` / ``label_districts`` (one set per label
        index) and the inverse maps ``province2labels`` / ``district2labels``
        (geo name -> set of label indices). Only truthy results of
        ``geo.find_province`` / ``geo.find_district`` are recorded.
        """
        n_labels = len(self.labels_)
        provinces = [set() for _ in range(n_labels)]
        districts = [set() for _ in range(n_labels)]
        lab2idx = {lab: i for i, lab in enumerate(self.labels_)}

        for t, lab in zip(texts, labels):
            i = lab2idx[lab]
            p = self.geo.find_province(t)
            d = self.geo.find_district(t)
            if p:
                provinces[i].add(p)
            if d:
                districts[i].add(d)

        self.label_provinces = provinces
        self.label_districts = districts

        p2ls = {}
        d2ls = {}
        for i in range(n_labels):
            for p in provinces[i]:
                p2ls.setdefault(p, set()).add(i)
            for d in districts[i]:
                d2ls.setdefault(d, set()).add(i)
        self.province2labels = p2ls
        self.district2labels = d2ls


    def _candidate_label_indices_from_text(self, text: str) -> Optional[List[int]]:
        """Return the sorted label indices allowed for ``text``, or None for "no restriction".

        None is returned when geo filtering is off or when no label is
        associated with the province/district found in ``text``.
        """
        if not self.use_geo_filter:
            return None
        p = self.geo.find_province(text)
        d = self.geo.find_district(text)
        cand = set()
        if p and d:
            l_p = self.province2labels.get(p, set())
            l_d = self.district2labels.get(d, set())
            inter = l_p.intersection(l_d)
            if self.geo_intersection_first and len(inter) > 0:
                cand = inter
            else:
                cand = l_p.union(l_d)
        elif p:
            cand = self.province2labels.get(p, set())
        elif d:
            cand = self.district2labels.get(d, set())

        if len(cand) == 0:
            return None
        return sorted(cand)


    @staticmethod
    def _top_indices_and_scores(sims: np.ndarray, topk: int) -> Tuple[np.ndarray, np.ndarray]:
        """Return the positions of the ``topk`` highest scores and a length-``topk`` score row.

        When ``sims`` has fewer than ``topk`` entries (small geo candidate
        set) the score row is padded with NaN, so the rows of different
        queries can always be stacked into one rectangular array.
        """
        order = np.argsort(-sims)[:topk]
        scores = np.full(topk, np.nan, dtype=np.float32)
        scores[:order.size] = sims[order]
        return order, scores


    # ---------- public API ----------
    def fit(self, train_df: pd.DataFrame):
        """Fit vectorizers, label centroids and (optionally) geo metadata.

        Args:
            train_df: frame with the columns ``address`` and ``label``.
                Non-string addresses (e.g. NaN) are treated as empty text,
                the same rule ``predict`` applies.

        Returns:
            self
        """
        assert {'address','label'}.issubset(train_df.columns), "train_df must have ['address','label']"
        texts_raw = self._as_text_list(train_df['address'].tolist())
        texts = [canon_address(t) for t in texts_raw]

        self.labels_ = list(pd.Index(train_df['label']).unique())
        self.label_to_idx = {lab: i for i, lab in enumerate(self.labels_)}
        self.idx_to_label = {i: lab for lab, i in self.label_to_idx.items()}

        y = train_df['label'].values

        X = self._fit_vectorizers(texts)
        X = normalize(X, norm='l2', axis=1, copy=False)
        self.C = self._label_centroids_sparse(X, y, self.labels_)

        if self.use_geo_filter:
            # Geo lookup runs on the raw (non-canonicalized) text.
            self._build_geo_metadata(texts_raw, train_df['label'].tolist())

        return self


    def predict(self, addresses: List[str], topk: int = 1, batch_size: int = 1024) -> Tuple[List, np.ndarray]:
        """Predict labels for ``addresses``.

        Args:
            addresses: iterable of address strings (non-strings count as "").
            topk: number of labels to return per address.
            batch_size: number of queries scored against the centroids at
                once; bounds the dense ``batch_size x n_labels`` score block.

        Returns:
            ``(preds, scores)``.
            ``topk == 1``: ``preds`` is a list of labels, ``scores`` a float32
            array of shape ``(n,)``.
            ``topk > 1``: ``preds`` is a list of label lists (best first, at
            most ``topk`` long), ``scores`` a float32 array of shape
            ``(n, topk)`` padded with NaN where fewer candidates existed.
        """
        assert self.C is not None, "Model fit edilmedi."

        texts_raw = self._as_text_list(addresses)
        n = len(texts_raw)
        if n == 0:
            empty_shape = (0,) if topk == 1 else (0, topk)
            return [], np.empty(empty_shape, dtype=np.float32)

        X = self._transform([canon_address(t) for t in texts_raw])

        # Transpose the centroid matrix once instead of once per query.
        centroids_t = self.C.T.tocsr()
        step = max(1, int(batch_size))

        preds = []
        all_scores = []
        for start in range(0, n, step):
            stop = min(start + step, n)
            sims_block = X[start:stop] @ centroids_t     # [batch, n_labels]
            sims_block = sims_block.toarray() if issparse(sims_block) else np.asarray(sims_block)

            for offset in range(stop - start):
                cand_idx = self._candidate_label_indices_from_text(texts_raw[start + offset])
                if cand_idx is None:
                    sims = sims_block[offset]
                else:
                    # Restrict the scores to the geo candidates.
                    sims = sims_block[offset, cand_idx]

                if topk == 1:
                    j = int(np.argmax(sims))
                    preds.append(self.labels_[j if cand_idx is None else cand_idx[j]])
                    all_scores.append(sims[j])
                else:
                    order, padded = self._top_indices_and_scores(sims, topk)
                    preds.append([
                        self.labels_[int(j) if cand_idx is None else cand_idx[int(j)]]
                        for j in order
                    ])
                    all_scores.append(padded)

        return preds, np.array(all_scores, dtype=np.float32)


    def evaluate(self, df: pd.DataFrame) -> float:
        """Return the top-1 accuracy of the model on ``df`` (columns ``address``, ``label``)."""
        assert {'address','label'}.issubset(df.columns), "df must have ['address','label']"
        y_true = df['label'].tolist()
        y_pred, _ = self.predict(df['address'].tolist(), topk=1)
        return accuracy_score(y_true, y_pred)


# -----------------------------
# Hizli kullanim ornegi
# -----------------------------
def _run_demo():
    """Fit the matcher on four toy addresses and print two predictions.

    The demo lives in a function because ``__name__ == "__main__"`` is also
    true inside a notebook cell: with the statements at top level, the toy
    frame would overwrite the notebook's global ``train_df`` (and ``model``).
    """
    demo_train_df = pd.DataFrame({
        'address': [
            "Istiklal Mah. Cumhuriyet Cad. No: 12 D:3 Gebze Kocaeli 41400",
            "Akarca Mah. Adnan Menderes Cad. 864. Sok. No:15 D:1 K:2 Fethiye Mugla 48300",
            "Zeytinburnu Bestelsiz Mah. 58. Bulvar No:10/3 Istanbul 34020",
            "Serdivan Bahcelievler Mah. 105. Sok. No:7 Adapazari Sakarya 54050"
        ],
        'label': [100, 200, 300, 400]
    })

    demo_model = AddressMatcherGeoCentroid(
        char_ngram=(3,8),
        char_max_features=None,
        use_word=True,
        word_ngram=(1,2),
        char_weight=1.0,
        word_weight=0.7,
        use_geo_filter=True
    )

    demo_model.fit(demo_train_df)

    demo_addresses = [
        "Cumhuriyet cd no:12 daire:3 Gebze/KOCAELI 41400",
        "Adnan Menderes cad. 864 sok no 15 d1 Fethiye Mugla",
    ]

    demo_preds, demo_scores = demo_model.predict(demo_addresses, topk=1)
    for a, p, s in zip(demo_addresses, demo_preds, demo_scores):
        print(f"[{a}] -> {p} (score={float(s):.4f})")


if __name__ == "__main__":
    _run_demo()


[Cumhuriyet cd no:12 daire:3 Gebze/KOCAELI 41400] -> 100 (score=0.7944)
[Adnan Menderes cad. 864 sok no 15 d1 Fethiye Mugla] -> 200 (score=0.7330)


In [None]:
# Required packages (Colab):
# pip install scikit-learn scipy pandas numpy

# NOTE(review): this imports the class from a file address_matcher_geo_centroid.py;
# nothing visible in this notebook writes that file, so it presumably has to
# exist next to the notebook (together with geo_database.py) - confirm.
from address_matcher_geo_centroid import AddressMatcherGeoCentroid
import pandas as pd

# train_df: ['address','label'] - the raw CSV; fit() applies its own
# canonicalization to the 'address' column.
train_df = pd.read_csv("train.csv")  # adjust to your own path

model = AddressMatcherGeoCentroid(
    char_ngram=(3,8),
    use_word=True,
    word_ngram=(1,2),
    char_weight=1.0,
    word_weight=0.7,
    use_geo_filter=True,            # geo candidate narrowing enabled
    geo_intersection_first=True     # if both province and district are found, try their intersection first
).fit(train_df)

# Test: predict one label per test address
test = pd.read_csv("test.csv")
preds, scores = model.predict(test["address"].tolist(), topk=1)

# Submission: one (id, label) row per test row
out = pd.DataFrame({"id": test["id"], "label": preds})
out.to_csv("submission.csv", index=False)
