In [1]:
!pip install numpy pandas scipy scikit-learn



In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [3]:
# P0: Fast Address Matching (Batch + SVD + NearestNeighbors)
# -----------------------------------------------------------
# Colab-friendly, scalable baseline without Python O(N*M) loops.
# - Char + Word TF-IDF (sparse)
# - hstack -> TruncatedSVD (e.g., 256-D)
# - Label mean embeddings
# - NearestNeighbors (cosine, brute) for fast batch prediction
# - Memory-friendly batching for test transform & search
# - Float32 everywhere possible
#
# Expected speedup: >10-100x over per-address/per-label loops.
# -----------------------------------------------------------

import os, sys, gc, math, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# -----------------------------
# 0) Utility: lightweight preprocessing
# -----------------------------

import re
import unicodedata

# Turkish-aware lowercasing table. Python's str.lower() maps 'I' -> 'i' and
# 'İ' -> 'i' + U+0307, which is wrong for Turkish, so the dotted/dotless pairs
# are translated explicitly before lower() is called.
TR_MAP = str.maketrans({
    'Ç':'ç','Ğ':'ğ','İ':'i','I':'ı','Ö':'ö','Ş':'ş','Ü':'ü',
    '\u00A0': ' '  # non-breaking space -> space
})

# Characters replaced by a space. '-' and '_' are not in this set, so they survive.
PUNCTS = "\t\n\r\f\v!\"#$%&'()*+,./:;<=>?@[\\]^`{|}~"  # Turkish letters kept

MULTISPACE_RE = re.compile(r"\s+")
PUNCT_RE = re.compile("[" + re.escape(PUNCTS) + "]+")


def preprocess_address(text: str) -> str:
    """Normalize one raw address string for TF-IDF featurization.

    Steps: NFC-compose, Turkish-aware lowercase, replace punctuation with
    spaces, collapse whitespace runs and strip the ends.

    Args:
        text: Raw address. Any non-str value (None, NaN, numbers) yields "".

    Returns:
        The cleaned, lowercased address; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Compose first so a decomposed "I + U+0307" becomes the single code point
    # 'İ' and is mapped to 'i' by TR_MAP instead of turning into 'ı' + dot.
    t = unicodedata.normalize("NFC", text)
    t = t.translate(TR_MAP).lower()
    # Text lowercased upstream with plain str.lower() carries 'i' + U+0307,
    # which NFC cannot compose; drop the stray combining dot.
    t = t.replace("i\u0307", "i")
    # normalize punctuation/whitespace but keep numbers and Turkish chars
    t = PUNCT_RE.sub(" ", t)
    return MULTISPACE_RE.sub(" ", t).strip()

# -----------------------------
# 1) Load data
# -----------------------------

# Expected columns:
#   train.csv: address, label
#   test.csv : id, address  (if no id, we auto-generate)

# Input/output locations; each one can be overridden with an environment variable.
TRAIN_PATH = os.getenv("TRAIN_PATH", "train.csv")
TEST_PATH  = os.getenv("TEST_PATH",  "test.csv")
SUBMIT_OUT = os.getenv("SUBMIT_OUT", "submission.csv")

print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)
print(train_df.head(2))
print(test_df.head(2))

assert "address" in train_df.columns and "label" in train_df.columns, "train.csv must have columns: address,label"
assert "address" in test_df.columns, "test.csv must have column: address"

# Without an id column, the row position is used as the submission id.
if "id" not in test_df.columns:
    test_df["id"] = np.arange(len(test_df), dtype=np.int64)

# -----------------------------
# 2) Preprocess
# -----------------------------

print("Preprocessing texts (train/test)...")
# fillna("") must come before astype(str): astype(str) alone turns a missing
# address into the literal text "nan", which then produces real n-gram
# features ("nan" also occurs inside ordinary words) instead of an empty string.
train_texts = train_df["address"].fillna("").astype(str).map(preprocess_address)
test_texts  = test_df["address"].fillna("").astype(str).map(preprocess_address)

# -----------------------------
# 3) TF‑IDF (char + word) and SVD dimensionality reduction
# -----------------------------

# NOTE: Keep sizes moderate to control RAM; tune as needed.
# Vocabulary caps and embedding width; each is read from the environment
# with the default shown.
CHAR_MAX_FEATS = int(os.getenv("CHAR_MAX_FEATS", 20000))
WORD_MAX_FEATS = int(os.getenv("WORD_MAX_FEATS", 20000))
SVD_DIM = int(os.getenv("SVD_DIM", 256))
RANDOM_STATE = 42

# Character n-grams (3 to 5, word-boundary aware), fitted on train only.
print(f"Fitting TF‑IDF (char_wb 3‑5, max_features={CHAR_MAX_FEATS})...")
char_vect = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    max_features=CHAR_MAX_FEATS,
)
X_char_train = char_vect.fit_transform(train_texts)

# Word unigrams and bigrams, fitted on train only.
print(f"Fitting TF‑IDF (word 1‑2, max_features={WORD_MAX_FEATS})...")
word_vect = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=WORD_MAX_FEATS,
)
X_word_train = word_vect.fit_transform(train_texts)

# Column order (char features, then word features) must match what the test
# batches use later.
print("Stacking sparse matrices (train)...")
X_train_sparse = hstack((X_char_train, X_word_train)).tocsr()

print(f"Fitting TruncatedSVD to {SVD_DIM} dims (LSA)...")
svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
X_train_red = svd.fit_transform(X_train_sparse).astype(np.float32)

# The sparse training matrices are not read again; rebind the names to None so
# the memory can be reclaimed.
X_char_train = None
X_word_train = None
X_train_sparse = None
gc.collect()

# -----------------------------
# 4) Label mean embeddings (float32)
# -----------------------------

print("Computing label mean embeddings...")
labels = train_df["label"].to_numpy()
# inv[i] is the position of labels[i] inside unique_labels.
unique_labels, inv = np.unique(labels, return_inverse=True)

# Per-label sums in one sparse product instead of a Python loop over every
# training row: membership[l, i] == 1 exactly when row i carries label l, so
# membership @ X_train_red adds up the rows of each label.
n_train_rows = X_train_red.shape[0]
label_count = np.bincount(inv, minlength=len(unique_labels)).astype(np.int32)
label_membership = csr_matrix(
    (np.ones(n_train_rows, dtype=np.float32), (inv, np.arange(n_train_rows))),
    shape=(len(unique_labels), n_train_rows),
)
# float32 indicator data keeps the product in float32.
label_sum = np.asarray(label_membership @ X_train_red, dtype=np.float32)
label_membership = None

# np.maximum(..., 1) guards the division; every entry of unique_labels occurs
# at least once, so it is only a safety net.
label_embeds = (label_sum / np.maximum(label_count[:, None], 1)).astype(np.float32)

# -----------------------------
# 5) Fit NearestNeighbors (cosine, brute)
# -----------------------------

print("Fitting NearestNeighbors (cosine, brute)...")
nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
nbrs.fit(label_embeds)

# -----------------------------
# 6) Transform TEST in batches and search top-1
# -----------------------------

# Both values are clamped to >= 1: a batch size of 0 (or less) would never
# advance through the test set, and kneighbors rejects n_neighbors < 1.
BATCH = max(1, int(os.getenv("TEST_BATCH", 50000)))  # tune to your RAM
K = max(1, int(os.getenv("TOPK", 1)))  # P0 = 1; can increase later

print(f"Transforming & searching test in batches of {BATCH}...")
ids = test_df["id"].to_numpy()
all_pred_idx = np.empty(len(test_df), dtype=np.int32)

n = len(test_df)
for start in range(0, n, BATCH):
    end = min(start + BATCH, n)
    chunk = test_texts.iloc[start:end]

    # Sparse transform with the train-fitted vectorizers, stacked in the same
    # column order (char, then word) that the SVD was fitted on.
    Xc = char_vect.transform(chunk)
    Xw = word_vect.transform(chunk)
    Xt = hstack([Xc, Xw]).tocsr()
    Xc = Xw = None; gc.collect()

    # SVD to dense float32
    Xt_red = svd.transform(Xt).astype(np.float32)
    Xt = None; gc.collect()

    # Only the neighbor indices are used, so the distance matrix is not requested.
    idx = nbrs.kneighbors(Xt_red, n_neighbors=K, return_distance=False)
    # Column 0 is the closest label centroid.
    all_pred_idx[start:end] = idx[:, 0].astype(np.int32)

    print(f"Processed {end}/{n} test rows...")

# Map centroid row indices back to the original label values.
predicted_labels = unique_labels[all_pred_idx]

# -----------------------------
# 7) Save submission
# -----------------------------

sub = pd.DataFrame({"id": ids, "label": predicted_labels})
sub.to_csv(SUBMIT_OUT, index=False)
print(f"Saved: {SUBMIT_OUT}")

# -----------------------------
# 8) Notes / Tuning
# -----------------------------
# - Increase SVD_DIM (e.g., 384) if RAM allows; improves recall a bit.
# - If train is very large, you can SVD-fit on a stratified sample and then transform all.
# - For even faster search, swap NearestNeighbors with FAISS (IndexFlatIP on normalized vectors)
#   after L2-normalizing embeddings; cosine ~ dot on normalized vectors.
# - For better accuracy later (P1), do city/district blocking before kneighbors and rerank top‑K.


Loading data...
                                             address  label
0  Akarca Mah. Adnan Menderes Cad. 864.Sok. No:15...   8831
1  Cumhuriye Mah. Hükümet Cad. Sivriler İşhanı No...   8810
   id                                            address
0   0    Menderes mahallesi 1013 sok No.40 Daire.2 Kat.2
1   1  250. Sk. No:14 B Blok Kat:5 Daire:14\n3. Halil...
Preprocessing texts (train/test)...
Fitting TF‑IDF (char_wb 3‑5, max_features=20000)...
Fitting TF‑IDF (word 1‑2, max_features=20000)...
Stacking sparse matrices (train)...
Fitting TruncatedSVD to 256 dims (LSA)...
Computing label mean embeddings...
Fitting NearestNeighbors (cosine, brute)...
Transforming & searching test in batches of 50000...
Processed 50000/217241 test rows...
Processed 100000/217241 test rows...
Processed 150000/217241 test rows...
Processed 200000/217241 test rows...
Processed 217241/217241 test rows...
Saved: submission.csv


In [4]:
# P0‑B: Instance‑level kNN (Char‑only option, SVD, L2‑normalize)
# -----------------------------------------------------------------
# Colab‑ready fast baseline that replaces label‑mean with instance kNN.
# Key ideas:
#  - TF‑IDF (char n‑grams; optional word n‑grams)
#  - hstack -> TruncatedSVD (e.g., 256–384D)
#  - L2 normalize embeddings (cosine stability)
#  - kNN over TRAIN INSTANCES (labels taken from the nearest train rows)
#  - Optional: exact‑match shortcut; optional: small K majority vote
#  - Streaming batches for TEST transform
#
# Env toggles (override via os.environ or Colab UI):
#   USE_WORD_NGRAMS=0/1 (default 0)
#   CHAR_MAX_FEATS=40000  WORD_MAX_FEATS=20000
#   CHAR_NGRAM_MIN=3  CHAR_NGRAM_MAX=6
#   SVD_DIM=384  TEST_BATCH=50000
#   K_INST=3      # k neighbors for instance search (majority vote)
#   USE_LABEL_MEAN=0/1 (default 0)  # optionally also build label‑mean index
#   K_LABEL=25    # top‑K for label‑mean (if used)
#   TOPK_RERANK=0 # >1 enables tiny reranker for label‑mean mode
#
# Expected input files:
#   train.csv => columns: address, label
#   test.csv  => columns: id(optional), address
# Output:
#   submission.csv => columns: id, label
# -----------------------------------------------------------------

import os, gc, re, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

# -----------------------------
# 0) Lightweight preprocessing
# -----------------------------
import unicodedata

# Turkish-aware lowercasing table. Python's str.lower() maps 'I' -> 'i' and
# 'İ' -> 'i' + U+0307, which is wrong for Turkish, so the dotted/dotless pairs
# are translated explicitly before lower() is called.
TR_MAP = str.maketrans({
    'Ç':'ç','Ğ':'ğ','İ':'i','I':'ı','Ö':'ö','Ş':'ş','Ü':'ü',
    '\u00A0': ' '
})
# Characters replaced by a space. '-' and '_' are not in this set, so they survive.
PUNCTS = "\t\n\r\f\v!\"#$%&'()*+,./:;<=>?@[\\]^`{|}~"
MULTISPACE_RE = re.compile(r"\s+")
PUNCT_RE = re.compile("[" + re.escape(PUNCTS) + "]+")

def preprocess_address(text: str) -> str:
    """Normalize one raw address string for TF-IDF featurization.

    Steps: NFC-compose, Turkish-aware lowercase, replace punctuation with
    spaces, collapse whitespace runs and strip the ends.

    Args:
        text: Raw address. Any non-str value (None, NaN, numbers) yields "".

    Returns:
        The cleaned, lowercased address; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Compose first so a decomposed "I + U+0307" becomes the single code point
    # 'İ' and is mapped to 'i' by TR_MAP instead of turning into 'ı' + dot.
    t = unicodedata.normalize("NFC", text)
    t = t.translate(TR_MAP).lower()
    # Text lowercased upstream with plain str.lower() carries 'i' + U+0307,
    # which NFC cannot compose; drop the stray combining dot.
    t = t.replace("i\u0307", "i")
    t = PUNCT_RE.sub(" ", t)
    return MULTISPACE_RE.sub(" ", t).strip()

# -----------------------------
# 1) IO
# -----------------------------
# Input/output locations; each one can be overridden with an environment variable.
TRAIN_PATH = os.getenv("TRAIN_PATH", "train.csv")
TEST_PATH  = os.getenv("TEST_PATH",  "test.csv")
SUBMIT_OUT = os.getenv("SUBMIT_OUT", "submission.csv")

print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)
assert {"address","label"}.issubset(train_df.columns)
assert "address" in test_df.columns
# Without an id column, the row position is used as the submission id.
if "id" not in test_df.columns:
    test_df["id"] = np.arange(len(test_df), dtype=np.int64)
print(train_df.head(2))
print(test_df.head(2))

# -----------------------------
# 2) Preprocess
# -----------------------------
print("Preprocessing texts...")
# fillna("") must come before astype(str): astype(str) alone turns a missing
# address into the literal text "nan", which then produces real n-gram
# features ("nan" also occurs inside ordinary words) instead of an empty string.
train_texts = train_df["address"].fillna("").astype(str).map(preprocess_address)
test_texts  = test_df["address"].fillna("").astype(str).map(preprocess_address)
labels = train_df["label"].to_numpy()

# exact-match dict (cheap quick win): cleaned train address -> its most
# frequent label. Test rows whose cleaned text is a key skip the kNN search.
from collections import defaultdict, Counter
print("Building exact‑match dictionary from train...")
_addr_to_labels = defaultdict(Counter)
for a, y in zip(train_texts, labels):
    _addr_to_labels[a][y] += 1
# most_common(1) keeps the first-seen label when several share the top count.
EXACT_MAP = {a: cnt.most_common(1)[0][0] for a, cnt in _addr_to_labels.items()}
print(f"Exact dictionary size: {len(EXACT_MAP):,}")

# -----------------------------
# 3) Vectorizers & SVD
# -----------------------------
# Feature/embedding settings; each is read from the environment with the
# default shown.
USE_WORD_NGRAMS = int(os.getenv("USE_WORD_NGRAMS", 0))
CHAR_MAX_FEATS = int(os.getenv("CHAR_MAX_FEATS", 40000))
WORD_MAX_FEATS = int(os.getenv("WORD_MAX_FEATS", 20000))
C_MIN = int(os.getenv("CHAR_NGRAM_MIN", 3))
C_MAX = int(os.getenv("CHAR_NGRAM_MAX", 6))
SVD_DIM = int(os.getenv("SVD_DIM", 384))
RANDOM_STATE = 42

print(f"Fitting TF‑IDF char_wb({C_MIN},{C_MAX}), max_features={CHAR_MAX_FEATS}...")
char_vect = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(C_MIN, C_MAX),
    max_features=CHAR_MAX_FEATS,
)
Xc_train = char_vect.fit_transform(train_texts)

if not USE_WORD_NGRAMS:
    # Char-only mode: the char matrix is the full feature matrix.
    X_train_sparse = Xc_train.tocsr()
else:
    # Optional word 1-2 grams are appended to the right of the char features.
    print(f"Fitting TF‑IDF word(1,2), max_features={WORD_MAX_FEATS}...")
    word_vect = TfidfVectorizer(
        analyzer="word",
        ngram_range=(1, 2),
        max_features=WORD_MAX_FEATS,
    )
    Xw_train = word_vect.fit_transform(train_texts)
    X_train_sparse = hstack((Xc_train, Xw_train)).tocsr()
    del Xw_train

del Xc_train
gc.collect()

print(f"Fitting TruncatedSVD to {SVD_DIM} dims...")
svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
X_train_red = svd.fit_transform(X_train_sparse).astype(np.float32)
X_train_sparse = None
gc.collect()

# L2 normalize for cosine stability. The input is already float32 here;
# astype(copy=False) only guarantees the dtype and does not copy when it
# already matches.
X_train_norm = normalize(X_train_red).astype(np.float32, copy=False)

# -----------------------------
# 4) Build searchers: instance‑kNN (default) and optional label‑mean
# -----------------------------
USE_INSTANCE_SEARCH = True
inst_k = int(os.getenv("K_INST", 3))  # majority over K neighbors

# Index over every (L2-normalized) training row; a hit's label is looked up
# through TRAIN_LABELS by row position.
print("Fitting instance‑level NearestNeighbors (cosine, brute)...")
inst_nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
inst_nbrs.fit(X_train_norm)
TRAIN_LABELS = labels

# Optional: also build label-mean index (fallback or comparison)
USE_LABEL_MEAN = int(os.getenv("USE_LABEL_MEAN", 0))
if USE_LABEL_MEAN:
    from scipy.sparse import csr_matrix

    print("Computing label‑mean embeddings (optional)...")
    # inv[i] is the position of labels[i] inside unique_labels.
    unique_labels, inv = np.unique(labels, return_inverse=True)
    counts = np.bincount(inv, minlength=len(unique_labels)).astype(np.int32)
    # Per-label sums in one sparse product instead of a Python loop over every
    # training row: membership[l, i] == 1 exactly when row i carries label l.
    n_train_rows = X_train_red.shape[0]
    label_membership = csr_matrix(
        (np.ones(n_train_rows, dtype=np.float32), (inv, np.arange(n_train_rows))),
        shape=(len(unique_labels), n_train_rows),
    )
    sums = np.asarray(label_membership @ X_train_red, dtype=np.float32)
    label_membership = None
    label_embeds = sums / np.maximum(counts[:,None], 1)
    label_embeds = normalize(label_embeds).astype(np.float32, copy=False)
    print("Fitting label‑mean NearestNeighbors (cosine, brute)...")
    label_nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
    label_nbrs.fit(label_embeds)
    K_LABEL = int(os.getenv("K_LABEL", 25))

    # light reranker helpers
    TOKEN_RE = re.compile(r"[a-z0-9çğıöşü]+")
    DIGIT_RE = re.compile(r"\b\d+[a-z]?\b")

    def tokenset(s: str):
        """Return the set of alphanumeric (incl. Turkish lowercase) tokens in s."""
        return set(TOKEN_RE.findall(s))

    def numset(s: str):
        """Return the set of number tokens in s (digits plus an optional letter)."""
        return set(DIGIT_RE.findall(s))

    print("Preparing representative text per label for rerank...")
    # The first training address seen for each label stands in for that label.
    label_first_text = {}
    for a, yi in zip(train_texts, inv):
        if yi not in label_first_text:
            label_first_text[yi] = a
    label_tokens = [tokenset(label_first_text.get(i, "")) for i in range(len(unique_labels))]
    label_nums   = [numset(label_first_text.get(i, ""))   for i in range(len(unique_labels))]

    def rerank_candidates(test_s, cand_idx, cand_dist):
        """Pick the best label index among the kNN candidates of one test row.

        Score = cosine similarity (1 - distance)
                + 0.03 * token Jaccard overlap with the label's representative text
                + 0.05 if the two texts share at least one number token.

        Args:
            test_s: Cleaned test address.
            cand_idx: Candidate label indices (rows of label_embeds).
            cand_dist: Cosine distances aligned with cand_idx.

        Returns:
            The entry of cand_idx with the highest score (first one on ties).
        """
        base = 1.0 - cand_dist
        tt, tn = tokenset(test_s), numset(test_s)
        best, best_score = 0, -1e9
        for j, yi in enumerate(cand_idx):
            tok = len(tt & label_tokens[yi]) / max(len(tt | label_tokens[yi]), 1)
            num = 1.0 if tn and (tn & label_nums[yi]) else 0.0
            score = base[j] + 0.03*tok + 0.05*num
            if score > best_score:
                best, best_score = j, score
        return cand_idx[best]

    # Reranking is only used when this is > 1 (and K_LABEL > 1).
    TOPK_RERANK = int(os.getenv("TOPK_RERANK", 0))

# -----------------------------
# 5) Predict in batches
# -----------------------------
# Clamped to >= 1: a batch size of 0 (or less) would never advance through
# the test set.
BATCH = max(1, int(os.getenv("TEST_BATCH", 50000)))
ids = test_df["id"].to_numpy()
predicted_labels = np.empty(len(test_df), dtype=labels.dtype)


def _majority_label(cand):
    """Return the most frequent label in cand, which is ordered nearest-first.

    Ties are broken in favour of the tied label whose first occurrence is
    closest to the query. Falling back to cand[0] is not enough: for
    [A, B, B, C, C] the nearest label A is not one of the tied winners.
    """
    vals, first_pos, counts = np.unique(cand, return_index=True, return_counts=True)
    tied = counts == counts.max()
    return vals[tied][np.argmin(first_pos[tied])]


print(f"Predicting test in batches of {BATCH} (instance‑kNN, k={inst_k})...")
n = len(test_df)
for start in range(0, n, BATCH):
    end = min(start + BATCH, n)
    chunk_ser = test_texts.iloc[start:end]
    chunk = chunk_ser.to_numpy()

    # A) exact matches (direct label); None marks rows that still need a search
    pred_batch = np.array([EXACT_MAP.get(s, None) for s in chunk], dtype=object)

    need_mask = np.fromiter((p is None for p in pred_batch), count=len(pred_batch), dtype=bool)
    if need_mask.any():
        need_idx = np.where(need_mask)[0]
        need_texts = chunk_ser.iloc[need_idx]

        Xc = char_vect.transform(need_texts)
        if USE_WORD_NGRAMS:
            Xw = word_vect.transform(need_texts)
            Xt = hstack([Xc, Xw]).tocsr(); del Xw
        else:
            Xt = Xc.tocsr()
        del Xc; gc.collect()

        Xt_red = svd.transform(Xt).astype(np.float32); Xt = None; gc.collect()
        Xt_norm = normalize(Xt_red).astype(np.float32, copy=False)

        # Instance search (default). Only indices are needed for voting, so
        # the distance matrix is not requested.
        idx = inst_nbrs.kneighbors(Xt_norm, n_neighbors=max(1, inst_k), return_distance=False)
        if inst_k == 1:
            pred_labels_need = TRAIN_LABELS[idx[:,0]]
        else:
            # Majority vote across the K neighbors of each row.
            pred_labels_need = np.empty(len(need_idx), dtype=TRAIN_LABELS.dtype)
            for r in range(len(need_idx)):
                pred_labels_need[r] = _majority_label(TRAIN_LABELS[idx[r]])
        pred_batch[need_idx] = pred_labels_need

    # Optional label-mean fallback (only if some are still None).
    # NOTE(review): every None entry is filled by the instance search above, so
    # this branch does not appear to be reachable; confirm whether label-mean
    # was meant to replace or be compared with the instance search.
    if USE_LABEL_MEAN and (pred_batch == None).any():
        need_idx = np.where(pred_batch == None)[0]
        if len(need_idx) > 0:
            need_texts = chunk_ser.iloc[need_idx]
            Xc = char_vect.transform(need_texts)
            if USE_WORD_NGRAMS:
                Xw = word_vect.transform(need_texts)
                Xt = hstack([Xc, Xw]).tocsr(); del Xw
            else:
                Xt = Xc.tocsr()
            del Xc; gc.collect()
            Xt_red = svd.transform(Xt).astype(np.float32); Xt = None; gc.collect()
            Xt_norm = normalize(Xt_red).astype(np.float32, copy=False)
            dist, idx = label_nbrs.kneighbors(Xt_norm, n_neighbors=max(1, K_LABEL), return_distance=True)
            if K_LABEL == 1 or TOPK_RERANK <= 1:
                pred_labels_need = unique_labels[idx[:,0]]
            else:
                pred_labels_need = np.empty(len(need_idx), dtype=unique_labels.dtype)
                for j in range(len(need_idx)):
                    yi = rerank_candidates(chunk[need_idx[j]], idx[j], dist[j])
                    pred_labels_need[j] = unique_labels[yi]
            pred_batch[need_idx] = pred_labels_need

    # The object array is cast to the label dtype on assignment.
    predicted_labels[start:end] = pred_batch
    print(f"Processed {end}/{n} rows...")

# -----------------------------
# 6) Save submission
# -----------------------------
sub = pd.DataFrame({"id": ids, "label": predicted_labels})
sub.to_csv(SUBMIT_OUT, index=False)
print(f"Saved: {SUBMIT_OUT}")

# -----------------------------
# Notes
# -----------------------------
# * If RAM is tight, reduce CHAR_MAX_FEATS and/or SVD_DIM.
# * If speed is tight, disable USE_WORD_NGRAMS (default 0) and keep char 3‑6.
# * Usually (char‑only + SVD 384 + instance k=3 + L2 norm) >> label‑mean.
# * For even faster exact search, replace inst_nbrs with faiss IndexFlatIP on L2‑normalized vectors (IndexFlatIP is brute‑force, not approximate).


Loading data...
                                             address  label
0  Akarca Mah. Adnan Menderes Cad. 864.Sok. No:15...   8831
1  Cumhuriye Mah. Hükümet Cad. Sivriler İşhanı No...   8810
   id                                            address
0   0    Menderes mahallesi 1013 sok No.40 Daire.2 Kat.2
1   1  250. Sk. No:14 B Blok Kat:5 Daire:14\n3. Halil...
Preprocessing texts...
Building exact‑match dictionary from train...
Exact dictionary size: 835,663
Fitting TF‑IDF char_wb(3,6), max_features=40000...
Fitting TruncatedSVD to 384 dims...
Fitting instance‑level NearestNeighbors (cosine, brute)...
Predicting test in batches of 50000 (instance‑kNN, k=3)...
Processed 50000/217241 rows...
Processed 100000/217241 rows...
Processed 150000/217241 rows...
Processed 200000/217241 rows...
Processed 217241/217241 rows...
Saved: submission.csv


In [1]:
# P0: Fast Address Matching (Batch + SVD + NearestNeighbors)
# -----------------------------------------------------------
# Colab-friendly, scalable baseline without Python O(N*M) loops.
# - Char + Word TF-IDF (sparse)
# - hstack -> TruncatedSVD (e.g., 256-D)
# - Label mean embeddings
# - NearestNeighbors (cosine, brute) for fast batch prediction
# - Memory-friendly batching for test transform & search
# - Float32 everywhere possible
#
# Expected speedup: >10-100x over per-address/per-label loops.
# -----------------------------------------------------------

import os, sys, gc, math, time, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

# -----------------------------
# 0) Utility: lightweight preprocessing
# -----------------------------

import re
import unicodedata

# Turkish-aware lowercasing table. Python's str.lower() maps 'I' -> 'i' and
# 'İ' -> 'i' + U+0307, which is wrong for Turkish, so the dotted/dotless pairs
# are translated explicitly before lower() is called.
TR_MAP = str.maketrans({
    'Ç':'ç','Ğ':'ğ','İ':'i','I':'ı','Ö':'ö','Ş':'ş','Ü':'ü',
    '\u00A0': ' '  # non-breaking space -> space
})

# Characters replaced by a space. '-' and '_' are not in this set, so they survive.
PUNCTS = "\t\n\r\f\v!\"#$%&'()*+,./:;<=>?@[\\]^`{|}~"  # Turkish letters kept

MULTISPACE_RE = re.compile(r"\s+")
PUNCT_RE = re.compile("[" + re.escape(PUNCTS) + "]+")


def preprocess_address(text: str) -> str:
    """Normalize one raw address string for TF-IDF featurization.

    Steps: NFC-compose, Turkish-aware lowercase, replace punctuation with
    spaces, collapse whitespace runs and strip the ends.

    Args:
        text: Raw address. Any non-str value (None, NaN, numbers) yields "".

    Returns:
        The cleaned, lowercased address; "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Compose first so a decomposed "I + U+0307" becomes the single code point
    # 'İ' and is mapped to 'i' by TR_MAP instead of turning into 'ı' + dot.
    t = unicodedata.normalize("NFC", text)
    t = t.translate(TR_MAP).lower()
    # Text lowercased upstream with plain str.lower() carries 'i' + U+0307,
    # which NFC cannot compose; drop the stray combining dot.
    t = t.replace("i\u0307", "i")
    # normalize punctuation/whitespace but keep numbers and Turkish chars
    t = PUNCT_RE.sub(" ", t)
    return MULTISPACE_RE.sub(" ", t).strip()

# -----------------------------
# 1) Load data
# -----------------------------

# Expected columns:
#   train.csv: address, label
#   test.csv : id, address  (if no id, we auto-generate)

# Input/output locations; each one can be overridden with an environment variable.
TRAIN_PATH = os.getenv("TRAIN_PATH", "train.csv")
TEST_PATH  = os.getenv("TEST_PATH",  "test.csv")
SUBMIT_OUT = os.getenv("SUBMIT_OUT", "submission.csv")

print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)
print(train_df.head(2))
print(test_df.head(2))

assert "address" in train_df.columns and "label" in train_df.columns, "train.csv must have columns: address,label"
assert "address" in test_df.columns, "test.csv must have column: address"

# Without an id column, the row position is used as the submission id.
if "id" not in test_df.columns:
    test_df["id"] = np.arange(len(test_df), dtype=np.int64)

# -----------------------------
# 2) Preprocess
# -----------------------------

print("Preprocessing texts (train/test)...")
# fillna("") must come before astype(str): astype(str) alone turns a missing
# address into the literal text "nan", which then produces real n-gram
# features ("nan" also occurs inside ordinary words) instead of an empty string.
train_texts = train_df["address"].fillna("").astype(str).map(preprocess_address)
test_texts  = test_df["address"].fillna("").astype(str).map(preprocess_address)

# -----------------------------
# 3) TF‑IDF (char + word) and SVD dimensionality reduction
# -----------------------------

# NOTE: Keep sizes moderate to control RAM; tune as needed.
# Vocabulary caps and embedding width; each is read from the environment
# with the default shown.
CHAR_MAX_FEATS = int(os.getenv("CHAR_MAX_FEATS", 20000))
WORD_MAX_FEATS = int(os.getenv("WORD_MAX_FEATS", 20000))
SVD_DIM = int(os.getenv("SVD_DIM", 256))
RANDOM_STATE = 42

# Character n-grams (3 to 5, word-boundary aware), fitted on train only.
print(f"Fitting TF‑IDF (char_wb 3‑5, max_features={CHAR_MAX_FEATS})...")
char_vect = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    max_features=CHAR_MAX_FEATS,
)
X_char_train = char_vect.fit_transform(train_texts)

# Word unigrams and bigrams, fitted on train only.
print(f"Fitting TF‑IDF (word 1‑2, max_features={WORD_MAX_FEATS})...")
word_vect = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    max_features=WORD_MAX_FEATS,
)
X_word_train = word_vect.fit_transform(train_texts)

# Column order (char features, then word features) must match what the test
# batches use later.
print("Stacking sparse matrices (train)...")
X_train_sparse = hstack((X_char_train, X_word_train)).tocsr()

print(f"Fitting TruncatedSVD to {SVD_DIM} dims (LSA)...")
svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
X_train_red = svd.fit_transform(X_train_sparse).astype(np.float32)

# The sparse training matrices are not read again; rebind the names to None so
# the memory can be reclaimed.
X_char_train = None
X_word_train = None
X_train_sparse = None
gc.collect()

# -----------------------------
# 4) Label mean embeddings (float32)
# -----------------------------

print("Computing label mean embeddings...")
labels = train_df["label"].to_numpy()
# inv[i] is the position of labels[i] inside unique_labels.
unique_labels, inv = np.unique(labels, return_inverse=True)

# Per-label sums in one sparse product instead of a Python loop over every
# training row: membership[l, i] == 1 exactly when row i carries label l, so
# membership @ X_train_red adds up the rows of each label.
n_train_rows = X_train_red.shape[0]
label_count = np.bincount(inv, minlength=len(unique_labels)).astype(np.int32)
label_membership = csr_matrix(
    (np.ones(n_train_rows, dtype=np.float32), (inv, np.arange(n_train_rows))),
    shape=(len(unique_labels), n_train_rows),
)
# float32 indicator data keeps the product in float32.
label_sum = np.asarray(label_membership @ X_train_red, dtype=np.float32)
label_membership = None

# np.maximum(..., 1) guards the division; every entry of unique_labels occurs
# at least once, so it is only a safety net.
label_embeds = (label_sum / np.maximum(label_count[:, None], 1)).astype(np.float32)

# -----------------------------
# 5) Fit NearestNeighbors (cosine, brute)
# -----------------------------

print("Fitting NearestNeighbors (cosine, brute)...")
nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
nbrs.fit(label_embeds)

# -----------------------------
# 6) Transform TEST in batches and search top-1
# -----------------------------

# Both values are clamped to >= 1: a batch size of 0 (or less) would never
# advance through the test set, and kneighbors rejects n_neighbors < 1.
BATCH = max(1, int(os.getenv("TEST_BATCH", 50000)))  # tune to your RAM
K = max(1, int(os.getenv("TOPK", 1)))  # P0 = 1; can increase later

print(f"Transforming & searching test in batches of {BATCH}...")
ids = test_df["id"].to_numpy()
all_pred_idx = np.empty(len(test_df), dtype=np.int32)

n = len(test_df)
for start in range(0, n, BATCH):
    end = min(start + BATCH, n)
    chunk = test_texts.iloc[start:end]

    # Sparse transform with the train-fitted vectorizers, stacked in the same
    # column order (char, then word) that the SVD was fitted on.
    Xc = char_vect.transform(chunk)
    Xw = word_vect.transform(chunk)
    Xt = hstack([Xc, Xw]).tocsr()
    Xc = Xw = None; gc.collect()

    # SVD to dense float32
    Xt_red = svd.transform(Xt).astype(np.float32)
    Xt = None; gc.collect()

    # Only the neighbor indices are used, so the distance matrix is not requested.
    idx = nbrs.kneighbors(Xt_red, n_neighbors=K, return_distance=False)
    # Column 0 is the closest label centroid.
    all_pred_idx[start:end] = idx[:, 0].astype(np.int32)

    print(f"Processed {end}/{n} test rows...")

# Map centroid row indices back to the original label values.
predicted_labels = unique_labels[all_pred_idx]

# -----------------------------
# 7) Save submission
# -----------------------------

sub = pd.DataFrame({"id": ids, "label": predicted_labels})
sub.to_csv(SUBMIT_OUT, index=False)
print(f"Saved: {SUBMIT_OUT}")

# -----------------------------
# 8) Notes / Tuning
# -----------------------------
# - Increase SVD_DIM (e.g., 384) if RAM allows; improves recall a bit.
# - If train is very large, you can SVD-fit on a stratified sample and then transform all.
# - For even faster search, swap NearestNeighbors with FAISS (IndexFlatIP on normalized vectors)
#   after L2-normalizing embeddings; cosine ~ dot on normalized vectors.
# - For better accuracy later (P1), do city/district blocking before kneighbors and rerank top‑K.


# =============================
# 9) P2a – Label Prototypes (multi‑centroid per label, drop‑in)
# =============================
# Goal: a single label mean assumes each label is unimodal, so accuracy drops
# on multi-modal labels. Picking 1-3 (medoid-like) prototypes per label and
# searching over those prototypes usually lifts the score above the 0.30-0.40
# band. The code below is a drop-in for P0: it L2-normalizes the SVD output,
# selects a few representatives from each label, and fits NearestNeighbors on
# the prototypes.

import numpy as np
from sklearn.preprocessing import normalize

# --- 9.1) L2-normalize the train embeddings (required for cosine) ---
X_train_norm = normalize(X_train_red).astype(np.float32, copy=False)

# --- 9.2) Select label prototypes with Farthest-Point Sampling ---
# Fast and stable: instead of KMeans, pick the 1-3 most representative points
# from within each label.

def build_label_prototypes_fps(X_norm: np.ndarray, labels_np: np.ndarray,
                               unique_labels_np: np.ndarray, inv_idx: np.ndarray,
                               max_k: int = 3) -> tuple[np.ndarray, np.ndarray]:
    """Select 1-3 prototype rows per label via farthest-point sampling.

    For each label the first prototype is the member most similar to the
    (re-normalized) label mean. Further prototypes are added greedily: the
    member whose highest similarity to the already chosen prototypes is lowest.

    Args:
        X_norm: (N, D) L2-normalized train embeddings.
        labels_np: (N,) train labels. Not used by the computation; kept so
            existing calls keep working.
        unique_labels_np: Array of distinct labels.
        inv_idx: (N,) index into unique_labels_np for every train row.
        max_k: Upper bound on prototypes per label. A label gets 1 prototype
            below 20 members, 2 below 80 and 3 otherwise, capped by max_k and
            by the member count. At least one prototype is always emitted for
            a non-empty label.

    Returns:
        (proto_mat, proto_labs): a (P, D) float32 matrix of re-normalized
        prototype rows and the (P,) label of each row. With no rows to pick
        from, a (0, D) matrix and an empty label array are returned.
    """
    X_norm = np.asarray(X_norm)
    unique_labels_np = np.asarray(unique_labels_np)
    inv_idx = np.asarray(inv_idx).ravel()
    n_labels = len(unique_labels_np)

    # Group rows by label with one stable sort instead of scanning all N rows
    # once per label. Stability keeps row indices ascending inside each group,
    # i.e. the same order np.where(inv_idx == li) would give.
    order = np.argsort(inv_idx, kind="stable")
    bounds = np.searchsorted(inv_idx[order], np.arange(n_labels + 1), side="left")

    protos = []
    proto_label_pos = []

    for li in range(n_labels):
        idxs = order[bounds[li]:bounds[li + 1]]
        cnt = len(idxs)
        if cnt == 0:
            continue
        # One prototype is enough for small classes; larger ones get 2-3.
        if cnt < 20:
            k = 1
        elif cnt < 80:
            k = 2
        else:
            k = 3
        k = min(k, max_k, cnt)

        Xi = X_norm[idxs]  # (cnt, D)
        # 1) First prototype: the member most similar to the label mean.
        mu = Xi.mean(axis=0, dtype=np.float32)
        mu /= (np.linalg.norm(mu) + 1e-9)
        sims = Xi @ mu
        chosen_local = [int(np.argmax(sims))]

        # 2..k) Farthest point: the member least similar to its closest
        # already-chosen prototype.
        while len(chosen_local) < k:
            C = Xi[chosen_local]                    # (t, D)
            sims_to_chosen = Xi @ C.T               # (cnt, t)
            nearest_sim = np.max(sims_to_chosen, axis=1)  # (cnt,)
            pick = int(np.argmin(nearest_sim))
            if pick in chosen_local:
                # Every remaining member coincides with a chosen prototype.
                break
            chosen_local.append(pick)

        for loc in chosen_local:
            protos.append(Xi[loc])
            proto_label_pos.append(li)

    if not protos:
        # np.vstack([]) raises; return well-shaped empty outputs instead.
        return (np.zeros((0, X_norm.shape[1]), dtype=np.float32),
                unique_labels_np[:0])

    proto_mat = np.vstack(protos).astype(np.float32)
    # Re-normalize as a safety measure.
    proto_mat = proto_mat / (np.linalg.norm(proto_mat, axis=1, keepdims=True) + 1e-9)
    return proto_mat, unique_labels_np[np.asarray(proto_label_pos, dtype=np.intp)]

print("Building label prototypes (FPS)...")
proto_matrix, proto_labels = build_label_prototypes_fps(
    X_train_norm, labels, unique_labels, inv, max_k=int(os.getenv("MAX_PROTO_PER_LABEL", 3))
)
print(f"Prototypes total: {len(proto_labels):,}")

# --- 9.3) Fit NearestNeighbors on the prototypes ---
print("Fitting NearestNeighbors on prototypes (cosine, brute)...")
proto_nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
proto_nbrs.fit(proto_matrix)

# --- 9.4) Run the test flow against the prototypes ---
# This replaces section 6) and writes to the same SUBMIT_OUT file, overwriting
# the label-mean submission. BATCH and K are reused from section 6); both are
# clamped to >= 1 here so a zero or negative value cannot stall the loop or
# make kneighbors fail.
proto_batch = max(1, BATCH)
proto_k = max(1, K)

print(f"Transforming & searching test against prototypes in batches of {proto_batch}...")
ids = test_df["id"].to_numpy()
all_pred = np.empty(len(test_df), dtype=proto_labels.dtype)

n = len(test_df)
for start in range(0, n, proto_batch):
    end = min(start + proto_batch, n)
    chunk = test_texts.iloc[start:end]

    # Same feature pipeline as training: char + word TF-IDF, SVD, L2-normalize.
    Xc = char_vect.transform(chunk)
    Xw = word_vect.transform(chunk)
    Xt = hstack([Xc, Xw]).tocsr(); Xc = Xw = None; gc.collect()

    Xt_red = svd.transform(Xt).astype(np.float32); Xt = None; gc.collect()
    Xt_norm = normalize(Xt_red).astype(np.float32, copy=False)

    # Only indices are needed; the distance matrix is not requested.
    idx = proto_nbrs.kneighbors(Xt_norm, n_neighbors=proto_k, return_distance=False)
    # Nearest prototype -> that prototype's label.
    all_pred[start:end] = proto_labels[idx[:, 0]]
    print(f"Processed {end}/{n} test rows (prototypes)...")

sub = pd.DataFrame({"id": ids, "label": all_pred})
sub.to_csv(SUBMIT_OUT, index=False)
print(f"Saved (prototypes): {SUBMIT_OUT}")


Loading data...
                                             address  label
0  Akarca Mah. Adnan Menderes Cad. 864.Sok. No:15...   8831
1  Cumhuriye Mah. Hükümet Cad. Sivriler İşhanı No...   8810
   id                                            address
0   0    Menderes mahallesi 1013 sok No.40 Daire.2 Kat.2
1   1  250. Sk. No:14 B Blok Kat:5 Daire:14\n3. Halil...
Preprocessing texts (train/test)...
Fitting TF‑IDF (char_wb 3‑5, max_features=20000)...
Fitting TF‑IDF (word 1‑2, max_features=20000)...
Stacking sparse matrices (train)...
Fitting TruncatedSVD to 256 dims (LSA)...
Computing label mean embeddings...
Fitting NearestNeighbors (cosine, brute)...
Transforming & searching test in batches of 50000...
Processed 50000/217241 test rows...
Processed 100000/217241 test rows...
Processed 150000/217241 test rows...
Processed 200000/217241 test rows...
Processed 217241/217241 test rows...
Saved: submission.csv
Building label prototypes (FPS)...
Prototypes total: 25,427
Fitting NearestNeigh