In [1]:
# -*- coding: utf-8 -*-
"""
Teknofest Address Resolver — Colab Pipeline (Final with Geo Features)

Tüm geliştirmeler tek dosyada:
- Güçlü normalizer (kısaltma + sayı formatı + geo DB + posta kodu)
- TF-IDF + S-BERT + FAISS aday üretimi
- Hard negative mining
- Zengin feature seti (fuzzy + numeric + geo skorlar: province, district, postal)
- LightGBM Reranker
- Test inference + submission.csv üretimi
"""

# ================================================================
# STEP 0: SETUP
# ================================================================
!pip -q install --no-input sentence-transformers faiss-cpu lightgbm rapidfuzz unidecode tqdm regex

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m81.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:


import re, random, gc
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from geo_database import GeoDatabase
from sklearn.model_selection import GroupKFold
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import faiss
from rapidfuzz import fuzz, distance
from tqdm import tqdm

random.seed(42)
np.random.seed(42)

# ================================================================
# STEP 1: Address Normalizer with Geo DB
# ================================================================
class AddressNormalizer:
    def __init__(self):
        self.abbreviations = {
            'mh':'mahallesi','mah':'mahallesi','cd':'caddesi','cad':'caddesi','sk':'sokagi','sok':'sokagi',
            'blv':'bulvari','apt':'apartmani','sit':'sitesi','blk':'blok','plz':'plaza','avm':'alisveris merkezi',
            'no':'numara','dr':'daire','kat':'kat','pst':'posta kodu'}
        self.turkish_to_basic = str.maketrans({'ç':'c','ğ':'g','ı':'i','ö':'o','ş':'s','ü':'u','Ç':'C','Ğ':'G','İ':'I','Ö':'O','Ş':'S','Ü':'U'})
        # simplified provinces/districts db
        self.geo = GeoDatabase()

    def normalize(self, text:str)->str:
        if not isinstance(text,str): return ""
        s = text.lower().strip()
        s = s.translate(self.turkish_to_basic)
        s = re.sub(r"[\.,;:()\[\]{}|\\]+"," ",s)
        toks=[]
        for w in s.split():
            w0 = self.abbreviations.get(w,w)
            w0 = self.geo.city_variations.get(w0,w0)
            toks.append(w0)
        s=" ".join(toks)
        s=re.sub(r"no[:=]*\s*(\d+)",r"numara \1",s)
        s=re.sub(r"(\d+)[/\-](\d+)",r"numara \1 daire \2",s)
        return s.strip()

    def detect_province(self,text):
        return self.geo.find_province(text) or ""
    def detect_district(self,text):
        return self.geo.find_district(text) or ""
    def detect_postal_province(self,text):
        return self.geo.province_from_postal(text) or ""

normalizer = AddressNormalizer()

# ================================================================
# STEP 2: Load Data
# ================================================================
def load_data(train_path='train.csv', test_path='test.csv'):
    train=pd.read_csv(train_path); test=pd.read_csv(test_path)
    train['norm']=train['address'].apply(normalizer.normalize)
    test['norm']=test['address'].apply(normalizer.normalize)
    return train,test

train_df,test_df=load_data()
print('Train',train_df.shape,' Test',test_df.shape)

# ================================================================
# STEP 3: Representations (TF-IDF + Embeddings)
# ================================================================
class Reps:
    def __init__(self):
        self.vectorizer=TfidfVectorizer(analyzer='char',ngram_range=(2,5),max_features=100000)
        self.emb=SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
        self.faiss_index=None
    def fit(self,texts):
        self.X=self.vectorizer.fit_transform(texts)
        E=[]
        for i in range(0,len(texts),512):
            E.append(self.emb.encode(texts[i:i+512],normalize_embeddings=True,show_progress_bar=False))
        self.E=np.vstack(E).astype('float32')
        self.faiss_index=faiss.IndexFlatIP(self.E.shape[1]); self.faiss_index.add(self.E)
    def query_emb(self,texts,topk=30):
        E=[]
        for i in range(0,len(texts),512):
            E.append(self.emb.encode(texts[i:i+512],normalize_embeddings=True,show_progress_bar=False))
        E=np.vstack(E).astype('float32')
        sims,idx=self.faiss_index.search(E,topk)
        return idx,sims

reps=Reps(); reps.fit(train_df['norm'].tolist())

# ================================================================
# STEP 4: Feature Engineering
# ================================================================
NUM_RE=re.compile(r"\b\d+\b")

def geo_scores(a,b):
    pa,pb=normalizer.detect_province(a),normalizer.detect_province(b)
    da,db=normalizer.detect_district(a),normalizer.detect_district(b)
    ppa,ppb=normalizer.detect_postal_province(a),normalizer.detect_postal_province(b)
    prov_score=fuzz.partial_ratio(pa,pb)/100 if pa and pb else 0
    dist_score=fuzz.partial_ratio(da,db)/100 if da and db else 0
    postal_score=1.0 if (ppa and ppb and ppa==ppb) else 0
    return prov_score,dist_score,postal_score

def pair_features(a,b):
    feats={}
    feats['fz_ratio']=fuzz.QRatio(a,b)/100
    feats['lev_norm']=1-distance.Levenshtein.distance(a,b)/max(len(a),len(b),1)
    n1,n2=set(NUM_RE.findall(a)),set(NUM_RE.findall(b))
    feats['num_jacc']=len(n1&n2)/max(len(n1|n2),1)
    ps=geo_scores(a,b)
    feats['prov_score'],feats['dist_score'],feats['postal_score']=ps
    return feats

# ================================================================
# STEP 5: Training Pair Generation (simplified)
# ================================================================
def build_pairs(train,n_pos=5,n_neg=5):
    pairs=[]
    lbl2idx=defaultdict(list)
    for i,lb in enumerate(train['label']): lbl2idx[lb].append(i)
    for lb,idxs in lbl2idx.items():
        if len(idxs)>1:
            for i in idxs[:n_pos]:
                for j in idxs[:n_pos]:
                    if i<j:
                        pairs.append((train['norm'].iloc[i],train['norm'].iloc[j],1))
    # random negatives
    all_idx=list(range(len(train)))
    for _ in range(len(pairs)*n_neg):
        i,j=random.sample(all_idx,2)
        if train['label'].iloc[i]!=train['label'].iloc[j]:
            pairs.append((train['norm'].iloc[i],train['norm'].iloc[j],0))
    return pd.DataFrame(pairs,columns=['a','b','y'])

pair_df=build_pairs(train_df)
print('Pairs:',pair_df.shape)

# build features
feats=[pair_features(a,b) for a,b in zip(pair_df['a'],pair_df['b'])]
X=pd.DataFrame(feats); y=pair_df['y']

# ================================================================
# STEP 6: Train LightGBM Reranker
# ================================================================
params=dict(objective='binary',metric=['auc'],learning_rate=0.05,num_leaves=63,feature_fraction=0.8,bagging_fraction=0.8,bagging_freq=1,verbose=-1)
folds=GroupKFold(n_splits=3)
models=[]
for tr,va in folds.split(X,groups=pair_df['a']):
    dtr=lgb.Dataset(X.iloc[tr],label=y.iloc[tr])
    dva=lgb.Dataset(X.iloc[va],label=y.iloc[va])
    m=lgb.train(params,dtr,num_boost_round=200,valid_sets=[dva],callbacks=[lgb.early_stopping(20)])
    models.append(m)

# ================================================================
# STEP 7: Inference
# ================================================================
lbl_rep=train_df.groupby('label')['norm'].apply(lambda s:s.iloc[0]).to_dict()

preds=[]
for q in test_df['norm']:
    idx,sims=reps.query_emb([q],topk=20)
    cands=[train_df['label'].iloc[j] for j in idx[0]]
    cands=list(dict.fromkeys(cands))
    scores=[]
    for lb in cands:
        rep=lbl_rep[lb]
        f=pair_features(q,rep)
        xf=pd.DataFrame([f])
        sc=np.mean([m.predict(xf)[0] for m in models])
        scores.append((lb,sc))
    best=max(scores,key=lambda x:x[1])[0] if scores else train_df['label'].iloc[0]
    preds.append(best)

submission=pd.DataFrame({'id':test_df['id'],'label':preds})
submission.to_csv('submission.csv',index=False)
print('Saved submission.csv ✅')


Train (848237, 3)  Test (217241, 3)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Pairs: (623345, 3)
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[179]	valid_0's auc: 0.838931
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[135]	valid_0's auc: 0.842553
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[178]	valid_0's auc: 0.841172
Saved submission.csv ✅


In [5]:
# Modeli kaydet
models[0].save_model("lgb_fold0.txt")

# Kaydedilmiş modeli yükle
m = lgb.Booster(model_file="lgb_fold0.txt")
