In [3]:
!pip install pandas numpy scikit-learn lightgbm xgboost \
            rapidfuzz unidecode faiss-cpu \
            sentence-transformers networkx

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m75.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: u

In [6]:
# turkish_address_pipeline.py
import pandas as pd, numpy as np, re, string, unidecode, json
from collections import defaultdict
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from rapidfuzz import fuzz
import lightgbm as lgb
import faiss
from sentence_transformers import SentenceTransformer
import networkx as nx

##############################################
# 1. PREPROCESSING
##############################################
# Abbreviation -> full word table used by normalize_text, which replaces each
# key with its value when the key appears as a whole word (regex \b...\b).
# Keys are written in lower-case ASCII because normalize_text applies them
# after lower-casing and unidecode transliteration.
ABBREV_MAP = {
    "mah":"mahalle", "mh":"mahalle",
    "cd":"caddesi", "cad":"caddesi",
    "sk":"sokak", "sok":"sokak",
    "apt":"apartman", "ap":"apartman",
    "blv":"bulvar", "bul":"bulvar"
}

def normalize_text(s: str) -> str:
    """Return a canonical lower-case ASCII form of an address string.

    Steps, in order: lower-case, transliterate with ``unidecode``, expand the
    whole-word abbreviations listed in ``ABBREV_MAP``, replace every character
    other than ``a-z``, ``0-9`` and whitespace with a space, then collapse
    whitespace runs and trim the ends.

    Args:
        s: Raw address text; a missing value (per ``pd.isna``) is accepted.

    Returns:
        The normalized string, or ``""`` when ``s`` is missing.
    """
    if pd.isna(s):
        return ""

    # Lower-case first, then transliterate accented/Turkish letters to ASCII.
    text = unidecode.unidecode(s.lower())

    # Abbreviations are expanded before punctuation is stripped; "\b" still
    # matches next to punctuation such as the dot in "mah.".
    for abbrev, expansion in ABBREV_MAP.items():
        text = re.sub(rf"\b{abbrev}\b", expansion, text)

    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

def preprocess(df):
    """Add an ``address_norm`` column with the normalized form of ``address``.

    The frame is modified in place and also returned, so both
    ``preprocess(df)`` and ``df = preprocess(df)`` work.

    Args:
        df: DataFrame with an ``address`` column.

    Returns:
        The same DataFrame object, with ``address_norm`` added or overwritten.

    Raises:
        KeyError: if ``df`` has no ``address`` column.
    """
    # Only real values are coerced to str. Casting the whole column with
    # astype(str) would turn NaN/None into the literal text "nan"/"None",
    # which skips normalize_text's missing-value guard and makes every missing
    # address look like the same (perfectly matching) address.
    normalized = [
        normalize_text(value if pd.isna(value) else str(value))
        for value in df["address"]
    ]
    # Explicit index/dtype keeps the column aligned and object-typed even for
    # an empty frame.
    df["address_norm"] = pd.Series(normalized, index=df.index, dtype=object)
    return df

##############################################
# 2. PAIR GENERATION
##############################################
def make_pairs(train_df, n_pos=50000, n_neg=50000, seed=42):
    """Sample labelled record pairs for training the pairwise matcher.

    Positive pairs (label 1) chain consecutive members of a random sample of
    at most 20 records from each cluster. Negative pairs (label 0) are random
    pairs of distinct records whose ``cluster_id`` values differ; no
    geographic blocking is applied.

    Args:
        train_df: DataFrame with a ``cluster_id`` column. Pair ids are taken
            from its index labels.
        n_pos: Upper bound on the number of positive pairs returned. When more
            are generated, a random subset of this size is kept.
        n_neg: Number of negative pairs to return. Fewer are returned only if
            the data cannot supply them within a bounded number of sampling
            rounds (for example when every record is in the same cluster).
        seed: Seed for a local random generator; the global NumPy random
            state is left untouched.

    Returns:
        A list of ``(id1, id2, label)`` tuples, positives first.

    Raises:
        KeyError: if ``train_df`` has no ``cluster_id`` column.
    """
    if "cluster_id" not in train_df.columns:
        raise KeyError(
            "make_pairs needs a 'cluster_id' column; available columns: "
            f"{list(train_df.columns)}"
        )

    rng = np.random.default_rng(seed)

    # positive pairs: link neighbours inside a shuffled sample of each cluster
    pos_pairs = []
    for _, group in train_df.groupby("cluster_id"):
        members = group.index.to_numpy()
        if len(members) < 2:
            continue
        chosen = rng.permutation(members)[:20]
        pos_pairs.extend((a, b, 1) for a, b in zip(chosen[:-1], chosen[1:]))
    if len(pos_pairs) > n_pos:
        keep = np.sort(rng.choice(len(pos_pairs), size=n_pos, replace=False))
        pos_pairs = [pos_pairs[k] for k in keep]

    # negative pairs: draw row positions in bulk and keep cross-cluster pairs
    labels = train_df.index.to_numpy()
    clusters = train_df["cluster_id"].to_numpy()
    n_rows = len(labels)
    neg_pairs = []
    if n_rows >= 2:
        # Rejected draws are re-drawn, but only for a bounded number of rounds
        # so degenerate data cannot loop forever.
        for _ in range(20):
            need = n_neg - len(neg_pairs)
            if need <= 0:
                break
            left = rng.integers(0, n_rows, size=need)
            right = rng.integers(0, n_rows, size=need)
            ok = (left != right) & (clusters[left] != clusters[right])
            neg_pairs.extend(
                (labels[a], labels[b], 0) for a, b in zip(left[ok], right[ok])
            )

    return pos_pairs + neg_pairs

##############################################
# 3. FEATURE ENGINEERING
##############################################
# Sentence-embedding model, instantiated once at module level and shared by
# build_feature_matrix and the main block (both call model_sbert.encode).
model_sbert = SentenceTransformer("emrecan/bert-base-turkish-cased-mean-nli-stsb-tr")

def jaccard_ngram(a,b,n=3):
    """Jaccard similarity between the character n-gram sets of two strings.

    Args:
        a: First string.
        b: Second string.
        n: Length of the character n-grams.

    Returns:
        ``|A & B| / |A | B|`` for the two n-gram sets, or ``0`` when either
        string yields no n-grams (e.g. it is shorter than ``n``).
    """
    def shingles(text):
        # Every contiguous substring of length n, de-duplicated.
        return set(text[start:start+n] for start in range(len(text)-n+1))

    grams_a = shingles(a)
    grams_b = shingles(b)
    if grams_a and grams_b:
        shared = grams_a & grams_b
        combined = grams_a | grams_b
        return len(shared)/len(combined)
    return 0

def feature_vector(a1,a2,vec1,vec2):
    """Compute the similarity features for one pair of addresses.

    Args:
        a1: First address string.
        a2: Second address string.
        vec1: Embedding vector for ``a1``.
        vec2: Embedding vector for ``a2``.

    Returns:
        A dict with, in insertion order: ``fuzz_ratio``, ``fuzz_partial``,
        ``fuzz_token_sort`` (rapidfuzz scores divided by 100), ``jaccard_2``,
        ``jaccard_3``, ``jaccard_4`` (character n-gram Jaccard), ``cosine``
        (cosine similarity of the two vectors) and ``len_diff`` (absolute
        length difference divided by the longer length).
    """
    # Key order is kept stable: it becomes the column order of the frames
    # built from these dicts.
    out = {
        "fuzz_ratio": fuzz.ratio(a1,a2)/100,
        "fuzz_partial": fuzz.partial_ratio(a1,a2)/100,
        "fuzz_token_sort": fuzz.token_sort_ratio(a1,a2)/100,
    }
    out.update((f"jaccard_{size}", jaccard_ngram(a1,a2,size)) for size in (2,3,4))

    # cosine similarity; the small epsilon avoids division by zero
    norm_product = np.linalg.norm(vec1)*np.linalg.norm(vec2)+1e-9
    out["cosine"] = np.dot(vec1,vec2)/norm_product

    longest = max(len(a1),len(a2))
    out["len_diff"] = abs(len(a1)-len(a2))/(longest+1e-9)
    return out

def build_feature_matrix(train_df, pairs):
    """Build the pairwise training table for a list of labelled pairs.

    Encodes every ``address_norm`` value with ``model_sbert`` and computes
    ``feature_vector`` for each pair.

    Args:
        train_df: DataFrame with an ``address_norm`` column.
        pairs: Iterable of ``(id1, id2, y)`` tuples whose ids are index labels
            of ``train_df``.

    Returns:
        A DataFrame with one row per pair: the feature columns plus ``y``,
        ``id1`` and ``id2``.

    Raises:
        KeyError: if a pair id is not present in ``train_df.index``.
    """
    addr = train_df["address_norm"].tolist()
    emb = model_sbert.encode(addr, batch_size=64, show_progress_bar=True)

    # Pair ids are index labels, while addr/emb are ordered by row position.
    # Translate explicitly so a filtered or re-indexed frame does not read the
    # wrong rows; with a default RangeIndex label and position coincide.
    position_of = {label: pos for pos, label in enumerate(train_df.index)}

    rows=[]
    for i,j,y in pairs:
        pi, pj = position_of[i], position_of[j]
        feats = feature_vector(addr[pi], addr[pj], emb[pi], emb[pj])
        feats["y"]=y; feats["id1"]=i; feats["id2"]=j
        rows.append(feats)
    return pd.DataFrame(rows)

##############################################
# 4. MODEL TRAINING
##############################################
def train_model(feats):
    """Cross-validate and fit the LightGBM pair classifier.

    Runs 5-fold ``GroupKFold`` (grouped by ``id1``), prints the mean F1 over
    the folds, then refits on all rows.

    Args:
        feats: DataFrame with feature columns plus ``y`` (label), ``id1`` and
            ``id2``.

    Returns:
        The ``LGBMClassifier`` fitted on every row of ``feats``.
    """
    X = feats.drop(columns=["y","id1","id2"])
    y = feats["y"]
    lgbm = lgb.LGBMClassifier(
        n_estimators=500, learning_rate=0.05,
        num_leaves=63, subsample=0.8, colsample_bytree=0.8,
        # LightGBM only applies `subsample` when bagging is enabled through a
        # non-zero frequency; with the default of 0 the 0.8 was silently unused.
        subsample_freq=1,
    )
    # Grouping by id1 keeps all pairs that share a first record in one fold.
    cv = GroupKFold(n_splits=5)
    scores=[]
    for train_idx, val_idx in cv.split(X,y,groups=feats["id1"]):
        lgbm.fit(X.iloc[train_idx], y.iloc[train_idx])
        preds = lgbm.predict(X.iloc[val_idx])
        scores.append(f1_score(y.iloc[val_idx], preds))
    print("CV F1:", np.mean(scores))
    # Final model: refit on the full pair table.
    lgbm.fit(X,y)
    return lgbm

##############################################
# 5. CANDIDATE RETRIEVAL (FAISS)
##############################################
def build_faiss_index(embeddings):
    """Build a flat inner-product FAISS index over L2-normalized embeddings.

    Args:
        embeddings: 2-D array-like of shape (n_records, dim).

    Returns:
        A ``faiss.IndexFlatIP`` containing the normalized vectors.
    """
    # Work on a private C-contiguous float32 copy: normalize_L2 rewrites its
    # argument in place, so this leaves the caller's array untouched and also
    # accepts inputs that are not already float32.
    vectors = np.array(embeddings, dtype=np.float32, order="C")
    index = faiss.IndexFlatIP(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index

def get_candidates(index, emb, k=20):
    """Look up the ``k`` nearest indexed vectors for each query vector.

    Args:
        index: FAISS index to search.
        emb: Query embeddings, shape (n_queries, dim); a single 1-D vector is
            treated as one query.
        k: Number of neighbours to request per query.

    Returns:
        The neighbour-id array returned by ``index.search`` (one row per
        query); distances are discarded.
    """
    # Normalize a private float32 copy so the caller's array is not modified
    # in place; ndmin=2 promotes a lone vector to a one-row batch.
    queries = np.array(emb, dtype=np.float32, order="C", ndmin=2)
    faiss.normalize_L2(queries)
    _, neighbours = index.search(queries, k)
    return neighbours

##############################################
# 6. CLUSTERING
##############################################
def clustering(test_df, preds, thresh=0.6):
    """Assign a cluster label to every record from pairwise match scores.

    Records are graph nodes; each pair with ``prob > thresh`` adds an edge,
    and every connected component becomes one cluster.

    Args:
        test_df: DataFrame of records; only its length is used. Record ids in
            ``preds`` are row positions ``0 .. len(test_df) - 1``.
        preds: DataFrame with ``id1``, ``id2`` and ``prob`` columns (may be
            empty).
        thresh: Minimum probability (exclusive) for two records to be linked.

    Returns:
        A list of integer cluster labels, one per row of ``test_df``. A record
        with no link above the threshold gets its own label.
    """
    n_records = len(test_df)
    G = nx.Graph()
    # Register every record up front so unmatched ones become singleton
    # components. Previously they all received the shared label -1, which
    # merged every unmatched record into one cluster.
    G.add_nodes_from(range(n_records))

    if len(preds):
        matched = preds[preds["prob"] > thresh]
        # Cast ids to plain ints so node keys match the integer row positions
        # whatever dtype the id columns were stored with.
        G.add_edges_from(zip(
            matched["id1"].astype(int).tolist(),
            matched["id2"].astype(int).tolist(),
        ))

    cluster_map = {}
    for cid, comp in enumerate(nx.connected_components(G)):
        for idx in comp:
            cluster_map[idx] = cid
    return [cluster_map[i] for i in range(n_records)]

##############################################
# 7. MAIN EXECUTION
##############################################
# End-to-end run: train the pair classifier on train.csv, then retrieve
# candidate pairs for test.csv, score them, cluster, and write the submission.
if __name__=="__main__":
    train = pd.read_csv("train.csv")
    test  = pd.read_csv("test.csv")

    # Fail fast, before the slow encoding steps, if an expected column is
    # missing, and show which columns the file actually has.
    for name, frame, required in (
        ("train.csv", train, ("address", "cluster_id")),
        ("test.csv", test, ("address", "record_id")),
    ):
        missing = [col for col in required if col not in frame.columns]
        if missing:
            raise KeyError(
                f"{name} is missing column(s) {missing}; found {list(frame.columns)}"
            )

    train = preprocess(train)
    test = preprocess(test)

    # pairs + features
    pairs = make_pairs(train)
    feats = build_feature_matrix(train, pairs)
    model = train_model(feats)

    # embeddings for test
    test_addr = test["address_norm"].tolist()
    test_emb = model_sbert.encode(test_addr, batch_size=64, show_progress_bar=True)
    index = build_faiss_index(test_emb)

    # Candidate pairs from one batched search. Each pair is stored once as
    # (low, high), so a pair found only in the higher-indexed record's
    # neighbour list is kept as well. Negative ids are skipped as a guard
    # (presumably FAISS pads short result lists with -1 - confirm).
    neighbours = get_candidates(index, test_emb, k=5)
    candidate_pairs = sorted({
        (min(i, int(j)), max(i, int(j)))
        for i, row in enumerate(neighbours)
        for j in row
        if j >= 0 and int(j) != i
    })

    # Score all candidates with a single predict_proba call.
    rows = [
        feature_vector(test_addr[i], test_addr[j], test_emb[i], test_emb[j])
        for i, j in candidate_pairs
    ]
    probs = model.predict_proba(pd.DataFrame(rows))[:, 1] if rows else []
    preds = pd.DataFrame({
        "id1": [i for i, _ in candidate_pairs],
        "id2": [j for _, j in candidate_pairs],
        "prob": probs,
    })

    # clustering
    labels = clustering(test,preds,thresh=0.6)
    submission = pd.DataFrame({"record_id":test["record_id"],"cluster_id":labels})
    submission_path = "sample_submission.csv"
    submission.to_csv(submission_path,index=False)
    # Report the file that was actually written.
    print(f"Saved {submission_path}")


KeyError: 'cluster_id'