In [1]:

!pip install lightgbm --quiet

import pandas as pd
import numpy as np
import re, unicodedata
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb



In [2]:

TRAIN_PATH = "/content/train.csv"
TEST_PATH  = "/content/test.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)
print("Shapes ->", train_df.shape, test_df.shape)

ABBREVIATIONS = {
    r"\bmah\b": "mahallesi", r"\bmh\b": "mahallesi", r"\bmhl\b": "mahallesi",
    r"\bcad\b": "cadde", r"\bcd\b": "cadde", r"\bcadd\b": "cadde",
    r"\bsok\b": "sokak", r"\bsk\b": "sokak",
    r"\bblv\b": "bulvari", r"\bbulv\b": "bulvari",
    r"\bap\b": "apartmani", r"\bapt\b": "apartmani",
    r"\bno\b": "numara", r"\bnr\b": "numara",
    r"\bkat\b": "kat", r"\bsit\b": "sitesi", r"\bsitesi\b": "sitesi",
    r"\bilc\b": "ilce", r"\bköy\b": "koyu"
}

def clean_address(addr):
    if pd.isna(addr): return ""
    addr = str(addr)
    addr = unicodedata.normalize("NFKC", addr)
    addr = addr.lower()
    addr = re.sub(r"[^\w\sğüşöçıİĞÜŞÖÇ]", " ", addr)
    addr = re.sub(r"\s+", " ", addr).strip()
    addr = addr.replace("ı", "i").replace("ğ","g").replace("ş","s")\
               .replace("ç","c").replace("ö","o").replace("ü","u")
    for pat, repl in ABBREVIATIONS.items():
        addr = re.sub(pat, repl, addr)
    addr = re.sub(r"\s+", " ", addr).strip()
    return addr

tqdm.pandas()
train_df["clean_address"] = train_df["address"].progress_apply(clean_address)
test_df["clean_address"]  = test_df["address"].progress_apply(clean_address)


le = LabelEncoder()
train_df["label_enc"] = le.fit_transform(train_df["label"])
num_classes = len(le.classes_)
print("Num classes:", num_classes)



Shapes -> (848237, 2) (217241, 2)


100%|██████████| 848237/848237 [00:38<00:00, 21801.87it/s]
100%|██████████| 217241/217241 [00:09<00:00, 22337.93it/s]


Num classes: 10390


In [3]:

train_idx, val_idx = train_test_split(
    np.arange(len(train_df)), test_size=0.1, stratify=train_df['label_enc'], random_state=42
)

X_train = train_df.iloc[train_idx]['clean_address'].values
y_train = train_df.iloc[train_idx]['label_enc'].values
X_val   = train_df.iloc[val_idx]['clean_address'].values
y_val   = train_df.iloc[val_idx]['label_enc'].values
X_test  = test_df['clean_address'].values


In [4]:

word_vec = TfidfVectorizer(
    max_features=150000, ngram_range=(1,2), analyzer="word", sublinear_tf=True, min_df=3
)
char_vec = TfidfVectorizer(
    max_features=50000, ngram_range=(3,5), analyzer="char", sublinear_tf=True, min_df=3
)

X_train_word = word_vec.fit_transform(X_train)
X_val_word   = word_vec.transform(X_val)
X_test_word  = word_vec.transform(X_test)

X_train_char = char_vec.fit_transform(X_train)
X_val_char   = char_vec.transform(X_val)
X_test_char  = char_vec.transform(X_test)

from scipy.sparse import hstack
X_train_fused = hstack([X_train_word, X_train_char])
X_val_fused   = hstack([X_val_word, X_val_char])
X_test_fused  = hstack([X_test_word, X_test_char])

print("Fused TF-IDF shape:", X_train_fused.shape)


Fused TF-IDF shape: (763413, 200000)


In [5]:

svd = TruncatedSVD(n_components=1024, random_state=42)

X_train_red = svd.fit_transform(X_train_fused).astype(np.float32)
X_val_red   = svd.transform(X_val_fused).astype(np.float32)
X_test_red  = svd.transform(X_test_fused).astype(np.float32)

print("Reduced shapes:", X_train_red.shape, X_val_red.shape, X_test_red.shape)


Reduced shapes: (763413, 1024) (84824, 1024) (217241, 1024)


In [6]:

nb = MultinomialNB(alpha=0.3)
nb.fit(X_train_word, y_train)

val_preds_nb = nb.predict(X_val_word)
f1_nb = f1_score(y_val, val_preds_nb, average="macro")
print("Naive Bayes Val F1:", f1_nb)

nb_val_probs  = nb.predict_proba(X_val_word)
nb_test_probs = nb.predict_proba(X_test_word)



Naive Bayes Val F1: 0.4471882037419752


In [None]:
lgb_train = lgb.Dataset(X_train_red, y_train)
lgb_val   = lgb.Dataset(X_val_red, y_val)

params = {
    "objective": "multiclass",
    "num_class": num_classes,
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "min_data_in_leaf": 30,
    "max_depth": -1,
    "metric": "None",
    "verbosity": -1
}

def lgb_f1(preds, data):
    labels = data.get_label().astype(int)
    preds  = preds.reshape(num_classes, -1).T.argmax(axis=1)
    return "f1_macro", f1_score(labels, preds, average="macro"), True

model_lgb = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_val],
    num_boost_round=1000,
    feval=lgb_f1,
    callbacks=[lgb.early_stopping(stopping_rounds=50),
               lgb.log_evaluation(period=50)]
)

val_preds_lgb = np.argmax(model_lgb.predict(X_val_red), axis=1)
f1_lgb = f1_score(y_val, val_preds_lgb, average="macro")
print("LightGBM Val F1:", f1_lgb)

lgb_val_probs  = model_lgb.predict(X_val_red)
lgb_test_probs = model_lgb.predict(X_test_red)

Training until validation scores don't improve for 50 rounds
[50]	valid_0's f1_macro: 3.53657e-05


In [None]:
# Cell 8: Stacking (NB + LightGBM)
meta_X_val = np.hstack([nb_val_probs, lgb_val_probs])
meta_X_test = np.hstack([nb_test_probs, lgb_test_probs])

meta = LogisticRegression(max_iter=200, class_weight="balanced")
meta.fit(meta_X_val, y_val)

val_preds_stack = meta.predict(meta_X_val)
f1_stack = f1_score(y_val, val_preds_stack, average="macro")
print("Stacking (NB+LGB) Val F1:", f1_stack)



In [None]:

final_preds = meta.predict(meta_X_test)
final_labels = le.inverse_transform(final_preds)

submission = pd.DataFrame({"id": test_df["id"], "label": final_labels})
submission.to_csv("submission_stack.csv", index=False)
print("Submission saved! 🎉")
