In [9]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score


TRAIN_PATH = "/content/test.csv"
TEST_PATH  = "/content/test.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")



Train shape: (217241, 2) | Test shape: (217241, 2)


In [10]:
# Önce CSV dosyasını kontrol edelim
import pandas as pd

# CSV'yi yükle ve ilk birkaç satırı kontrol et
train_df = pd.read_csv("train.csv")
print("Kolon adları:", train_df.columns.tolist())
print("\nİlk 5 satır:")
print(train_df.head())
print("\nDataFrame shape:", train_df.shape)

# Eğer kolon adlarında sorun varsa, temizleyelim
train_df.columns = train_df.columns.str.strip()  # Boşlukları temizle
print("\nTemizlenmiş kolon adları:", train_df.columns.tolist())

Kolon adları: ['address', 'label']

İlk 5 satır:
                                             address  label
0  Akarca Mah. Adnan Menderes Cad. 864.Sok. No:15...   8831
1  Cumhuriye Mah. Hükümet Cad. Sivriler İşhanı No...   8810
2  İsmet inönü mahallesi 2001 sokak no:2 Çeşme be...   3067
3  Gazeteci Hasan Tahsin Caddesi, No:10/3,  Gizem...   8210
4  Bitez mahallesi Adnan Menderes caddesi gündonu...   9675

DataFrame shape: (848237, 2)

Temizlenmiş kolon adları: ['address', 'label']


In [11]:
TRAIN_PATH = "train.csv"
TEST_PATH  = "test.csv"

# 1) Oku
train_df = pd.read_csv(TRAIN_PATH, dtype=str, keep_default_na=False)
test_df  = pd.read_csv(TEST_PATH,  dtype=str, keep_default_na=False)

# 2) Kolon adlarını normalize et: BOM, boşluk, büyük/küçük
def _fix_cols(df):
    df.columns = (
        df.columns.astype(str)
                  .str.replace('\ufeff', '', regex=False)  # BOM temizle
                  .str.strip()
                  .str.lower()
    )
    return df

train_df = _fix_cols(train_df)
test_df  = _fix_cols(test_df)

# 3) Eğer testte id yoksa oluştur
if "id" not in test_df.columns:
    test_df["id"] = np.arange(len(test_df))

# 4) Beklenen kolon adları yoksa alternatifleri dene (örn. 'cluster_id' → 'label', 'adres' → 'address')
alt_map = {}
if "label" not in train_df.columns:
    for cand in ["labels", "cluster_id", "cluster", "etiket", "target", "y"]:
        if cand in train_df.columns:
            alt_map[cand] = "label"; break
if "address" not in train_df.columns:
    for cand in ["adres", "addr"]:
        if cand in train_df.columns:
            alt_map[cand] = "address"; break
if alt_map:
    train_df = train_df.rename(columns=alt_map)

# 5) Son güvenlik: gerçekten var mı?
missing = [c for c in ["address", "label"] if c not in train_df.columns]
if missing:
    raise ValueError(f"train.csv missing columns: {missing}. Found columns: {list(train_df.columns)}")

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")
print("Train columns:", list(train_df.columns))
print("Test columns:", list(test_df.columns))


Train shape: (848237, 2) | Test shape: (217241, 2)
Train columns: ['address', 'label']
Test columns: ['id', 'address']


In [12]:
def clean_address(addr):
    addr = str(addr).lower().strip()
    addr = re.sub(r"[.,;:/\\\-]", " ", addr)
    addr = re.sub(r"\s+", " ", addr)
    addr = addr.replace("mh", "mahallesi").replace("cd", "cadde").replace("sk", "sokak")
    addr = addr.replace("blv", "bulvarı").replace("ap", "apartmanı").replace("no", "numara")
    return addr

tqdm.pandas()
train_df['clean_address'] = train_df['address'].progress_apply(clean_address)
test_df['clean_address']  = test_df['address'].progress_apply(clean_address)

le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])
num_classes = len(le.classes_)
print(f"Toplam sınıf: {num_classes}")

train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.1,
    stratify=train_df['label_enc'],
    random_state=42
)

X_train = train_df.iloc[train_idx]['clean_address'].values
y_train = train_df.iloc[train_idx]['label_enc'].values
X_val   = train_df.iloc[val_idx]['clean_address'].values
y_val   = train_df.iloc[val_idx]['label_enc'].values
X_test  = test_df['clean_address'].values


100%|██████████| 848237/848237 [00:08<00:00, 96066.94it/s]
100%|██████████| 217241/217241 [00:02<00:00, 93556.36it/s]


Toplam sınıf: 10390


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Kernel çökmesini önlemek için max_features = 50000-70000 yeterli
tfidf = TfidfVectorizer(max_features=100000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)

print("TF-IDF hazır!")


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

class AddressDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        x = torch.tensor(self.X[idx].toarray(), dtype=torch.float32).squeeze(0)
        if self.y is not None:
            y = torch.tensor(self.y[idx], dtype=torch.long)
            return x, y
        else:
            return x

train_dataset = AddressDataset(X_train_tfidf, y_train)
val_dataset   = AddressDataset(X_val_tfidf, y_val)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=128, shuffle=False)


In [None]:
import torch.nn as nn

class DeepClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.fc3(x)
        return x

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

model = DeepClassifier(input_dim=100000, num_classes=num_classes).to(device)
criterion = LabelSmoothingLoss(classes=num_classes, smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)


In [None]:
num_epochs = 10

for epoch in range(1, num_epochs+1):
    model.train()
    total_loss = 0
    for xb, yb in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            preds = model(xb)
            preds_labels = preds.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds_labels)
            all_labels.extend(yb.numpy())

    f1 = f1_score(all_labels, all_preds, average='macro')
    print(f"Epoch {epoch}/{num_epochs} | Train Loss: {total_loss/len(train_loader):.4f} | Val Macro F1: {f1:.4f}")


In [None]:
test_dataset = AddressDataset(X_test_tfidf)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

model.eval()
test_preds = []

with torch.no_grad():
    for xb in tqdm(test_loader, desc="Test Prediction"):
        xb = xb.to(device)
        preds = model(xb)
        preds_labels = preds.argmax(dim=1).cpu().numpy()
        test_preds.extend(preds_labels)

test_labels = le.inverse_transform(test_preds)

submission_df = pd.DataFrame({
    "id": test_df["id"],
    "label": test_labels
})

submission_df.to_csv("submission_0.7plus.csv", index=False)
print("✅ Submission dosyası oluşturuldu")


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=34374b7c-5b06-4c6f-aef0-649afe004105' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>

In [1]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [15]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from unidecode import unidecode

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ====================== DATA LOADING ======================
TRAIN_PATH = "/content/train.csv"
TEST_PATH = "/content/test.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")
print(f"Train columns: {train_df.columns.tolist()}")
print(f"Test columns: {test_df.columns.tolist()}")

# ====================== ENHANCED PREPROCESSING ======================

class TurkishAddressPreprocessor:
    def __init__(self):  # ÖNEMLİ: __init__ olmalı, _init_ değil!
        # Comprehensive abbreviation dictionary
        self.abbreviations = {
            # Mahalle variations
            'mah': 'mahallesi', 'mh': 'mahallesi', 'mah.': 'mahallesi', 'mh.': 'mahallesi',
            'mahalle': 'mahallesi', 'mahal': 'mahallesi',

            # Cadde variations
            'cad': 'caddesi', 'cd': 'caddesi', 'cad.': 'caddesi', 'cd.': 'caddesi',
            'cadde': 'caddesi', 'cadd': 'caddesi',

            # Sokak variations
            'sok': 'sokak', 'sk': 'sokak', 'sok.': 'sokak', 'sk.': 'sokak',
            'sokağı': 'sokak', 'sokagi': 'sokak',

            # Bulvar variations
            'blv': 'bulvari', 'bulv': 'bulvari', 'bulvar': 'bulvari',

            # Apartman variations
            'apt': 'apartmani', 'ap': 'apartmani', 'apt.': 'apartmani',
            'apartman': 'apartmani', 'apart': 'apartmani',

            # Other common abbreviations
            'no': 'numara', 'no.': 'numara', 'no:': 'numara',
            'd': 'daire', 'd.': 'daire', 'dai': 'daire', 'dai.': 'daire',
            'k': 'kat', 'k.': 'kat',
            'bl': 'blok', 'blk': 'blok',
            'sit': 'sitesi', 'sit.': 'sitesi',
        }

    def normalize_turkish_chars(self, text):
        """Normalize Turkish characters to ASCII equivalents"""
        # Turkish character mappings
        tr_map = {
            'ı': 'i', 'İ': 'i', 'I': 'i',
            'ğ': 'g', 'Ğ': 'g',
            'ü': 'u', 'Ü': 'u',
            'ş': 's', 'Ş': 's',
            'ö': 'o', 'Ö': 'o',
            'ç': 'c', 'Ç': 'c',
        }

        for tr_char, ascii_char in tr_map.items():
            text = text.replace(tr_char, ascii_char)

        return text

    def clean_address(self, addr):
        """Enhanced address cleaning with Turkish language support"""
        # Convert to string and lowercase
        addr = str(addr).lower().strip()

        # Normalize Turkish characters
        addr = self.normalize_turkish_chars(addr)

        # Remove extra whitespace first
        addr = re.sub(r'\s+', ' ', addr)

        # Normalize number formats before punctuation removal
        # Handle "No:5", "No 5", "no.5" patterns
        addr = re.sub(r'no[\s:.\-]*(\d+)', r'numara \1', addr)
        addr = re.sub(r'(\d+)[\s]*no\b', r'numara \1', addr)

        # Handle "K:2 D:4" patterns
        addr = re.sub(r'k[\s:.\-]*(\d+)', r'kat \1', addr)
        addr = re.sub(r'd[\s:.\-]*(\d+)', r'daire \1', addr)

        # Remove punctuation
        addr = re.sub(r'[.,;:/\\\-]', ' ', addr)

        # Expand abbreviations with word boundaries
        words = addr.split()
        expanded_words = []

        for word in words:
            # Check if word is an abbreviation
            if word in self.abbreviations:
                expanded_words.append(self.abbreviations[word])
            else:
                # Check without trailing dots
                word_clean = word.rstrip('.')
                if word_clean in self.abbreviations:
                    expanded_words.append(self.abbreviations[word_clean])
                else:
                    expanded_words.append(word)

        addr = ' '.join(expanded_words)

        # Remove duplicate spaces again
        addr = re.sub(r'\s+', ' ', addr)

        # Remove duplicate words (e.g., "narlidere narlidere")
        words = addr.split()
        seen = set()
        unique_words = []
        for word in words:
            if word not in seen or word.isdigit():
                unique_words.append(word)
                seen.add(word)

        addr = ' '.join(unique_words)

        return addr.strip()

# Initialize preprocessor
preprocessor = TurkishAddressPreprocessor()

# Apply preprocessing
tqdm.pandas()
train_df['clean_address'] = train_df['address'].progress_apply(preprocessor.clean_address)
test_df['clean_address'] = test_df['address'].progress_apply(preprocessor.clean_address)

# Show some examples
print("\n=== Preprocessing Examples ===")
for i in range(5):
    idx = np.random.randint(len(train_df))
    print(f"\nOriginal: {train_df.iloc[idx]['address']}")
    print(f"Cleaned:  {train_df.iloc[idx]['clean_address']}")

# ====================== LABEL ENCODING ======================
le = LabelEncoder()
train_df['label_enc'] = le.fit_transform(train_df['label'])
num_classes = len(le.classes_)
print(f"\nToplam sınıf: {num_classes}")

# ====================== TRAIN/VAL SPLIT ======================
train_idx, val_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.1,
    stratify=train_df['label_enc'],
    random_state=42
)

X_train = train_df.iloc[train_idx]['clean_address'].values
y_train = train_df.iloc[train_idx]['label_enc'].values
X_val = train_df.iloc[val_idx]['clean_address'].values
y_val = train_df.iloc[val_idx]['label_enc'].values
X_test = test_df['clean_address'].values

# ====================== TF-IDF VECTORIZATION ======================
# Reduced max_features thanks to better preprocessing
tfidf = TfidfVectorizer(
    max_features=100000,  # Reduced from 1M to 100K
    ngram_range=(1, 3),
    min_df=2,           # Ignore very rare terms
    max_df=0.99,        # Ignore very common terms
    sublinear_tf=True,  # Use sublinear scaling
    strip_accents='unicode'
)

print("\nCreating TF-IDF features...")
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF shape: {X_train_tfidf.shape}")
print(f"Vocabulary size reduced to: {len(tfidf.vocabulary_)}")

# ====================== PYTORCH DATASET & MODEL ======================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"\nDevice: {device}")

class AddressDataset(Dataset):
    def __init__(self, X, y=None):  # ÇİFT ALT ÇİZGİ!
        self.X = X
        self.y = y

    def __len__(self):  # ÇİFT ALT ÇİZGİ!
        return self.X.shape[0]

    def __getitem__(self, idx):  # ÇİFT ALT ÇİZGİ!
        x = torch.tensor(self.X[idx].toarray(), dtype=torch.float32).squeeze(0)
        if self.y is not None:
            y = torch.tensor(self.y[idx], dtype=torch.long)
            return x, y
        else:
            return x

# Create datasets
train_dataset = AddressDataset(X_train_tfidf, y_train)
val_dataset = AddressDataset(X_val_tfidf, y_val)

# Create dataloaders with larger batch size (thanks to reduced features)
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=256, shuffle=False, num_workers=2)

class DeepClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):  # ÇİFT ALT ÇİZGİ!
        super().__init__()  # ÇİFT ALT ÇİZGİ!
        self.fc1 = nn.Linear(input_dim, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 256)
        self.bn3 = nn.BatchNorm1d(256)
        self.fc4 = nn.Linear(256, num_classes)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(self.relu(self.bn1(self.fc1(x))))
        x = self.dropout(self.relu(self.bn2(self.fc2(x))))
        x = self.dropout(self.relu(self.bn3(self.fc3(x))))
        x = self.fc4(x)
        return x

class LabelSmoothingLoss(nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):  # ÇİFT ALT ÇİZGİ!
        super().__init__()  # ÇİFT ALT ÇİZGİ!
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

# ====================== MODEL TRAINING ======================
# Initialize model with correct input dimension
input_dim = X_train_tfidf.shape[1]  # This will be 100000 or less
model = DeepClassifier(input_dim=input_dim, num_classes=num_classes).to(device)

criterion = LabelSmoothingLoss(classes=num_classes, smoothing=0.1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

# Training parameters
num_epochs = 15
best_f1 = 0
patience = 3
patience_counter = 0

print("\n=== Starting Training ===")
for epoch in range(1, num_epochs + 1):
    # Training phase
    model.train()
    total_loss = 0
    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs} [Train]")

    for xb, yb in train_pbar:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        total_loss += loss.item()
        train_pbar.set_postfix({'loss': f'{loss.item():.4f}'})

    avg_train_loss = total_loss / len(train_loader)

    # Validation phase
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch}/{num_epochs} [Val]")
        for xb, yb in val_pbar:
            xb = xb.to(device)
            preds = model(xb)
            preds_labels = preds.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds_labels)
            all_labels.extend(yb.numpy())

    # Calculate metrics
    f1 = f1_score(all_labels, all_preds, average='macro')

    print(f"Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Val Macro F1: {f1:.4f}")

    # Learning rate scheduling
    scheduler.step()

    # Early stopping
    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
        # Save best model
        torch.save(model.state_dict(), 'best_model.pth')
        print(f"New best F1: {best_f1:.4f} - Model saved!")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered. Best F1: {best_f1:.4f}")
            break

# Load best model for prediction
model.load_state_dict(torch.load('best_model.pth'))

# ====================== TEST PREDICTION ======================
test_dataset = AddressDataset(X_test_tfidf)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False, num_workers=2)

model.eval()
test_preds = []

print("\n=== Making Test Predictions ===")
with torch.no_grad():
    for xb in tqdm(test_loader, desc="Test Prediction"):
        xb = xb.to(device)
        preds = model(xb)
        preds_labels = preds.argmax(dim=1).cpu().numpy()
        test_preds.extend(preds_labels)

# Convert predictions back to original labels
test_labels = le.inverse_transform(test_preds)

# ====================== CREATE SUBMISSION ======================
submission_df = pd.DataFrame({
    "id": test_df["id"],
    "label": test_labels
})

submission_df.to_csv("submission_optimized.csv", index=False)
print("\n✅ Submission file created: submission_optimized.csv")

# ====================== ANALYSIS ======================
print("\n=== Preprocessing Impact Analysis ===")
print(f"Original average length: {train_df['address'].str.len().mean():.1f} chars")
print(f"Cleaned average length: {train_df['clean_address'].str.len().mean():.1f} chars")
print(f"Reduction: {(1 - train_df['clean_address'].str.len().mean() / train_df['address'].str.len().mean()) * 100:.1f}%")

print(f"\nVocabulary size: {len(tfidf.vocabulary_):,}")
print(f"Memory usage reduced by ~90% compared to 1M features")
print(f"Training speed increased by using batch size 256 instead of 128")

# Show some test predictions
print("\n=== Sample Test Predictions ===")
for i in range(5):
    idx = np.random.randint(len(test_df))
    print(f"\nTest Address: {test_df.iloc[idx]['address']}")
    print(f"Cleaned: {test_df.iloc[idx]['clean_address']}")
    print(f"Predicted Label: {submission_df.iloc[idx]['label']}")

Train shape: (848237, 2) | Test shape: (217241, 2)
Train columns: ['address', 'label']
Test columns: ['id', 'address']


100%|██████████| 848237/848237 [00:26<00:00, 32089.22it/s]
100%|██████████| 217241/217241 [00:06<00:00, 32000.70it/s]



=== Preprocessing Examples ===

Original: kemer mah Ayko 3. Caddesi 14/2 kat 2 daire 4 Efeler
Cleaned:  kemer mahallesi ayko 3 caddesi 14 2 kat 2 daire 4 efeler

Original: 6417/1 sokak No10 Daire5 Kat3 Yalı mahallesi- İzmir. Kuaför’ ün olduğu bina girişi. KARŞIYAKA / İZMİR
Cleaned:  6417 1 sokak numara 10 daire5 kat3 yali mahallesi i̇zmir kuafor’ un oldugu bina girisi karsiyaka i̇zmi̇r

Original: Muğla bodrum umurca mahallesi özgün sokak no 9 daire1
Cleaned:  mugla bodrum umurca mahallesi ozgun sokak numara 9 daire1

Original: UĞUR MUMCU MAH. 1301 SK. NO: 10 D: 23 35660 MENEMEN / İZMİR
Cleaned:  ugur mumcu mahallesi 1301 sokak numara 10 daire 23 35660 menemen i̇zmi̇r

Original: Atatürk Mah Atatürk mahallesi 1039.sok no.6 daire 7
Cleaned:  ataturk mahallesi 1039 sokak numara 6 daire 7

Toplam sınıf: 10390

Creating TF-IDF features...
TF-IDF shape: (763413, 100000)
Vocabulary size reduced to: 100000

Device: cuda

=== Starting Training ===


Epoch 1/15 [Train]: 100%|██████████| 2983/2983 [04:49<00:00, 10.32it/s, loss=3.6178]
Epoch 1/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.29it/s]


Epoch 1 | Train Loss: 5.2522 | Val Macro F1: 0.3680
New best F1: 0.3680 - Model saved!


Epoch 2/15 [Train]: 100%|██████████| 2983/2983 [04:49<00:00, 10.31it/s, loss=3.6777]
Epoch 2/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.22it/s]


Epoch 2 | Train Loss: 3.7500 | Val Macro F1: 0.4362
New best F1: 0.4362 - Model saved!


Epoch 3/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.36it/s, loss=3.6003]
Epoch 3/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.31it/s]


Epoch 3 | Train Loss: 3.4136 | Val Macro F1: 0.4787
New best F1: 0.4787 - Model saved!


Epoch 4/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.34it/s, loss=3.4712]
Epoch 4/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.26it/s]


Epoch 4 | Train Loss: 3.1844 | Val Macro F1: 0.5088
New best F1: 0.5088 - Model saved!


Epoch 5/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.33it/s, loss=3.6675]
Epoch 5/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.42it/s]


Epoch 5 | Train Loss: 2.9818 | Val Macro F1: 0.5392
New best F1: 0.5392 - Model saved!


Epoch 6/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.35it/s, loss=3.4354]
Epoch 6/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.42it/s]


Epoch 6 | Train Loss: 2.7832 | Val Macro F1: 0.5652
New best F1: 0.5652 - Model saved!


Epoch 7/15 [Train]: 100%|██████████| 2983/2983 [04:47<00:00, 10.36it/s, loss=3.4368]
Epoch 7/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.52it/s]


Epoch 7 | Train Loss: 2.5897 | Val Macro F1: 0.5898
New best F1: 0.5898 - Model saved!


Epoch 8/15 [Train]: 100%|██████████| 2983/2983 [04:47<00:00, 10.37it/s, loss=2.7178]
Epoch 8/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.50it/s]


Epoch 8 | Train Loss: 2.4181 | Val Macro F1: 0.6066
New best F1: 0.6066 - Model saved!


Epoch 9/15 [Train]: 100%|██████████| 2983/2983 [04:47<00:00, 10.38it/s, loss=2.7501]
Epoch 9/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.54it/s]


Epoch 9 | Train Loss: 2.2861 | Val Macro F1: 0.6157
New best F1: 0.6157 - Model saved!


Epoch 10/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.36it/s, loss=2.3491]
Epoch 10/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.49it/s]


Epoch 10 | Train Loss: 2.2112 | Val Macro F1: 0.6194
New best F1: 0.6194 - Model saved!


Epoch 11/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.35it/s, loss=2.3777]
Epoch 11/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.50it/s]


Epoch 11 | Train Loss: 2.1863 | Val Macro F1: 0.6198
New best F1: 0.6198 - Model saved!


Epoch 12/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.34it/s, loss=2.4934]
Epoch 12/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.47it/s]


Epoch 12 | Train Loss: 2.1949 | Val Macro F1: 0.6219
New best F1: 0.6219 - Model saved!


Epoch 13/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.33it/s, loss=2.5334]
Epoch 13/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.39it/s]


Epoch 13 | Train Loss: 2.2141 | Val Macro F1: 0.6216


Epoch 14/15 [Train]: 100%|██████████| 2983/2983 [04:48<00:00, 10.33it/s, loss=2.6070]
Epoch 14/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.44it/s]


Epoch 14 | Train Loss: 2.2670 | Val Macro F1: 0.6160


Epoch 15/15 [Train]: 100%|██████████| 2983/2983 [04:49<00:00, 10.32it/s, loss=2.2945]
Epoch 15/15 [Val]: 100%|██████████| 332/332 [00:21<00:00, 15.38it/s]


Epoch 15 | Train Loss: 2.3901 | Val Macro F1: 0.6028
Early stopping triggered. Best F1: 0.6219

=== Making Test Predictions ===


Test Prediction: 100%|██████████| 849/849 [00:53<00:00, 16.02it/s]



✅ Submission file created: submission_optimized.csv

=== Preprocessing Impact Analysis ===
Original average length: 65.0 chars
Cleaned average length: 67.8 chars
Reduction: -4.2%

Vocabulary size: 100,000
Memory usage reduced by ~90% compared to 1M features
Training speed increased by using batch size 256 instead of 128

=== Sample Test Predictions ===

Test Address: 705 sk no 6/1 şehitler mah
Cleaned: 705 sokak numara 6 1 sehitler mahallesi
Predicted Label: 7719

Test Address: osmaniye mah.2.taban sokak no9daire2 kınık/izmir
Cleaned: osmaniye mahallesi 2 taban sokak numara 9daire2 kinik izmir
Predicted Label: 8513

Test Address: Kazımdirik mahallesi 364/8 sok no:4 lumos coffee Lumos coffee avcılar next altı
Cleaned: kazimdirik mahallesi 364 8 sokak numara 4 lumos coffee avcilar next alti
Predicted Label: 5512

Test Address: ORTAKENT YAHŞİ MAH. KARGI CADDESİ SÜMBÜL SOKAK NO 51 CAMEL BEACH REZİDANS
Cleaned: ortakent yahsi̇ mahallesi kargi caddesi̇ sumbul sokak numara 51 camel beach rez