In [1]:
# -*- coding: utf-8 -*-
"""Untitled12.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ecGAOKl2Nmjc9C1zPkF7wHT8lZ4xttPW
"""

# Teknofest x Hepsiburada Address Matching Pipeline
# Complete ML solution for Turkish address deduplication and matching

# ================================================================
# STEP 0: SETUP AND INSTALLATIONS
# ================================================================
!pip install sentence-transformers faiss-cpu lightgbm scikit-learn pandas numpy
!pip install unidecode regex tqdm



In [2]:
import pandas as pd
import numpy as np
import re
import string
from collections import defaultdict, Counter
import pickle
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb

from sentence_transformers import SentenceTransformer
import faiss
from unidecode import unidecode
from tqdm import tqdm
import gc

# Set random seed for reproducibility.
# This seeds NumPy's global random generator only; generators owned by other
# libraries are not affected by this call.
np.random.seed(42)

In [3]:
# ================================================================
# STEP 1: DATA PREPROCESSING & NORMALIZATION
# ================================================================

class TurkishAddressNormalizer:
    """Rule-based normalizer for free-text Turkish addresses.

    ``normalize`` turns a raw address into a lowercase, ASCII-only,
    punctuation-free string with common abbreviations expanded and number
    notations unified. ``extract_components`` is an independent, heuristic
    helper that pulls door number, floor, apartment, block and postal code
    out of a string with regular expressions.
    """

    # Translation table used by normalize_turkish_chars. U+0307 (combining dot
    # above) is deleted because str.lower() turns 'İ' into 'i' + U+0307, and a
    # stray combining mark would later be treated as punctuation and split the
    # word in two.
    _CHAR_TABLE = str.maketrans({
        'ç': 'c', 'ğ': 'g', 'ı': 'i', 'ö': 'o', 'ş': 's', 'ü': 'u',
        'Ç': 'C', 'Ğ': 'G', 'İ': 'I', 'Ö': 'O', 'Ş': 'S', 'Ü': 'U',
        '\u0307': None,
    })

    def __init__(self):
        # Turkish abbreviation mappings (keys are matched against lowercase,
        # punctuation-stripped whole words)
        self.abbreviations = {
            # Neighborhood/District
            'mh': 'mahallesi', 'mah': 'mahallesi', 'mahalle': 'mahallesi',

            # Street types
            'cd': 'caddesi', 'cad': 'caddesi', 'cadde': 'caddesi',
            'sk': 'sokagi', 'sok': 'sokagi', 'sokak': 'sokagi',
            'blv': 'bulvari', 'bulv': 'bulvari', 'bulvar': 'bulvari',
            'osb': 'organize sanayi bolge', 'km': 'kilometre',

            # Building types
            'apt': 'apartmani', 'ap': 'apartmani', 'apartman': 'apartmani',
            'sit': 'sitesi', 'site': 'sitesi',
            'blk': 'blok', 'blok': 'blok',
            'plz': 'plaza', 'plaza': 'plaza',
            'avm': 'alisveris merkezi',

            # Address components
            'no': 'numara', 'nu': 'numara',
            'kt': 'kat', 'kat': 'kat',
            'dr': 'daire', 'daire': 'daire', 'da': 'daire',
            'pst': 'posta kodu',

            # Directions
            'kz': 'kuzey', 'gy': 'guney', 'dt': 'dogu', 'bt': 'bati',

            # Common words
            'yrm': 'yurdu', 'otel': 'oteli', 'hst': 'hastanesi',
            'unv': 'universitesi', 'lise': 'lisesi', 'okl': 'okulu'
        }

        # Primary regex per component. Every pattern requires its keyword;
        # without that anchor each component would simply capture the first
        # number in the text. ``(?![^\W\d_])`` means "keyword is not followed
        # by another letter", so "k3" and "kat:3" match but "kalkan 3" does not.
        # Group 1 is the value; 'number' may also fill group 2 ("12/3").
        self.patterns = {
            'number': r'\b(?:numara|no|nu|n)(?![^\W\d_])[\s:.=]*(\d+)(?:\s*[/\-]\s*(\d+))?',
            'floor': r'\b(?:kat|kt|k)(?![^\W\d_])[\s:.=]*(\d+)',
            'apartment': r'\b(?:daire|dr|d)(?![^\W\d_])[\s:.=]*(\d+)',
            # Value-before-keyword ("a blok") is tried first for blocks so that
            # "c blok d:5" yields block 'c' rather than 'd'.
            'block': r'\b([a-z]\d*|\d+)\s*\.?\s*(?:blok|blk)\b',
            'postal_code': r'\b(\d{5})\b'
        }

        # Secondary regex per component, consulted only when the primary one
        # finds nothing (the opposite keyword/value order, or a bare "12/3").
        self.fallback_patterns = {
            'number': r'\b(\d+)\s*[/\-]\s*(\d+)\b',
            'floor': r'\b(\d+)\s*\.?\s*(?:kat|kt)\b',
            'apartment': r'\b(\d+)\s*\.?\s*(?:daire|dr)\b',
            'block': r'\b(?:blok|blk)(?![^\W\d_])[\s:.=]*([a-z]\d*|\d+)\b'
        }

    def normalize_turkish_chars(self, text):
        """Replace Turkish-specific letters with their ASCII look-alikes.

        Case is preserved ('Ş' -> 'S', 'ş' -> 's'). A combining dot above
        (U+0307) is removed so that already-lowercased input containing
        'i' + U+0307 collapses to a plain 'i'.
        """
        return text.translate(self._CHAR_TABLE)

    def expand_abbreviations(self, text):
        """Expand whole-word abbreviations found in ``self.abbreviations``.

        Each whitespace-separated word is compared after stripping leading and
        trailing ``.,;:()-`` and lowercasing. A matching word is replaced by
        its expansion (the stripped punctuation is dropped); any other word is
        kept exactly as written. Words are re-joined with single spaces.
        """
        return ' '.join(
            self.abbreviations.get(word.strip('.,;:()-').lower(), word)
            for word in text.split()
        )

    def standardize_numbers(self, text):
        """Rewrite common number notations into ``numara``/``daire``/``kat`` form.

        * ``No:12`` / ``no=12`` / ``no.12`` / ``no 12`` -> ``numara 12``; "no"
          must be a standalone token, so words that merely end in "no" are
          left alone.
        * ``12/3`` or ``12-3`` -> ``numara 12 daire 3``; a directly preceding
          ``numara`` is absorbed so the keyword is not duplicated.
        * ``3.kat`` / ``3. daire`` -> ``3 kat`` / ``3 daire``.
        """
        # "no" as its own token, optionally followed by separators, then digits
        text = re.sub(r'\bno(?![^\W\d_])[:\s=.]*(\d+)', r'numara \1', text,
                      flags=re.IGNORECASE)
        # Door/apartment pairs; swallow an already-present "numara" prefix
        text = re.sub(r'(?:\bnumara\s+)?(\d+)[/\-](\d+)', r'numara \1 daire \2',
                      text, flags=re.IGNORECASE)

        # Handle floor patterns
        text = re.sub(r'(\d+)\.?\s*kat', r'\1 kat', text, flags=re.IGNORECASE)

        # Handle apartment patterns
        text = re.sub(r'(\d+)\.?\s*daire', r'\1 daire', text, flags=re.IGNORECASE)

        return text

    def clean_punctuation(self, text):
        """Replace punctuation with spaces and collapse runs of whitespace.

        Word characters, whitespace and Turkish letters are kept; every other
        character becomes a space. The result is stripped at both ends.
        """
        text = re.sub(r'[^\w\sçğıöşüÇĞİÖŞÜ]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_components(self, text):
        """Heuristically extract address components from ``text``.

        For each key in ``self.patterns`` the primary regex is searched
        case-insensitively; if it finds nothing, the regex under the same key
        in ``self.fallback_patterns`` (if any) is tried.

        Returns:
            dict: subset of the keys ``number``, ``floor``, ``apartment``,
            ``block`` and ``postal_code`` mapped to the captured strings.
            When the door number is written as ``12/3`` the second part is
            reported as ``apartment`` unless an explicit apartment keyword
            was also found.
        """
        components = {}
        slash_apartment = None

        for component, pattern in self.patterns.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match is None and component in self.fallback_patterns:
                match = re.search(self.fallback_patterns[component], text,
                                  re.IGNORECASE)
            if match is None:
                continue

            components[component] = match.group(1)
            if component == 'number' and match.group(2):
                slash_apartment = match.group(2)

        # An explicit "daire 5" wins over the "/3" part of a door number.
        if slash_apartment is not None:
            components.setdefault('apartment', slash_apartment)

        return components

    def normalize(self, address):
        """Run the full normalization pipeline on one address.

        Steps, in order: Turkish letters -> ASCII, lowercase, abbreviation
        expansion, number standardization, punctuation cleanup.

        Returns:
            str: the normalized address, or ``""`` when ``address`` is not a
            string (``None``, ``NaN``, numbers, containers, ...).
        """
        # A str is never NA, so the type check alone covers missing values and
        # avoids pd.isna() returning an array for list-like input.
        if not isinstance(address, str):
            return ""

        # Map Turkish letters BEFORE lowercasing: 'İ'.lower() would otherwise
        # produce 'i' + U+0307 and the combining mark would split the word.
        address = self.normalize_turkish_chars(address)
        address = address.lower()

        address = self.expand_abbreviations(address)
        address = self.standardize_numbers(address)
        address = self.clean_punctuation(address)

        return address

# Initialize normalizer
# Module-level instance: load_and_preprocess_data() applies its normalize()
# method to every address it reads.
normalizer = TurkishAddressNormalizer()

# ================================================================
# STEP 2: DATA LOADING AND PREPROCESSING
# ================================================================

def load_and_preprocess_data(train_path='train.csv', test_path='test.csv'):
    """Load the train/test CSVs and add a normalized address column.

    Args:
        train_path: CSV with at least the columns ``address`` and ``label``.
            Defaults to ``'train.csv'`` in the working directory.
        test_path: CSV with at least the column ``address``.
            Defaults to ``'test.csv'`` in the working directory.

    Returns:
        tuple: ``(train_df, test_df)``. Each frame has an extra
        ``normalized_address`` column produced by the module-level
        ``normalizer``; rows whose normalized address is empty are removed
        and the index is reset.
    """
    print("Loading datasets...")

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Unique labels in train: {train_df['label'].nunique()}")

    # Normalize addresses
    print("Normalizing addresses...")
    train_df['normalized_address'] = train_df['address'].apply(normalizer.normalize)
    test_df['normalized_address'] = test_df['address'].apply(normalizer.normalize)

    # Remove empty addresses.
    # NOTE(review): test rows dropped here will have no prediction in any
    # submission built from the returned test frame — confirm that is acceptable.
    train_df = train_df[train_df['normalized_address'].str.len() > 0].reset_index(drop=True)
    test_df = test_df[test_df['normalized_address'].str.len() > 0].reset_index(drop=True)

    print(f"After cleaning - Train: {len(train_df)}, Test: {len(test_df)}")

    return train_df, test_df

# Load data
# Reads train.csv / test.csv from the working directory and keeps the
# normalized frames as notebook-level variables.
train_df, test_df = load_and_preprocess_data()

Loading datasets...
Train shape: (848237, 2)
Test shape: (217241, 2)
Unique labels in train: 10390
Normalizing addresses...
After cleaning - Train: 848234, Test: 217241


In [None]:
# ================================================================
# STEP 3: BASELINE MODEL - TF-IDF CENTROID
# ================================================================

class TFIDFCentroidModel:
    """Nearest-centroid classifier over character n-gram TF-IDF vectors.

    ``fit`` averages the TF-IDF rows of every label into one centroid and
    ``predict`` assigns each address to the label whose centroid has the
    highest cosine similarity.

    Centroids are stored as a SciPy CSR matrix. With ``max_features=None`` the
    n-gram vocabulary is not capped, so a dense ``(n_labels, n_features)``
    array can be far larger than the sparse training matrix itself.
    """

    def __init__(self, ngram_range=(2,11), max_features=None, checkpoint_every=50000):
        """Configure the vectorizer and checkpoint interval.

        Args:
            ngram_range: (min_n, max_n) character n-gram sizes.
            max_features: optional cap on the vocabulary size.
            checkpoint_every: write a checkpoint file each time this many
                labels have been processed in ``fit``; a falsy or
                non-positive value disables checkpoints.
        """
        self.vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=ngram_range,
            max_features=max_features,
            lowercase=True,
            strip_accents='unicode',
            sublinear_tf=True,
            min_df=2,
            max_df=0.95,
            norm='l2',
            use_idf=True,
            smooth_idf=True,
            dtype=np.float32
        )
        self.label_encoder = LabelEncoder()
        self.centroids = None   # CSR matrix (n_labels, n_features) after fit
        self.labels = None      # label_encoder.classes_ after fit
        self.checkpoint_every = checkpoint_every

    def fit(self, addresses, labels):
        """Fit the vectorizer and compute one mean TF-IDF vector per label.

        Args:
            addresses: iterable of address strings.
            labels: iterable of labels, aligned with ``addresses``.

        Side effects:
            Sets ``self.labels`` and ``self.centroids``. Each time a full
            group of ``checkpoint_every`` labels is done, writes
            ``model_checkpoint_<n>.npz`` holding the CSR components of the
            centroids computed so far (``centroids_data``,
            ``centroids_indices``, ``centroids_indptr``, ``centroids_shape``)
            plus ``labels``.
        """
        # Local import so the class does not depend on a file-level import.
        from scipy import sparse

        print("Fitting TF-IDF vectorizer...")
        encoded_labels = np.asarray(self.label_encoder.fit_transform(labels))
        self.labels = self.label_encoder.classes_

        X = self.vectorizer.fit_transform(addresses)
        n_samples = X.shape[0]
        n_labels = len(self.labels)

        # Averaging matrix A (n_labels x n_samples) with A[l, s] = 1/count(l)
        # when sample s carries label l. A @ X is then the per-label mean row,
        # computed without a dense array or one boolean mask per label.
        counts = np.bincount(encoded_labels, minlength=n_labels)
        weights = (1.0 / counts[encoded_labels]).astype(np.float32)
        averaging = sparse.csr_matrix(
            (weights, (encoded_labels, np.arange(n_samples))),
            shape=(n_labels, n_samples),
        )

        print("Computing centroids with checkpoints...")
        step = self.checkpoint_every
        checkpointing = bool(step) and step > 0
        if not checkpointing:
            step = max(n_labels, 1)  # a single block, no checkpoint files

        blocks = []
        for start in range(0, n_labels, step):
            stop = min(start + step, n_labels)
            blocks.append(averaging[start:stop].dot(X))
            # Same trigger points as a per-label counter: multiples of
            # checkpoint_every that do not exceed the number of labels.
            if checkpointing and stop % step == 0:
                checkpoint_path = f'model_checkpoint_{stop}.npz'
                partial = sparse.vstack(blocks, format='csr')
                np.savez_compressed(
                    checkpoint_path,
                    centroids_data=partial.data,
                    centroids_indices=partial.indices,
                    centroids_indptr=partial.indptr,
                    centroids_shape=np.array(partial.shape),
                    labels=self.labels,
                )
                print(f"[Checkpoint] Saved at label {stop} -> {checkpoint_path}")

        if blocks:
            centroids = sparse.vstack(blocks, format='csr')
        else:
            centroids = sparse.csr_matrix((0, X.shape[1]), dtype=np.float32)
        self.centroids = centroids.astype(np.float32, copy=False)
        print(f"Model fitted with {len(self.labels)} labels")

    def predict(self, addresses, top_k=1, batch_size=8192):
        """Predict labels by cosine similarity to the fitted centroids.

        Args:
            addresses: iterable of address strings.
            top_k: number of candidate labels per address.
            batch_size: number of addresses scored at once. Only one
                ``(batch_size, n_labels)`` similarity block is held at a time
                when ``top_k == 1``.

        Returns:
            ``top_k == 1``: array with one label per address.
            otherwise: ``(top_labels, similarities)`` where ``top_labels`` is
            a list with one array of labels per address, ordered from most to
            least similar, and ``similarities`` is the full dense
            ``(n_addresses, n_labels)`` matrix.

        Raises:
            ValueError: if ``batch_size`` is not positive.
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        X = self.vectorizer.transform(addresses)
        n_labels = self.centroids.shape[0]

        best_parts = []      # argmax per batch (top_k == 1)
        top_idx_parts = []   # ranked indices per batch (top_k != 1)
        sim_parts = []       # similarity blocks, kept only when returned

        for start in range(0, X.shape[0], batch_size):
            sims = cosine_similarity(X[start:start + batch_size], self.centroids)
            if top_k == 1:
                best_parts.append(sims.argmax(axis=1))
            else:
                top_idx_parts.append(np.argsort(sims, axis=1)[:, -top_k:][:, ::-1])
                sim_parts.append(sims)

        if top_k == 1:
            if best_parts:
                predictions = np.concatenate(best_parts)
            else:
                predictions = np.empty(0, dtype=np.int64)
            return self.label_encoder.inverse_transform(predictions)

        top_labels = [
            self.label_encoder.inverse_transform(indices)
            for part in top_idx_parts
            for indices in part
        ]
        if sim_parts:
            similarities = np.vstack(sim_parts)
        else:
            similarities = np.empty((0, n_labels), dtype=np.float32)
        return top_labels, similarities

# ================================================================
# TRAIN AND SAVE
# ================================================================

# Build the baseline and fit it on the normalized training addresses.
tfidf_model = TFIDFCentroidModel(checkpoint_every=50000)
tfidf_model.fit(
    addresses=train_df['normalized_address'],
    labels=train_df['label'],
)

# With the default top_k=1 this yields a single label per test address.
tfidf_predictions = tfidf_model.predict(addresses=test_df['normalized_address'])

# Pair every test id with its predicted label and write the CSV without the
# DataFrame index.
baseline_submission = pd.DataFrame(
    data={'id': test_df['id'], 'label': tfidf_predictions}
)
baseline_submission.to_csv(path_or_buf='baseline_submission.csv', index=False)
print("Baseline submission saved with checkpoint support!")

Fitting TF-IDF vectorizer...
