In [1]:
# Google Colab'da çalıştırmak için:
!pip install pandas numpy fuzzywuzzy scikit-learn folium geopy geopandas

# Dosyaları yükle
# train.csv, test.csv dosyalarını upload et
!pip install unidecode fuzzywuzzy folium geopy geopandas

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0
Collecting unidecode
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.4.0-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.4.0


In [2]:
import pandas as pd
import numpy as np
import re
from typing import Dict, List, Tuple, Optional, Set
from unidecode import unidecode
from fuzzywuzzy import fuzz, process
from collections import defaultdict
import json

class TurkishAddressProcessor:
    """
    Comprehensive Turkish Address Processing Pipeline
    Türkçe adres işleme için gelişmiş ve kapsamlı bir sınıf
    """

    def __init__(self):
        """Initialize with comprehensive Turkish-specific configurations"""

        # Turkish character mapping (extended)
        self.char_mapping = {
            'ç': 'c', 'Ç': 'C',
            'ğ': 'g', 'Ğ': 'G',
            'ı': 'i', 'İ': 'I', 'i̇': 'i',  # Handle different i variations
            'ö': 'o', 'Ö': 'O',
            'ş': 's', 'Ş': 'S',
            'ü': 'u', 'Ü': 'U',
            'â': 'a', 'Â': 'A',  # Circumflex characters
            'î': 'i', 'Î': 'I',
            'û': 'u', 'Û': 'U'
        }

        # Comprehensive abbreviations dictionary
        self.abbreviations = {
            # Mahalle/District
            'mah': 'mahallesi', 'mh': 'mahallesi', 'mahl': 'mahallesi',
            'mah.': 'mahallesi', 'mh.': 'mahallesi', 'mahall': 'mahallesi',

            # Cadde/Street
            'cd': 'caddesi', 'cad': 'caddesi', 'cadde': 'caddesi',
            'cd.': 'caddesi', 'cad.': 'caddesi', 'cadd': 'caddesi',

            # Sokak/Street
            'sk': 'sokak', 'sok': 'sokak', 'sk.': 'sokak', 'sok.': 'sokak',
            'sokağı': 'sokak', 'sokagi': 'sokak',

            # Bulvar/Boulevard
            'bulv': 'bulvar', 'blv': 'bulvar', 'bul': 'bulvar',
            'bulv.': 'bulvar', 'blv.': 'bulvar', 'bulvarı': 'bulvar',

            # Apartman/Building
            'apt': 'apartmani', 'ap': 'apartmani', 'apt.': 'apartmani',
            'apart': 'apartmani', 'apartm': 'apartmani',

            # Numara/Number
            'no': 'numara', 'no.': 'numara', 'nu': 'numara',

            # Daire/Flat
            'd': 'daire', 'd.': 'daire', 'da': 'daire', 'dai': 'daire',

            # Kat/Floor
            'k': 'kat', 'k.': 'kat', 'kt': 'kat',

            # Blok/Block
            'bl': 'blok', 'blk': 'blok', 'bl.': 'blok',

            # Site/Complex
            'sit': 'sitesi', 'site': 'sitesi', 'st': 'sitesi',

            # İş Merkezi/Business Center
            'iş mrk': 'is merkezi', 'is mrk': 'is merkezi',
            'iş merk': 'is merkezi', 'is merk': 'is merkezi',

            # Plaza
            'plz': 'plaza', 'plaz': 'plaza',

            # Köy/Village
            'ky': 'koyu', 'köy': 'koyu', 'koy': 'koyu',

            # Mevki/Location
            'mevk': 'mevkii', 'mvk': 'mevkii', 'mev': 'mevkii',

            # Others
            'san': 'sanayi', 'sanay': 'sanayi',
            'org': 'organize', 'osb': 'organize sanayi bolgesi',
            'koop': 'kooperatifi', 'kop': 'kooperatifi',
            'müd': 'mudurlugu', 'mud': 'mudurlugu',
            'mrk': 'merkez', 'merk': 'merkez',
            'ünv': 'universitesi', 'unv': 'universitesi', 'üni': 'universitesi',
            'hst': 'hastanesi', 'has': 'hastanesi',
            'ilk': 'ilkokulu', 'ilkok': 'ilkokulu',
            'ort': 'ortaokulu', 'ortaok': 'ortaokulu',
            'lis': 'lisesi', 'lise': 'lisesi'
        }

        # Extended typo corrections with city districts
        self.typo_corrections = {
            # Istanbul districts
            'uskudar': 'uskudar', 'üsküdar': 'uskudar', 'uskudr': 'uskudar',
            'kadikoy': 'kadikoy', 'kadıköy': 'kadikoy', 'kadiköy': 'kadikoy',
            'besiktas': 'besiktas', 'beşiktaş': 'besiktas', 'besiktaş': 'besiktas',
            'sisli': 'sisli', 'şişli': 'sisli', 'şişl': 'sisli',
            'beyoglu': 'beyoglu', 'beyoğlu': 'beyoglu', 'beyogl': 'beyoglu',
            'fatih': 'fatih', 'fatıh': 'fatih',
            'maltepe': 'maltepe', 'maltep': 'maltepe',
            'kartal': 'kartal', 'kartl': 'kartal',
            'pendik': 'pendik', 'pendık': 'pendik',
            'tuzla': 'tuzla', 'tuzl': 'tuzla',
            'cekmekoy': 'cekmekoy', 'çekmeköy': 'cekmekoy', 'cekmeköy': 'cekmekoy',
            'sancaktepe': 'sancaktepe', 'sancaktep': 'sancaktepe',
            'sultanbeyli': 'sultanbeyli', 'sultanbeylie': 'sultanbeyli',
            'umraniye': 'umraniye', 'ümraniye': 'umraniye', 'umranıye': 'umraniye',
            'atasehir': 'atasehir', 'ataşehir': 'atasehir', 'atasehır': 'atasehir',
            'bagcilar': 'bagcilar', 'bağcılar': 'bagcilar', 'bagcılar': 'bagcilar',
            'bahcelievler': 'bahcelievler', 'bahçelievler': 'bahcelievler',
            'bakirkoy': 'bakirkoy', 'bakırköy': 'bakirkoy', 'bakirköy': 'bakirkoy',
            'basaksehir': 'basaksehir', 'başakşehir': 'basaksehir',
            'bayrampasa': 'bayrampasa', 'bayrampaşa': 'bayrampasa',
            'esenler': 'esenler', 'esenlr': 'esenler',
            'esenyurt': 'esenyurt', 'esenyrt': 'esenyurt',
            'eyup': 'eyup', 'eyüp': 'eyup', 'eyupsultan': 'eyupsultan',
            'gaziosmanpasa': 'gaziosmanpasa', 'gaziosmanpaşa': 'gaziosmanpasa',
            'gungoren': 'gungoren', 'güngören': 'gungoren',
            'kucukcekmece': 'kucukcekmece', 'küçükçekmece': 'kucukcekmece',
            'sariyer': 'sariyer', 'sarıyer': 'sariyer', 'saryer': 'sariyer',
            'sultangazi': 'sultangazi', 'sultangaz': 'sultangazi',
            'zeytinburnu': 'zeytinburnu', 'zeytinbrnu': 'zeytinburnu',
            'avcilar': 'avcilar', 'avcılar': 'avcilar',
            'beylikduzu': 'beylikduzu', 'beylikdüzü': 'beylikduzu',
            'buyukcekmece': 'buyukcekmece', 'büyükçekmece': 'buyukcekmece',
            'kagithane': 'kagithane', 'kağıthane': 'kagithane',

            # Major cities
            'ankara': 'ankara', 'ankra': 'ankara', 'ank': 'ankara',
            'izmir': 'izmir', 'izmır': 'izmir', 'izmr': 'izmir',
            'istanbul': 'istanbul', 'istanbl': 'istanbul', 'ist': 'istanbul',
            'bursa': 'bursa', 'brsa': 'bursa',
            'antalya': 'antalya', 'antalyaa': 'antalya', 'antaly': 'antalya',
            'adana': 'adana', 'adna': 'adana',
            'konya': 'konya', 'knya': 'konya',
            'gaziantep': 'gaziantep', 'gazi antep': 'gaziantep', 'antep': 'gaziantep',
            'kayseri': 'kayseri', 'kayser': 'kayseri',
            'eskisehir': 'eskisehir', 'eskişehir': 'eskisehir',
            'trabzon': 'trabzon', 'trabzn': 'trabzon',
            'samsun': 'samsun', 'samsn': 'samsun',

            # Common street names
            'ataturk': 'ataturk', 'atatürk': 'ataturk', 'atatrk': 'ataturk',
            'cumhuriyet': 'cumhuriyet', 'cumhuryet': 'cumhuriyet', 'cumhriyet': 'cumhuriyet',
            'istiklal': 'istiklal', 'istıklal': 'istiklal', 'istikal': 'istiklal',
            'inonu': 'inonu', 'inönü': 'inonu', 'ınonu': 'inonu',
            'menderes': 'menderes', 'menders': 'menderes'
        }

        # Turkish stopwords (extended)
        self.stopwords = {
            'il', 'ilce', 'ilcesi', 'ili', 'ilimiz', 'ilcemiz',
            'turkiye', 'turkey', 'tr', 'tc',
            'posta', 'kodu', 'pk', 'postakodu',
            've', 'veya', 'ya', 'da', 'de', 'ki', 'ile',
            'karsi', 'karsisi', 'yani', 'yaninda', 'arkasi', 'arkasinda',
            'ustu', 'ustunde', 'alti', 'altinda', 'ici', 'icinde',
            'dis', 'disinda', 'on', 'onunde', 'arka', 'arkada',
            'ust', 'alt', 'yan', 'kose', 'kosesi', 'kosesinde',
            'bitisik', 'bitisigi', 'civari', 'civarinda', 'yakini', 'yakininda'
        }

        # Address component patterns (enhanced)
        self.patterns = {
            'postal_code': r'\b\d{5}\b',
            'phone': r'(?:\+90|0)?[\s-]?\d{3}[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}',
            'floor': r'(?:kat|k\.?)\s*[:.]?\s*(\d+|zemin|bodrum|giris|ara)',
            'apartment': r'(?:daire|d\.?|dai\.?)\s*[:.]?\s*(\d+[a-zA-Z]?)',
            'building_no': r'(?:no|numara|nu\.?)\s*[:.]?\s*(\d+[a-zA-Z]?(?:[/-]\d+[a-zA-Z]?)?)',
            'block': r'(?:blok|bl\.?|blk\.?)\s*[:.]?\s*([a-zA-Z]\d*|\d+[a-zA-Z]?)',
            'district': r'(\w+(?:\s+\w+)*?)\s*(?:mahalle|mahallesi|mah\.?|mh\.?)',
            'street': r'(\w+(?:\s+\w+)*?)\s*(?:sokak|sokagi|sok\.?|sk\.?|caddesi|cad\.?|cd\.?|bulvar|bulvari|bulv\.?|blv\.?)',
            'site': r'(\w+(?:\s+\w+)*?)\s*(?:sitesi|site|sit\.?)',
            'plaza': r'(\w+(?:\s+\w+)*?)\s*(?:plaza|plz\.?|plaz\.?)',
            'apartment_name': r'(\w+(?:\s+\w+)*?)\s*(?:apartmani|apartman|apt\.?|ap\.?)',
            'village': r'(\w+(?:\s+\w+)*?)\s*(?:koyu|koy|ky\.?)',
            'neighborhood': r'(\w+(?:\s+\w+)*?)\s*(?:mevkii|mevki|mevk\.?|mvk\.?)',
            'district_name': r'(\w+)\s*(?:ilcesi|ilce|ilc\.?)',
            'city_name': r'(\w+)\s*(?:ili|il|sehri|sehir|kent)'
        }

        # City and district hierarchy
        self.city_hierarchy = {
            'istanbul': {
                'european': ['fatih', 'beyoglu', 'besiktas', 'sisli', 'kagithane', 'eyup',
                           'gaziosmanpasa', 'esenler', 'gungoren', 'bagcilar', 'bahcelievler',
                           'bakirkoy', 'zeytinburnu', 'kucukcekmece', 'avcilar', 'esenyurt',
                           'beylikduzu', 'buyukcekmece', 'basaksehir', 'arnavutkoy', 'sultangazi',
                           'bayrampasa', 'sariyer'],
                'asian': ['kadikoy', 'uskudar', 'umraniye', 'kartal', 'maltepe', 'pendik',
                         'tuzla', 'atasehir', 'cekmekoy', 'sancaktepe', 'sultanbeyli', 'sile',
                         'beykoz', 'adalar']
            },
            'ankara': {
                'merkez': ['cankaya', 'kecioren', 'yenimahalle', 'mamak', 'etimesgut',
                          'sincan', 'altindag', 'pursaklar', 'golbasi'],
                'ilce': ['polatli', 'beypazari', 'ayas', 'bala', 'camlidere', 'cubuk',
                        'elmadag', 'gudul', 'haymana', 'kahramankazan', 'kalecik',
                        'kizilcahamam', 'nallihan', 'sereflikochisar']
            },
            'izmir': {
                'merkez': ['konak', 'bornova', 'buca', 'karsiyaka', 'cigli', 'gaziemir',
                          'narlidere', 'balcova', 'bayrakli', 'karabaglar', 'guzelbahce'],
                'ilce': ['aliaga', 'bayindir', 'bergama', 'beydag', 'cesme', 'dikili',
                        'foca', 'karaburun', 'kemalpasa', 'kinik', 'kiraz', 'menderes',
                        'menemen', 'odemis', 'seferihisar', 'selcuk', 'tire', 'torbali', 'urla']
            }
        }

        # Common building types
        self.building_types = {
            'residence': ['residence', 'residance', 'rezidans', 'rezidence', 'konutlari', 'evleri'],
            'site': ['sitesi', 'site', 'konakları', 'konaklari', 'villaları', 'villalari'],
            'plaza': ['plaza', 'plz', 'plaza', 'tower', 'towers', 'kule', 'kulesi'],
            'merkez': ['merkezi', 'merkez', 'center', 'centre', 'mall', 'avm'],
            'is_merkezi': ['is merkezi', 'is hani', 'ishani', 'ticaret merkezi', 'ofis'],
            'apartman': ['apartmani', 'apartman', 'apt', 'blok', 'binasi']
        }

        # Direction words
        self.direction_words = {
            'kuzey': ['kuzey', 'k', 'north'],
            'guney': ['guney', 'g', 'south'],
            'dogu': ['dogu', 'd', 'east'],
            'bati': ['bati', 'b', 'west'],
            'merkez': ['merkez', 'orta', 'center', 'central'],
            'yukarı': ['yukari', 'ust', 'upper'],
            'asagi': ['asagi', 'alt', 'lower'],
            'yeni': ['yeni', 'new'],
            'eski': ['eski', 'old']
        }

        # Number words in Turkish
        self.number_words = {
            'bir': '1', 'iki': '2', 'uc': '3', 'dort': '4', 'bes': '5',
            'alti': '6', 'yedi': '7', 'sekiz': '8', 'dokuz': '9', 'on': '10',
            'onbir': '11', 'oniki': '12', 'onuc': '13', 'ondort': '14', 'onbes': '15',
            'yirmi': '20', 'otuz': '30', 'kirk': '40', 'elli': '50',
            'altmis': '60', 'yetmis': '70', 'seksen': '80', 'doksan': '90',
            'yuz': '100', 'birinci': '1', 'ikinci': '2', 'ucuncu': '3',
            'dorduncu': '4', 'besinci': '5', 'altinci': '6', 'yedinci': '7',
            'sekizinci': '8', 'dokuzuncu': '9', 'onuncu': '10'
        }

        # Common prefixes and suffixes
        self.address_prefixes = ['yeni', 'eski', 'buyuk', 'kucuk', 'orta', 'asagi', 'yukari',
                                'ic', 'dis', 'uzeri', 'alti', 'kuzey', 'guney', 'dogu', 'bati']

        self.address_suffixes = ['mahallesi', 'caddesi', 'sokagi', 'sokak', 'bulvari', 'yolu',
                                'meydani', 'parki', 'bahcesi', 'konaklari', 'evleri', 'sitesi']

    def normalize_turkish_chars(self, text: str) -> str:
        """Normalize Turkish characters to ASCII equivalents"""
        if not text:
            return text

        # Apply character mapping
        for turkish, latin in self.char_mapping.items():
            text = text.replace(turkish, latin)

        # Handle special cases
        text = text.replace('İ', 'I').replace('i̇', 'i')

        return text

    def expand_abbreviations(self, text: str) -> str:
        """Expand common Turkish address abbreviations"""
        if not text:
            return text

        # Create boundary pattern for whole word matching
        for abbr, full in sorted(self.abbreviations.items(), key=lambda x: len(x[0]), reverse=True):
            # Match abbreviation with word boundaries
            pattern = r'\b' + re.escape(abbr) + r'\.?\b'
            text = re.sub(pattern, full, text, flags=re.IGNORECASE)

        return text

    def normalize_numbers(self, text: str) -> str:
        """Advanced number normalization for Turkish addresses"""
        if not text:
            return text

        # Convert number words to digits
        for word, digit in self.number_words.items():
            pattern = r'\b' + word + r'\b'
            text = re.sub(pattern, digit, text, flags=re.IGNORECASE)

        # Handle special number patterns
        # No:25D:4 → numara 25 daire 4
        text = re.sub(r'no\s*[;:]\s*(\d+)\s*d\s*[:;]\s*(\d+)', r'numara \1 daire \2', text, flags=re.IGNORECASE)

        # No/25 or No.25 → numara 25
        text = re.sub(r'no\s*[\./:]\s*(\d+)', r'numara \1', text, flags=re.IGNORECASE)

        # 25/A or 25-A → 25A
        text = re.sub(r'(\d+)\s*[/-]\s*([a-zA-Z])', r'\1\2', text)

        # D:5 or D.5 → daire 5
        text = re.sub(r'\bd\s*[:\.]\s*(\d+)', r'daire \1', text, flags=re.IGNORECASE)

        # K:2 or K.2 → kat 2
        text = re.sub(r'\bk\s*[:\.]\s*(\d+)', r'kat \1', text, flags=re.IGNORECASE)

        # Blok A → blok A
        text = re.sub(r'blok\s+([a-zA-Z]\d*)', r'blok \1', text, flags=re.IGNORECASE)

        # Handle floor names
        text = re.sub(r'\b(zemin|bodrum|giris|ara)\s+kat\b', r'\1kat', text, flags=re.IGNORECASE)

        return text

    def fix_common_typos(self, text: str) -> str:
        """Fix common typos using advanced fuzzy matching"""
        if not text:
            return text

        words = text.split()
        fixed_words = []

        for word in words:
            clean_word = word.lower().strip()

            # Skip short words and numbers
            if len(clean_word) <= 3 or clean_word.isdigit():
                fixed_words.append(word)
                continue

            # Check against typo corrections
            best_match = None
            best_score = 0

            for typo, correct in self.typo_corrections.items():
                score = fuzz.ratio(clean_word, typo)
                if score > best_score and score >= 85:
                    best_score = score
                    best_match = correct

            if best_match:
                fixed_words.append(best_match)
            else:
                fixed_words.append(word)

        return ' '.join(fixed_words)

    def remove_stopwords(self, text: str) -> str:
        """Remove Turkish stopwords while preserving important address components"""
        if not text:
            return text

        words = text.split()
        filtered_words = []

        for i, word in enumerate(words):
            # Don't remove stopwords if they're part of an address component
            if word.lower() in self.stopwords:
                # Check if it's part of a meaningful phrase
                if i > 0 and words[i-1].lower() in ['posta', 'il', 'ilce']:
                    filtered_words.append(word)
                else:
                    continue
            else:
                filtered_words.append(word)

        return ' '.join(filtered_words)

    def extract_address_components(self, text: str) -> Dict[str, str]:
        """Extract structured components from Turkish address text"""
        components = {}

        # Extract patterns
        for component, pattern in self.patterns.items():
            matches = re.findall(pattern, text, re.IGNORECASE)
            if matches:
                # Get the best match (usually the first one)
                if isinstance(matches[0], tuple):
                    components[component] = matches[0][0]
                else:
                    components[component] = matches[0]

        # Extract city and district from hierarchy
        text_lower = text.lower()
        for city, districts in self.city_hierarchy.items():
            if city in text_lower:
                components['city'] = city
                # Check districts
                all_districts = []
                for region, district_list in districts.items():
                    all_districts.extend(district_list)

                for district in all_districts:
                    if district in text_lower:
                        components['district_name'] = district
                        break
                break

        return components

    def parse_address_hierarchy(self, text: str) -> Dict[str, str]:
        """Parse address into hierarchical components"""
        hierarchy = {
            'country': 'turkiye',
            'city': None,
            'district': None,
            'neighborhood': None,
            'street': None,
            'building': None,
            'floor': None,
            'apartment': None
        }

        # Extract components
        components = self.extract_address_components(text)

        # Map components to hierarchy
        if 'city_name' in components:
            hierarchy['city'] = components['city_name']
        elif 'city' in components:
            hierarchy['city'] = components['city']

        if 'district_name' in components:
            hierarchy['district'] = components['district_name']

        if 'district' in components:
            hierarchy['neighborhood'] = components['district']

        if 'street' in components:
            hierarchy['street'] = components['street']

        if 'building_no' in components:
            hierarchy['building'] = components['building_no']

        if 'floor' in components:
            hierarchy['floor'] = components['floor']

        if 'apartment' in components:
            hierarchy['apartment'] = components['apartment']

        return hierarchy

    def standardize_address_format(self, text: str) -> str:
        """Standardize address to a consistent format"""
        # Parse hierarchy
        hierarchy = self.parse_address_hierarchy(text)

        # Build standardized address
        parts = []

        if hierarchy['neighborhood']:
            parts.append(f"{hierarchy['neighborhood']} mahallesi")

        if hierarchy['street']:
            parts.append(hierarchy['street'])

        if hierarchy['building']:
            parts.append(f"numara {hierarchy['building']}")

        if hierarchy['floor']:
            parts.append(f"kat {hierarchy['floor']}")

        if hierarchy['apartment']:
            parts.append(f"daire {hierarchy['apartment']}")

        if hierarchy['district']:
            parts.append(hierarchy['district'])

        if hierarchy['city']:
            parts.append(hierarchy['city'])

        return ' '.join(parts)

    def calculate_component_similarity(self, addr1: str, addr2: str) -> Dict[str, float]:
        """Calculate similarity scores for each address component"""
        comp1 = self.extract_address_components(addr1)
        comp2 = self.extract_address_components(addr2)

        similarity_scores = {}

        # Check each component type
        component_types = set(comp1.keys()) | set(comp2.keys())

        for comp_type in component_types:
            val1 = comp1.get(comp_type, '')
            val2 = comp2.get(comp_type, '')

            if val1 and val2:
                # Use different similarity measures based on component type
                if comp_type in ['building_no', 'floor', 'apartment', 'postal_code']:
                    # Exact match for numbers
                    similarity_scores[comp_type] = 1.0 if val1 == val2 else 0.0
                else:
                    # Fuzzy match for text
                    similarity_scores[comp_type] = fuzz.ratio(val1, val2) / 100.0
            else:
                similarity_scores[comp_type] = 0.0

        return similarity_scores

    def is_valid_turkish_address(self, text: str) -> bool:
        """Validate if text contains minimum required Turkish address components"""
        if not text or len(text) < 10:
            return False

        # Check for minimum components
        components = self.extract_address_components(text)

        # Must have at least a street/district and city/district name
        has_location = any(key in components for key in ['street', 'district', 'district_name'])
        has_area = any(key in components for key in ['city_name', 'city', 'district_name'])

        return has_location or has_area

    def preprocess_address(self, text: str, standardize: bool = False) -> str:
        """
        Main preprocessing function with all cleaning steps

        Args:
            text (str): Raw address text
            standardize (bool): Whether to apply standardization

        Returns:
            str: Cleaned and normalized address
        """
        if pd.isna(text) or text == '' or text == '-----':
            return ''

        # Convert to string and clean
        text = str(text).strip()

        # Remove multiple spaces and normalize whitespace
        text = ' '.join(text.split())

        # Convert to lowercase
        text = text.lower()

        # Normalize Turkish characters
        text = self.normalize_turkish_chars(text)

        # Expand abbreviations
        text = self.expand_abbreviations(text)

        # Normalize numbers
        text = self.normalize_numbers(text)

        # Remove extra punctuation but keep necessary ones
        text = re.sub(r'[^\w\s/\-]', ' ', text)

        # Fix common typos
        text = self.fix_common_typos(text)

        # Normalize whitespace again
        text = ' '.join(text.split())

        # Optionally remove stopwords
        # text = self.remove_stopwords(text)

        # Optionally standardize format
        if standardize and self.is_valid_turkish_address(text):
            text = self.standardize_address_format(text)

        return text.strip()

    def get_address_features(self, text: str) -> Dict[str, any]:
        """Extract comprehensive features from address text"""
        features = {
            'length': len(text),
            'word_count': len(text.split()),
            'has_number': bool(re.search(r'\d', text)),
            'components': self.extract_address_components(text),
            'hierarchy': self.parse_address_hierarchy(text),
            'is_valid': self.is_valid_turkish_address(text)
        }

        # Count component types
        features['component_count'] = len([v for v in features['components'].values() if v])

        # Check for specific patterns
        features['has_postal_code'] = 'postal_code' in features['components']
        features['has_building_no'] = 'building_no' in features['components']
        features['has_apartment'] = 'apartment' in features['components']
        features['has_floor'] = 'floor' in features['components']
        features['has_street'] = 'street' in features['components']
        features['has_district'] = 'district' in features['components']

        # Check for building types
        text_lower = text.lower()
        for btype, keywords in self.building_types.items():
            features[f'has_{btype}'] = any(keyword in text_lower for keyword in keywords)

        # Check for direction words
        for direction, keywords in self.direction_words.items():
            features[f'has_{direction}'] = any(keyword in text_lower for keyword in keywords)

        return features

    def fuzzy_match_addresses(self, addr1: str, addr2: str, threshold: float = 0.8) -> Tuple[bool, float]:
        """
        Advanced fuzzy matching between two addresses

        Returns:
            Tuple[bool, float]: (is_match, similarity_score)
        """
        # Preprocess both addresses
        proc_addr1 = self.preprocess_address(addr1)
        proc_addr2 = self.preprocess_address(addr2)

        # If either is empty after preprocessing, no match
        if not proc_addr1 or not proc_addr2:
            return False, 0.0

        # Calculate different similarity metrics
        scores = []

        # 1. Overall string similarity
        string_sim = fuzz.ratio(proc_addr1, proc_addr2) / 100.0
        scores.append(('string', string_sim, 0.3))

        # 2. Token set similarity (order-independent)
        token_sim = fuzz.token_set_ratio(proc_addr1, proc_addr2) / 100.0
        scores.append(('token', token_sim, 0.3))

        # 3. Component-based similarity
        comp_similarities = self.calculate_component_similarity(addr1, addr2)
        if comp_similarities:
            comp_sim = sum(comp_similarities.values()) / len(comp_similarities)
            scores.append(('component', comp_sim, 0.4))

        # Calculate weighted average
        total_score = sum(score * weight for _, score, weight in scores)

        # Check if it's a match
        is_match = total_score >= threshold

        return is_match, total_score

    def group_similar_addresses(self, addresses: List[str], threshold: float = 0.85) -> Dict[int, List[int]]:
        """
        Group similar addresses together

        Returns:
            Dict mapping group_id to list of address indices
        """
        n_addresses = len(addresses)
        groups = {}
        assigned = set()
        group_id = 0

        for i in range(n_addresses):
            if i in assigned:
                continue

            # Start new group
            groups[group_id] = [i]
            assigned.add(i)

            # Find similar addresses
            for j in range(i + 1, n_addresses):
                if j not in assigned:
                    is_match, score = self.fuzzy_match_addresses(addresses[i], addresses[j], threshold)
                    if is_match:
                        groups[group_id].append(j)
                        assigned.add(j)

            group_id += 1

        return groups

    def preprocess_dataframe(self, df: pd.DataFrame, address_col: str = 'address',
                           extract_features: bool = True,
                           remove_duplicates: bool = True) -> pd.DataFrame:
        """
        Advanced preprocessing for entire dataframe

        Args:
            df (pd.DataFrame): Input dataframe
            address_col (str): Name of address column
            extract_features (bool): Whether to extract address features
            remove_duplicates (bool): Whether to mark duplicates

        Returns:
            pd.DataFrame: Processed dataframe with additional columns
        """
        # Create a copy
        df_processed = df.copy()

        print(f"Processing {len(df)} addresses...")

        # Apply preprocessing
        df_processed['processed_address'] = df_processed[address_col].apply(
            lambda x: self.preprocess_address(x, standardize=False)
        )

        # Apply standardization separately
        df_processed['standardized_address'] = df_processed[address_col].apply(
            lambda x: self.preprocess_address(x, standardize=True)
        )

        # Extract features if requested
        if extract_features:
            print("Extracting address features...")
            features_list = []

            for idx, row in df_processed.iterrows():
                features = self.get_address_features(row['processed_address'])
                features_list.append(features)

                if idx % 10000 == 0 and idx > 0:
                    print(f"  Processed {idx} addresses...")

            # Convert features to columns
            features_df = pd.DataFrame(features_list)

            # Add feature columns
            for col in ['length', 'word_count', 'component_count', 'has_number', 'is_valid',
                       'has_postal_code', 'has_building_no', 'has_apartment', 'has_floor',
                       'has_street', 'has_district']:
                if col in features_df.columns:
                    df_processed[f'feat_{col}'] = features_df[col]

        # Mark duplicates if requested
        if remove_duplicates:
            # Check for exact duplicates
            df_processed['is_duplicate_exact'] = df_processed.duplicated(
                subset=['processed_address'], keep='first'
            )

            # Check for standardized duplicates
            df_processed['is_duplicate_standard'] = df_processed.duplicated(
                subset=['standardized_address'], keep='first'
            )

            # Calculate fuzzy duplicates for a sample (expensive operation)
            if len(df_processed) < 10000:
                print("Detecting fuzzy duplicates...")
                addresses = df_processed['processed_address'].tolist()
                groups = self.group_similar_addresses(addresses, threshold=0.9)

                # Mark fuzzy duplicates
                fuzzy_dup = [False] * len(df_processed)
                for group_id, indices in groups.items():
                    if len(indices) > 1:
                        # Keep first, mark rest as duplicates
                        for idx in indices[1:]:
                            fuzzy_dup[idx] = True

                df_processed['is_duplicate_fuzzy'] = fuzzy_dup

        # Add validation flag
        df_processed['is_valid_address'] = df_processed['processed_address'].apply(
            self.is_valid_turkish_address
        )

        # Statistics
        print("\nPreprocessing Statistics:")
        print(f"Total addresses: {len(df_processed)}")
        print(f"Valid addresses: {df_processed['is_valid_address'].sum()}")

        if remove_duplicates:
            print(f"Exact duplicates: {df_processed['is_duplicate_exact'].sum()}")
            print(f"Standardized duplicates: {df_processed['is_duplicate_standard'].sum()}")
            if 'is_duplicate_fuzzy' in df_processed.columns:
                print(f"Fuzzy duplicates: {df_processed['is_duplicate_fuzzy'].sum()}")

        return df_processed

    def generate_address_report(self, df: pd.DataFrame) -> Dict[str, any]:
        """Generate a data-quality report for a preprocessed address DataFrame.

        Every section is optional: a figure is reported only when the column it
        is derived from exists and holds at least one non-missing value, so an
        empty or partially processed frame yields a partial report instead of
        raising.

        Args:
            df: Frame to summarise. Columns read when present:
                ``processed_address``, ``is_valid_address``, ``feat_has_*``,
                ``feat_component_count``, ``feat_length``, ``feat_word_count``.

        Returns:
            Dict with the integer counts ``total_addresses``,
            ``processed_addresses`` and ``valid_addresses`` plus three
            sub-dicts: ``statistics`` (length / word-count avg, min, max),
            ``component_coverage`` (``"NN.N%"`` strings keyed by component
            name) and ``quality_metrics`` (component-count avg, min, max).
        """
        total = len(df)

        def summarise(column: str, suffix: str, target: dict) -> None:
            """Write avg_/min_/max_<suffix> of df[column] into target when computable."""
            if column not in df.columns:
                return
            values = df[column].dropna()
            if values.empty:
                # min()/max() of nothing is NaN and int(NaN) raises ValueError,
                # so leave the figures out instead of crashing the report.
                return
            target[f'avg_{suffix}'] = float(values.mean())
            target[f'min_{suffix}'] = int(values.min())
            target[f'max_{suffix}'] = int(values.max())

        report = {
            # Counts are cast from numpy integers to plain int.
            'total_addresses': int(total),
            'processed_addresses': (
                int((df['processed_address'] != '').sum())
                if 'processed_address' in df.columns else 0
            ),
            'valid_addresses': (
                int(df['is_valid_address'].sum())
                if 'is_valid_address' in df.columns else 0
            ),
            'statistics': {},
            'component_coverage': {},
            'quality_metrics': {}
        }

        # Component coverage: share of rows in which each feat_has_* flag is set.
        if 'feat_has_street' in df.columns:
            for col in df.columns:
                if not col.startswith('feat_has_'):
                    continue
                # An empty frame has 0% coverage by definition (avoids 0 / 0).
                coverage = df[col].sum() / total * 100 if total else 0.0
                report['component_coverage'][col.replace('feat_has_', '')] = f"{coverage:.1f}%"

        # Quality metrics
        summarise('feat_component_count', 'components', report['quality_metrics'])

        # Address length statistics
        summarise('feat_length', 'length', report['statistics'])

        # Word count statistics
        summarise('feat_word_count', 'words', report['statistics'])

        return report


# Example usage and testing
if __name__ == "__main__":
    # Smoke-test script: runs each stage of TurkishAddressProcessor on a small
    # fixed sample and prints the results for visual inspection.

    def _section(title):
        """Print a section title followed by the 50-dash rule."""
        print(title)
        print("-" * 50)

    processor = TurkishAddressProcessor()

    # Sample addresses shared by the tests below.
    test_addresses = [
        "Çankaya Mah. Atatürk Bulv. No:125/A D:3 K:2 Ankara",
        "Kadıköy İskele Cd. No:10/B İstanbul",
        "Narlıdere Mah. Mithatpaşa Cd. No:15/A D:3 K:2 İzmir",
        "Fatih mah menderes bul No;25D:4",
        "YENİ MAH. ESKİ SOK. APT. NO:5 DAİRE:3 KAT:2",
        "Uskudar Meydan Sk. Yeni Plaza K:5 Istanbul",
        "Beşiktaş Barbaros Bulvarı No:145 Kat:3 Daire:12 İSTANBUL",
        "atatürk mahallesi cumhuriyet caddesi no 25 daire 4 ankara",
        "Karşıyaka Çarşı Sok. No:5 İzmir",
        "Bakırköy İstasyon Cad. Güneş Apt. No:22/5 İstanbul",
        "Üsküdar Çarşı Mh. İskele Sk. No:10 İSTANBUL",
        "BAHÇELİEVLER 7. CAD. NO:15/B ÇANKAYA/ANKARA",
        "Yeşilköy Havalimanı Cd. No:11/1 Bakırköy İstanbul 34149"
    ]

    print("=" * 80)
    print("TURKISH ADDRESS PROCESSOR - COMPREHENSIVE TEST")
    print("=" * 80)

    # Test 1: plain preprocessing of the first five samples
    _section("\n1. BASIC PREPROCESSING TEST")
    for raw in test_addresses[:5]:
        cleaned = processor.preprocess_address(raw)
        print(f"Original:  {raw}")
        print(f"Processed: {cleaned}")
        print()

    # Test 2: same samples with standardization switched on
    _section("\n2. ADDRESS STANDARDIZATION TEST")
    for raw in test_addresses[:5]:
        canonical = processor.preprocess_address(raw, standardize=True)
        print(f"Original:     {raw}")
        print(f"Standardized: {canonical}")
        print()

    # Test 3: component extraction on preprocessed text
    _section("\n3. COMPONENT EXTRACTION TEST")
    for raw in test_addresses[:3]:
        cleaned = processor.preprocess_address(raw)
        parts = processor.extract_address_components(cleaned)
        print(f"Address: {raw}")
        print("Components:")
        for part_name, part_value in parts.items():
            print(f"  {part_name}: {part_value}")
        print()

    # Test 4: hierarchy parsing; empty levels are not printed
    _section("\n4. ADDRESS HIERARCHY TEST")
    for raw in test_addresses[:3]:
        cleaned = processor.preprocess_address(raw)
        levels = processor.parse_address_hierarchy(cleaned)
        print(f"Address: {raw}")
        print("Hierarchy:")
        for level_name, level_value in levels.items():
            if not level_value:
                continue
            print(f"  {level_name}: {level_value}")
        print()

    # Test 5: pairwise fuzzy matching plus per-component similarity
    _section("\n5. ADDRESS SIMILARITY TEST")
    test_pairs = [
        (test_addresses[0], "Çankaya Mahallesi Atatürk Bulvarı 125A Daire 3 Ankara"),
        (test_addresses[1], "kadikoy iskele caddesi no 10B istanbul"),
        (test_addresses[6], "Beşiktaş Barbaros Bulv. No:145 D:12 K:3 İstanbul"),
        ("Atatürk Mah. İnönü Sok. No:25 Ankara", "atatürk mahallesi inönü sokak 25 ankara")
    ]

    for left, right in test_pairs:
        matched, match_score = processor.fuzzy_match_addresses(left, right)
        verdict = 'YES' if matched else 'NO'
        print(f"Address 1: {left}")
        print(f"Address 2: {right}")
        print(f"Match: {verdict} (Score: {match_score:.2%})")

        # Component similarities
        left_clean = processor.preprocess_address(left)
        right_clean = processor.preprocess_address(right)
        per_component = processor.calculate_component_similarity(left_clean, right_clean)
        if per_component:
            print("Component similarities:")
            for part_name, part_score in per_component.items():
                print(f"  {part_name}: {part_score:.2%}")
        print()

    # Test 6: scalar features of single addresses
    _section("\n6. FEATURE EXTRACTION TEST")
    for raw in test_addresses[:3]:
        feats = processor.get_address_features(processor.preprocess_address(raw))
        print(f"Address: {raw}")
        print("Features:")
        print(f"  Length: {feats['length']}")
        print(f"  Words: {feats['word_count']}")
        print(f"  Components: {feats['component_count']}")
        print(f"  Valid: {feats['is_valid']}")
        print(f"  Has Number: {feats['has_number']}")
        print()

    # Test 7: whole-DataFrame processing with feature columns
    _section("\n7. BATCH PROCESSING TEST")
    test_df = pd.DataFrame({'address': test_addresses})
    processed_df = processor.preprocess_dataframe(test_df, extract_features=True)

    print(f"Processed {len(processed_df)} addresses")
    print("\nDataFrame columns:")
    print(processed_df.columns.tolist())

    print("\nSample results:")
    display_cols = ['address', 'processed_address', 'feat_component_count', 'is_valid_address']
    print(processed_df[display_cols].head(3).to_string())

    # Test 8: duplicate flags on spelling variants of two addresses
    _section("\n8. DUPLICATE DETECTION TEST")
    dup_addresses = [
        "Kadıköy İskele Cad. No:10 İstanbul",
        "kadikoy iskele caddesi no 10 istanbul",
        "KADIKOY ISKELE CAD NO:10 ISTANBUL",
        "Bakırköy İstasyon Cad. No:22 İstanbul",
        "bakirkoy istasyon caddesi no 22 istanbul"
    ]

    dup_df = pd.DataFrame({'address': dup_addresses})
    dup_processed = processor.preprocess_dataframe(dup_df, remove_duplicates=True)

    print("Duplicate analysis:")
    for _, record in dup_processed.iterrows():
        print(f"{record['address']}")
        print(f"  Exact duplicate: {record['is_duplicate_exact']}")
        print(f"  Standard duplicate: {record['is_duplicate_standard']}")
        # The fuzzy column is only produced for small frames.
        if 'is_duplicate_fuzzy' in record:
            print(f"  Fuzzy duplicate: {record['is_duplicate_fuzzy']}")

    # Test 9: validity check on good, degenerate and empty inputs
    _section("\n9. ADDRESS VALIDATION TEST")
    validation_tests = [
        "Çankaya Mah. Atatürk Bulv. No:125 Ankara",  # Valid
        "İstanbul",  # Too short
        "-----",  # Invalid
        "12345",  # Just numbers
        "Ankara Çankaya",  # Minimal but valid
        ""  # Empty
    ]

    for candidate in validation_tests:
        verdict = processor.is_valid_turkish_address(processor.preprocess_address(candidate))
        print(f"Address: '{candidate}' -> Valid: {verdict}")

# NOTE: Tests 10-12 and the closing banner below are switched off by wrapping
# them in bare triple-quoted string literals, so none of that code runs. The
# literals are still evaluated as expressions; the notebook echoes the last
# one as the cell's output value.
"""# Test 10: Generate report
print("\n10. ADDRESS QUALITY REPORT")
print("-" * 50)
report = processor.generate_address_report(processed_df)
"""
"""# JSON yerine direkt yazdır
for key, value in report.items():
    if isinstance(value, dict):
        print(f"{key}:")
        for sub_key, sub_value in value.items():
            print(f"  {sub_key}: {sub_value}")
    else:
        print(f"{key}: {value}")"""

"""    # Test 11: Number normalization edge cases
    print("\n11. NUMBER NORMALIZATION EDGE CASES")
    print("-" * 50)
    number_tests = [
        "No:25D:4",
        "No: 5",
        "No.25/A",
        "25/B",
        "D: 5",
        "Daire 5",
        "K: 2",
        "Kat: 2",
        "birinci kat",
        "ikinci sokak",
        "üçüncü cadde"
    ]

    for test in number_tests:
        normalized = processor.normalize_numbers(test.lower())
        print(f"Input:  {test}")
        print(f"Output: {normalized}")"""

"""    # Test 12: Performance test
    print("\n12. PERFORMANCE TEST")
    print("-" * 50)
    import time"""

"""    # Generate test data
    large_test = test_addresses * 100  # 1300 addresses

    start_time = time.time()
    large_df = pd.DataFrame({'address': large_test})
    processed_large = processor.preprocess_dataframe(large_df, extract_features=False, remove_duplicates=False)
    end_time = time.time()

    processing_time = end_time - start_time
    addresses_per_second = len(large_test) / processing_time
""""""
    print(f"Processed {len(large_test)} addresses in {processing_time:.2f} seconds")"""
"""    print(f"Speed: {addresses_per_second:.0f} addresses/second")"""

"""    print("\n" + "=" * 80)
    print("ALL TESTS COMPLETED SUCCESSFULLY!")
    print("=" * 80)"""



TURKISH ADDRESS PROCESSOR - COMPREHENSIVE TEST

1. BASIC PREPROCESSING TEST
--------------------------------------------------
Original:  Çankaya Mah. Atatürk Bulv. No:125/A D:3 K:2 Ankara
Processed: cankaya mahallesi ataturk bulvar numara 125a daire 3 kat 2 ankara

Original:  Kadıköy İskele Cd. No:10/B İstanbul
Processed: kadikoy iskele caddesi numara 10b istanbul

Original:  Narlıdere Mah. Mithatpaşa Cd. No:15/A D:3 K:2 İzmir
Processed: narlidere mahallesi mithatpasa caddesi numara 15a daire 3 kat 2 izmir

Original:  Fatih mah menderes bul No;25D:4
Processed: fatih mahallesi menderes bulvar numara 25d 4

Original:  YENİ MAH. ESKİ SOK. APT. NO:5 DAİRE:3 KAT:2
Processed: yeni mahallesi eski sokak apartmani numara 5 daire 3 kat 2


2. ADDRESS STANDARDIZATION TEST
--------------------------------------------------
Original:     Çankaya Mah. Atatürk Bulv. No:125/A D:3 K:2 Ankara
Standardized: cankaya mahallesi cankaya mahallesi ataturk numara 125a kat 2 daire 3 cankaya ankara

Original:  

'    print("\n" + "=" * 80)\n    print("ALL TESTS COMPLETED SUCCESSFULLY!")\n    print("=" * 80)'

In [None]:
# GPU optimizasyonlu kütüphaneleri yükle
!pip install sentence-transformers torch -q

import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import time
from typing import List, Dict, Tuple
import gc

# GPU check: report CUDA availability and, when present, the device name and memory.
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

class GPUTurkishAddressMatcher:
    """
    GPU-Optimized Turkish Address Matching System

    Represents every training label by the mean sentence embedding of its
    addresses and assigns each test address to the label whose mean embedding
    has the highest cosine similarity.
    """

    def __init__(self, preprocessor=None):
        """Load the embedding model and set up empty label storage.

        Args:
            preprocessor: Object providing ``preprocess_address(addr,
                standardize=...)``. A new ``TurkishAddressProcessor`` is
                created when this is omitted (or falsy).
        """

        # Reuse the caller's processor for preprocessing when one is given.
        self.preprocessor = preprocessor or TurkishAddressProcessor()

        # Place the model on the GPU when available, otherwise on the CPU.
        print("Loading Turkish BERT model to GPU...")
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Turkish Sentence-BERT model
        self.model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
        self.model = self.model.to(self.device)

        # Model properties
        self.embedding_dim = 768  # BERT embedding dimension
        self.batch_size = 256 if torch.cuda.is_available() else 32

        # Storage filled by train(); label_matrix stays None until then so
        # predict_batch() can detect an untrained matcher.
        self.label_embeddings = {}
        self.label_ids = []
        self.label_info = {}
        self.label_matrix = None

    def preprocess_batch(self, addresses: List[str]) -> List[str]:
        """Preprocess a list of raw addresses.

        Missing values, empty strings and the placeholder ``'-----'`` become
        ``''``; every other address goes through the preprocessor with
        standardization off (the faster path).

        Returns:
            List of the same length as ``addresses``.
        """
        processed = []
        for addr in addresses:
            if pd.isna(addr) or addr == '' or addr == '-----':
                processed.append('')
            else:
                processed.append(self.preprocessor.preprocess_address(addr, standardize=False))
        return processed

    def encode_addresses(self, addresses: List[str], desc: str = "Encoding") -> np.ndarray:
        """Encode addresses into L2-normalized sentence embeddings.

        Args:
            addresses: Raw address strings (missing values allowed).
            desc: Currently unused; kept so existing callers keep working.

        Returns:
            Array of shape ``(len(addresses), self.embedding_dim)``. Rows for
            addresses that are empty after preprocessing are all zeros.
        """
        processed = self.preprocess_batch(addresses)

        # Only non-empty addresses are sent to the model.
        valid_indices = [i for i, addr in enumerate(processed) if addr]
        full_embeddings = np.zeros((len(addresses), self.embedding_dim))
        if not valid_indices:
            return full_embeddings

        embeddings = self.model.encode(
            [processed[i] for i in valid_indices],
            batch_size=self.batch_size,
            device=self.device,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True  # Normalized for cosine similarity
        )

        # Scatter the encoded rows back to their original positions.
        full_embeddings[valid_indices] = embeddings
        return full_embeddings

    def train(self, train_df: pd.DataFrame):
        """Build one mean embedding per label from the training data.

        Any state from a previous call is discarded first. Rows whose label is
        missing are encoded but do not contribute to any label.

        Args:
            train_df: Frame with ``address`` and ``label`` columns.

        Raises:
            ValueError: If ``train_df`` holds no row with a non-missing label.
        """
        print("\n" + "="*60)
        print("GPU TRAINING STARTED")
        print("="*60)

        start_time = time.time()

        # Start from a clean slate so retraining never keeps stale labels.
        self.label_embeddings = {}
        self.label_info = {}
        self.label_ids = []
        self.label_matrix = None

        # codes[i] is the position of row i's label in `uniques` (order of
        # first appearance); rows with a missing label get code -1.
        codes, uniques = pd.factorize(train_df['label'])
        unique_labels = uniques.to_numpy()
        if len(unique_labels) == 0:
            raise ValueError("train_df contains no labelled rows to train on")
        print(f"\nUnique locations: {len(unique_labels)}")

        # Process in chunks to manage GPU memory
        chunk_size = 50000
        all_embeddings = []

        print("\nEncoding all training addresses...")
        for i in range(0, len(train_df), chunk_size):
            chunk = train_df.iloc[i:i+chunk_size]
            chunk_embeddings = self.encode_addresses(
                chunk['address'].tolist(),
                desc=f"Chunk {i//chunk_size + 1}"
            )
            all_embeddings.append(chunk_embeddings)

            # Clear GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        all_embeddings = np.vstack(all_embeddings)

        # Group row positions by label in a single pass instead of scanning
        # the whole label column once per label. The stable sort keeps row
        # positions ascending within each label.
        order = np.argsort(codes, kind='stable')
        n_missing = int(np.count_nonzero(codes < 0))
        counts = np.bincount(codes[codes >= 0], minlength=len(unique_labels))
        rows_per_label = np.split(order[n_missing:], np.cumsum(counts)[:-1])

        print("\nCreating label embeddings...")
        for label_idx, (label, label_indices) in enumerate(zip(unique_labels, rows_per_label)):
            # Average embeddings for this label
            self.label_embeddings[label] = all_embeddings[label_indices].mean(axis=0)
            self.label_info[label] = len(label_indices)

            if (label_idx + 1) % 1000 == 0:
                print(f"Processed {label_idx + 1}/{len(unique_labels)} labels...")

        # Convert to matrix for fast similarity computation
        self.label_ids = list(self.label_embeddings.keys())
        self.label_matrix = np.vstack([self.label_embeddings[lid] for lid in self.label_ids])

        train_time = time.time() - start_time
        print(f"\n✅ Training completed in {train_time:.1f} seconds")
        print(f"   Labels: {len(self.label_ids)}")
        print(f"   Embeddings shape: {self.label_matrix.shape}")

    def predict_batch(self, test_df: pd.DataFrame) -> pd.DataFrame:
        """Predict the best-matching label for every test address.

        Args:
            test_df: Frame with an ``address`` column. When it has no ``id``
                column, a 0..n-1 ``id`` column is added to it IN PLACE so the
                caller can read the generated ids back from its own frame.

        Returns:
            DataFrame with columns ``id``, ``label`` and ``confidence`` (the
            cosine similarity of the chosen label); empty but with those
            columns when ``test_df`` has no rows.

        Raises:
            RuntimeError: If called before ``train()``.
        """
        label_matrix = getattr(self, 'label_matrix', None)
        if label_matrix is None:
            raise RuntimeError("predict_batch() called before train(): no label embeddings available")

        print("\n" + "="*60)
        print("GPU PREDICTION STARTED")
        print("="*60)

        start_time = time.time()

        # Ensure ID column
        if 'id' not in test_df.columns:
            test_df['id'] = range(len(test_df))

        predictions = []
        chunk_size = 10000
        total = len(test_df)
        n_chunks = (total - 1) // chunk_size + 1

        print(f"\nProcessing {total} test addresses...")

        for i in range(0, total, chunk_size):
            chunk = test_df.iloc[i:i+chunk_size]

            print(f"\nEncoding chunk {i//chunk_size + 1}/{n_chunks}...")
            chunk_embeddings = self.encode_addresses(chunk['address'].tolist())

            # Similarity of every chunk row against every label centroid
            similarities = cosine_similarity(chunk_embeddings, label_matrix)
            best_indices = similarities.argmax(axis=1)
            best_scores = similarities[np.arange(len(best_indices)), best_indices]

            # Read the ids as one array rather than building a row Series per
            # prediction.
            for row_id, best_idx, score in zip(chunk['id'].to_numpy(), best_indices, best_scores):
                predictions.append({
                    'id': row_id,
                    'label': self.label_ids[best_idx],
                    'confidence': score
                })

            # Progress
            processed = min(i + chunk_size, total)
            elapsed = max(time.time() - start_time, 1e-9)  # never divide by zero
            speed = processed / elapsed
            eta = (total - processed) / speed

            print(f"Processed: {processed}/{total} | "
                  f"Speed: {speed:.1f} addr/sec | "
                  f"ETA: {eta:.0f}s")

            # Clear GPU cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        pred_time = max(time.time() - start_time, 1e-9)
        print(f"\n✅ Prediction completed in {pred_time:.1f} seconds")
        print(f"   Speed: {total/pred_time:.1f} addresses/second")

        # Explicit columns keep the schema stable even with zero predictions.
        return pd.DataFrame(predictions, columns=['id', 'label', 'confidence'])


def gpu_main():
    """
    Main execution with GPU optimization

    Reads ``train.csv`` and ``test.csv`` from the working directory, trains a
    ``GPUTurkishAddressMatcher``, predicts a label for every test address,
    writes the ``id``/``label`` pairs to ``submission_gpu.csv`` and prints
    summary statistics plus the ten most confident predictions.
    """
    print("🚀 GPU-ACCELERATED TURKISH ADDRESS MATCHING")
    print("="*60)

    # Load data
    print("\nLoading data...")
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    # Make sure ids exist here, because the address lookup further down needs
    # them in this frame regardless of what the matcher does to its argument.
    if 'id' not in test_df.columns:
        test_df['id'] = range(len(test_df))

    print(f"Train: {len(train_df)} addresses")
    print(f"Test: {len(test_df)} addresses")

    # Initialize GPU model
    print("\nInitializing GPU model...")
    processor = TurkishAddressProcessor()
    gpu_model = GPUTurkishAddressMatcher(processor)

    # Train
    gpu_model.train(train_df)

    # Clear memory before prediction
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Predict
    predictions = gpu_model.predict_batch(test_df)

    # Save results
    submission_file = 'submission_gpu.csv'
    predictions[['id', 'label']].to_csv(submission_file, index=False)

    print(f"\n📁 Submission saved to: {submission_file}")

    # Statistics
    print("\nPrediction Statistics:")
    print(f"Total predictions: {len(predictions)}")
    print(f"Unique predictions: {predictions['label'].nunique()}")
    print(f"Avg confidence: {predictions['confidence'].mean():.3f}")

    # Top predictions
    print("\nTop 10 most confident predictions:")
    top_preds = predictions.nlargest(10, 'confidence')
    # First address per id, matching a "first matching row" lookup.
    address_by_id = test_df.drop_duplicates(subset='id').set_index('id')['address']
    # Iterate column-wise: iterrows() would upcast numeric id/label values to
    # float because each row is returned as a single-dtype Series.
    for pred_id, label, confidence in zip(top_preds['id'], top_preds['label'], top_preds['confidence']):
        test_addr = address_by_id.loc[pred_id]
        # A missing address is shown as empty text instead of failing on slicing.
        test_addr = '' if pd.isna(test_addr) else str(test_addr)
        print(f"ID {pred_id}: {test_addr[:50]}... → Label {label} (conf: {confidence:.3f})")

    print("\n✅ GPU Pipeline completed successfully!")
    print("Ready to submit to Kaggle! 🏆")


def ultra_fast_gpu_version():
    """
    Ultra fast version using only embeddings without preprocessing

    Reads ``train.csv`` / ``test.csv``, embeds the raw address text, averages
    the training embeddings per label, assigns each test address to the label
    with the highest dot-product similarity and writes the result to
    ``submission_ultra_fast.csv``. Runs on the GPU when CUDA is available and
    on the CPU otherwise.
    """
    print("⚡ ULTRA FAST GPU VERSION")

    # Load data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    if 'id' not in test_df.columns:
        test_df['id'] = range(len(test_df))

    # Fall back to the CPU instead of failing when no GPU is present.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Initialize model
    model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
    model = model.to(device)

    # Missing addresses are read from CSV as NaN floats, which are not text;
    # encode them as empty strings.
    train_texts = train_df['address'].fillna('').astype(str).tolist()
    test_texts = test_df['address'].fillna('').astype(str).tolist()

    print("Encoding training data...")
    # Encode all training addresses at once
    train_embeddings = model.encode(
        train_texts,
        batch_size=512,
        show_progress_bar=True,
        device=device,
        normalize_embeddings=True
    )

    # Create label embeddings
    print("Creating label embeddings...")
    label_embeddings = {}
    # Skip missing labels: they match no row, so their "mean" would be NaN
    # and argmax below would then pick that NaN entry.
    unique_labels = train_df['label'].dropna().unique()
    label_values = train_df['label'].to_numpy()

    for label in unique_labels:
        mask = label_values == label
        label_embeddings[label] = train_embeddings[mask].mean(axis=0)

    label_ids = list(label_embeddings.keys())
    label_matrix = np.vstack(list(label_embeddings.values()))

    print("Encoding test data...")
    # Encode test addresses
    test_embeddings = model.encode(
        test_texts,
        batch_size=512,
        show_progress_bar=True,
        device=device,
        normalize_embeddings=True
    )

    print("Computing similarities...")
    # Compute all similarities at once
    similarities = test_embeddings @ label_matrix.T
    best_indices = similarities.argmax(axis=1)

    # Create predictions
    predictions = pd.DataFrame({
        'id': test_df['id'],
        'label': [label_ids[idx] for idx in best_indices]
    })

    predictions.to_csv('submission_ultra_fast.csv', index=False)
    print("✅ Done! Saved to submission_ultra_fast.csv")


# Memory-efficient version for large datasets
class MemoryEfficientGPUMatcher:
    """
    Memory-efficient GPU implementation for very large datasets

    Encodes one label's addresses at a time and keeps only the mean embedding
    per label, so the full training embedding matrix is never held in memory.
    Runs on the GPU when CUDA is available and on the CPU otherwise.
    """

    def __init__(self):
        """Load the sentence-embedding model onto the selected device."""
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = SentenceTransformer('emrecan/bert-base-turkish-cased-mean-nli-stsb-tr')
        self.model = self.model.to(self.device)
        self.model.eval()  # Set to evaluation mode

    def encode_with_memory_management(self, texts: List[str], batch_size: int = 128) -> np.ndarray:
        """
        Encode texts with automatic memory management

        Args:
            texts: Strings to encode; missing values (None / NaN) are encoded
                as empty strings.
            batch_size: Number of texts per ``encode`` call, also forwarded as
                the model's own batch size.

        Returns:
            Array with one normalized embedding per input text. For empty
            input an array of shape ``(0, 0)`` is returned, because the
            embedding width is unknown without encoding anything.
        """
        if len(texts) == 0:
            return np.empty((0, 0), dtype=np.float32)

        cleaned = ['' if pd.isna(text) else str(text) for text in texts]
        embeddings = []

        with torch.no_grad():  # Disable gradient computation
            for i in range(0, len(cleaned), batch_size):
                batch = cleaned[i:i + batch_size]

                # Forward batch_size so the model really processes the batch
                # in one go instead of re-splitting it at its own default.
                batch_embeddings = self.model.encode(
                    batch,
                    batch_size=batch_size,
                    device=self.device,
                    show_progress_bar=False,
                    convert_to_numpy=True,
                    normalize_embeddings=True
                )

                embeddings.append(batch_embeddings)

                # Force memory cleanup every 10 batches
                if (i // batch_size) % 10 == 0:
                    if self.device == 'cuda':
                        torch.cuda.empty_cache()
                    gc.collect()

        return np.vstack(embeddings)

    def predict_large_dataset(self, train_df: pd.DataFrame, test_df: pd.DataFrame):
        """
        Handle very large datasets efficiently

        Args:
            train_df: Frame with ``address`` and ``label`` columns; rows with
                a missing label are ignored.
            test_df: Frame with an ``address`` column and, optionally, an
                ``id`` column. Without ``id`` the row positions 0..n-1 are
                used; ``test_df`` itself is not modified.

        Returns:
            DataFrame with columns ``id`` and ``label``, one row per test row.
        """
        print("Processing large dataset with memory management...")

        label_embeddings = {}
        total_labels = train_df['label'].nunique()

        # One grouping pass yields every label's addresses (in order of first
        # appearance) without filtering the whole frame once per label.
        grouped = train_df.groupby('label', sort=False)['address']
        for idx, (label, label_addresses) in enumerate(grouped):
            if idx % 100 == 0:
                print(f"Processing label {idx}/{total_labels}")

            # Encode this label's addresses and keep only their mean
            embeddings = self.encode_with_memory_management(label_addresses.tolist(), batch_size=64)
            label_embeddings[label] = embeddings.mean(axis=0)

            # Clean up
            del embeddings
            gc.collect()

        # Create label matrix
        label_ids = list(label_embeddings.keys())
        label_matrix = np.vstack(list(label_embeddings.values()))

        if 'id' in test_df.columns:
            all_ids = test_df['id'].to_numpy()
        else:
            all_ids = np.arange(len(test_df))

        # Process test data in chunks
        predictions = []
        chunk_size = 5000

        for i in range(0, len(test_df), chunk_size):
            print(f"Processing test chunk {i}/{len(test_df)}")

            chunk = test_df.iloc[i:i + chunk_size]
            chunk_embeddings = self.encode_with_memory_management(
                chunk['address'].tolist(),
                batch_size=128
            )

            # Compute similarities
            similarities = chunk_embeddings @ label_matrix.T
            best_indices = similarities.argmax(axis=1)

            # Add predictions; ids come from the pre-extracted array rather
            # than a per-row frame lookup.
            for row_id, best_idx in zip(all_ids[i:i + chunk_size], best_indices):
                predictions.append({
                    'id': row_id,
                    'label': label_ids[best_idx]
                })

            # Clean up
            del chunk_embeddings, similarities
            gc.collect()
            if self.device == 'cuda':
                torch.cuda.empty_cache()

        # Explicit columns keep the schema stable even with zero predictions.
        return pd.DataFrame(predictions, columns=['id', 'label'])


# Ana çalıştırma fonksiyonu
if __name__ == "__main__":
    # Report which accelerator is in use.
    if torch.cuda.is_available():
        print(f"✅ Using GPU: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("⚠️  GPU not available! Using CPU fallback...")
        print("For faster processing, enable GPU runtime in Colab:")
        print("Runtime → Change runtime type → Hardware accelerator → GPU")

    # List the available pipeline variants.
    for menu_line in (
        "\nSelect version:",
        "1. Standard GPU version (recommended)",
        "2. Ultra fast version (less preprocessing)",
        "3. Memory efficient version (for very large datasets)",
    ):
        print(menu_line)

    choice = "1"  # Default choice

    if choice == "3":
        # Memory efficient version: load the data here and make sure ids exist
        # before handing the frames to the matcher.
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        if 'id' not in test_df.columns:
            test_df['id'] = range(len(test_df))

        matcher = MemoryEfficientGPUMatcher()
        predictions = matcher.predict_large_dataset(train_df, test_df)
        predictions.to_csv('submission_memory_efficient.csv', index=False)
        print("✅ Saved to submission_memory_efficient.csv")
    elif choice == "2":
        ultra_fast_gpu_version()
    elif choice == "1":
        gpu_main()


CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 42.47 GB
✅ Using GPU: NVIDIA A100-SXM4-40GB
   Memory: 42.47 GB

Select version:
1. Standard GPU version (recommended)
2. Ultra fast version (less preprocessing)
3. Memory efficient version (for very large datasets)
🚀 GPU-ACCELERATED TURKISH ADDRESS MATCHING

Loading data...
Train: 848237 addresses
Test: 217241 addresses

Initializing GPU model...
Loading Turkish BERT model to GPU...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/431 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


GPU TRAINING STARTED

Unique locations: 10390

Encoding all training addresses...


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/189 [00:00<?, ?it/s]


Creating label embeddings...
Processed 1000/10390 labels...
Processed 2000/10390 labels...
Processed 3000/10390 labels...
Processed 4000/10390 labels...
Processed 5000/10390 labels...
Processed 6000/10390 labels...
Processed 7000/10390 labels...
Processed 8000/10390 labels...
Processed 9000/10390 labels...
Processed 10000/10390 labels...

✅ Training completed in 11213.6 seconds
   Labels: 10390
   Embeddings shape: (10390, 768)

GPU PREDICTION STARTED

Processing 217241 test addresses...

Encoding chunk 1/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 10000/217241 | Speed: 76.1 addr/sec | ETA: 2725s

Encoding chunk 2/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 20000/217241 | Speed: 75.4 addr/sec | ETA: 2616s

Encoding chunk 3/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 30000/217241 | Speed: 75.2 addr/sec | ETA: 2489s

Encoding chunk 4/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 40000/217241 | Speed: 75.1 addr/sec | ETA: 2360s

Encoding chunk 5/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 50000/217241 | Speed: 75.1 addr/sec | ETA: 2228s

Encoding chunk 6/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 60000/217241 | Speed: 75.0 addr/sec | ETA: 2096s

Encoding chunk 7/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 70000/217241 | Speed: 75.0 addr/sec | ETA: 1963s

Encoding chunk 8/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 80000/217241 | Speed: 75.1 addr/sec | ETA: 1828s

Encoding chunk 9/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 90000/217241 | Speed: 75.1 addr/sec | ETA: 1694s

Encoding chunk 10/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 100000/217241 | Speed: 75.1 addr/sec | ETA: 1561s

Encoding chunk 11/22...


Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Processed: 110000/217241 | Speed: 75.1 addr/sec | ETA: 1429s

Encoding chunk 12/22...
