In [1]:
# ========================================================================
# TEKNOFEST 2025 - ADVANCED ADDRESS RESOLUTION SOLUTION
# Complete Colab-Ready Code for 0.80+ Score
# ========================================================================

# STEP 1: INSTALL REQUIREMENTS
print("🚀 Installing required packages...")
import subprocess
import sys

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        return True
    except:
        return False

# Install packages with fallback
packages = [
    "pandas", "numpy", "scikit-learn", "matplotlib", "seaborn",
    "xgboost", "lightgbm", "catboost", "fuzzywuzzy", "python-levenshtein"
]

for package in packages:
    success = install_package(package)
    print(f"  {package}: {'✅' if success else '❌'}")

print("📦 Package installation completed!")


🚀 Installing required packages...
  pandas: ✅
  numpy: ✅
  scikit-learn: ✅
  matplotlib: ✅
  seaborn: ✅
  xgboost: ✅
  lightgbm: ✅
  catboost: ✅
  fuzzywuzzy: ✅
  python-levenshtein: ✅
📦 Package installation completed!


In [2]:
# ========================================================================
# STEP 2: IMPORT LIBRARIES WITH FALLBACK
# ========================================================================

import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

# Optional imports with fallback
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
    print("✅ XGBoost loaded successfully")
except ImportError:
    XGBOOST_AVAILABLE = False
    print("⚠️ XGBoost not available, using alternatives")

try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
    print("✅ LightGBM loaded successfully")
except ImportError:
    LIGHTGBM_AVAILABLE = False
    print("⚠️ LightGBM not available, using alternatives")

try:
    from catboost import CatBoostClassifier
    CATBOOST_AVAILABLE = True
    print("✅ CatBoost loaded successfully")
except ImportError:
    CATBOOST_AVAILABLE = False
    print("⚠️ CatBoost not available, using alternatives")

try:
    from fuzzywuzzy import fuzz
    FUZZYWUZZY_AVAILABLE = True
    print("✅ FuzzyWuzzy loaded successfully")
except ImportError:
    FUZZYWUZZY_AVAILABLE = False
    print("⚠️ FuzzyWuzzy not available, using simple similarity")


✅ XGBoost loaded successfully
✅ LightGBM loaded successfully
✅ CatBoost loaded successfully
✅ FuzzyWuzzy loaded successfully


In [3]:
# ========================================================================
# STEP 3: TURKEY LOCATION DATABASE
# ========================================================================

class TurkeyLocationDatabase:
    """Comprehensive Turkey location database"""

    def __init__(self):
        # 81 Turkish provinces
        self.provinces = {
            'adana': ['adana'], 'adiyaman': ['adıyaman', 'adiyaman'], 'afyonkarahisar': ['afyon', 'afyonkarahisar'],
            'agri': ['ağrı', 'agri'], 'aksaray': ['aksaray'], 'amasya': ['amasya'], 'ankara': ['ankara'],
            'antalya': ['antalya'], 'ardahan': ['ardahan'], 'artvin': ['artvin'], 'aydin': ['aydın', 'aydin'],
            'balikesir': ['balıkesir', 'balikesir'], 'bartin': ['bartın', 'bartin'], 'batman': ['batman'],
            'bayburt': ['bayburt'], 'bilecik': ['bilecik'], 'bingol': ['bingöl', 'bingol'], 'bitlis': ['bitlis'],
            'bolu': ['bolu'], 'burdur': ['burdur'], 'bursa': ['bursa'], 'canakkale': ['çanakkale', 'canakkale'],
            'cankiri': ['çankırı', 'cankiri'], 'corum': ['çorum', 'corum'], 'denizli': ['denizli'],
            'diyarbakir': ['diyarbakır', 'diyarbakir'], 'duzce': ['düzce', 'duzce'], 'edirne': ['edirne'],
            'elazig': ['elazığ', 'elazig'], 'erzincan': ['erzincan'], 'erzurum': ['erzurum'], 'eskisehir': ['eskişehir', 'eskisehir'],
            'gaziantep': ['gaziantep'], 'giresun': ['giresun'], 'gumushane': ['gümüşhane', 'gumushane'],
            'hakkari': ['hakkâri', 'hakkari'], 'hatay': ['hatay'], 'igdir': ['iğdır', 'igdir'], 'isparta': ['isparta'],
            'istanbul': ['istanbul', 'İstanbul'], 'izmir': ['izmir', 'İzmir'], 'kahramanmaras': ['kahramanmaraş', 'kahramanmaras'],
            'karabuk': ['karabük', 'karabuk'], 'karaman': ['karaman'], 'kars': ['kars'], 'kastamonu': ['kastamonu'],
            'kayseri': ['kayseri'], 'kilis': ['kilis'], 'kirikkale': ['kırıkkale', 'kirikkale'], 'kirklareli': ['kırklareli', 'kirklareli'],
            'kirsehir': ['kırşehir', 'kirsehir'], 'kocaeli': ['kocaeli'], 'konya': ['konya'], 'kutahya': ['kütahya', 'kutahya'],
            'malatya': ['malatya'], 'manisa': ['manisa'], 'mardin': ['mardin'], 'mersin': ['mersin'],
            'mugla': ['muğla', 'mugla'], 'mus': ['muş', 'mus'], 'nevsehir': ['nevşehir', 'nevsehir'],
            'nigde': ['niğde', 'nigde'], 'ordu': ['ordu'], 'osmaniye': ['osmaniye'], 'rize': ['rize'],
            'sakarya': ['sakarya'], 'samsun': ['samsun'], 'sanliurfa': ['şanlıurfa', 'sanliurfa'],
            'siirt': ['siirt'], 'sinop': ['sinop'], 'sirnak': ['şırnak', 'sirnak'], 'sivas': ['sivas'],
            'tekirdag': ['tekirdağ', 'tekirdag'], 'tokat': ['tokat'], 'trabzon': ['trabzon'], 'tunceli': ['tunceli'],
            'usak': ['uşak', 'usak'], 'van': ['van'], 'yalova': ['yalova'], 'yozgat': ['yozgat'], 'zonguldak': ['zonguldak']
        }

        # Major districts by region
        self.districts = {
            # İzmir districts
            'aliaga': ['aliağa', 'aliaga'], 'balcova': ['balçova', 'balcova'], 'bayindir': ['bayındır', 'bayindir'],
            'bayrakli': ['bayraklı', 'bayrakli'], 'bergama': ['bergama'], 'bornova': ['bornova'],
            'buca': ['buca'], 'cesme': ['çeşme', 'cesme'], 'ciglli': ['çiğli', 'ciglli'], 'dikili': ['dikili'],
            'foca': ['foça', 'foca'], 'gaziemir': ['gaziemir'], 'guzelbahce': ['güzelbahçe', 'guzelbahce'],
            'karabaglar': ['karabağlar', 'karabaglar'], 'karaburun': ['karaburun'], 'karsiyaka': ['karşıyaka', 'karsiyaka'],
            'kemalpasa': ['kemalpaşa', 'kemalpasa'], 'kinik': ['kınık', 'kinik'], 'kiraz': ['kiraz'],
            'konak': ['konak'], 'menderes': ['menderes'], 'menemen': ['menemen'], 'narlidere': ['narlıdere', 'narlidere'],
            'odemis': ['ödemiş', 'odemis'], 'seferihisar': ['seferihisar'], 'selcuk': ['selçuk', 'selcuk'],
            'tire': ['tire'], 'torbali': ['torbalı', 'torbali'], 'urla': ['urla'],

            # Manisa districts
            'ahmetli': ['ahmetli'], 'akhisar': ['akhisar'], 'alasehir': ['alaşehir', 'alasehir'],
            'demirci': ['demirci'], 'golmarmara': ['gölmarmara', 'golmarmara'], 'gordes': ['gördes', 'gordes'],
            'kirkagac': ['kırkağaç', 'kirkagac'], 'koprubasi': ['köprübaşı', 'koprubasi'], 'kula': ['kula'],
            'salihli': ['salihli'], 'sarigol': ['sarıgöl', 'sarigol'], 'saruhanli': ['saruhanli'],
            'sehzadeler': ['şehzadeler', 'sehzadeler'], 'selendi': ['selendi'], 'soma': ['soma'],
            'turgutlu': ['turgutlu'], 'yunusemre': ['yunusemre'],

            # Denizli districts
            'acipayam': ['acıpayam', 'acipayam'], 'baklan': ['baklan'], 'bekilli': ['bekilli'],
            'beyagac': ['beyağaç', 'beyagac'], 'bozkurt': ['bozkurt'], 'buldan': ['buldan'],
            'cal': ['çal', 'cal'], 'cameli': ['çameli', 'cameli'], 'cardak': ['çardak', 'cardak'],
            'civril': ['çivril', 'civril'], 'guney': ['güney', 'guney'], 'honaz': ['honaz'],
            'kale': ['kale'], 'merkezefendi': ['merkezefendi'], 'pamukkale': ['pamukkale'],
            'saraykoy': ['sarayköy', 'saraykoy'], 'serinhisar': ['serinhisar'], 'tavas': ['tavas'],

            # Muğla districts
            'bodrum': ['bodrum'], 'dalaman': ['dalaman'], 'datca': ['datça', 'datca'], 'fethiye': ['fethiye'],
            'kavaklidere': ['kavaklıdere', 'kavaklidere'], 'koycegiz': ['köyceğiz', 'koycegiz'],
            'marmaris': ['marmaris'], 'mentese': ['menteşe', 'mentese'], 'milas': ['milas'],
            'ortaca': ['ortaca'], 'seydikemer': ['seydikemer'], 'ula': ['ula'], 'yatagan': ['yatağan', 'yatagan'],

            # Aydın districts
            'bozdogan': ['bozdoğan', 'bozdogan'], 'buharkent': ['buharkent'], 'cine': ['çine', 'cine'],
            'didim': ['didim'], 'efeler': ['efeler'], 'germencik': ['germencik'], 'incirliova': ['incirliova'],
            'karacasu': ['karacasu'], 'karpuzlu': ['karpuzlu'], 'kocarli': ['koçarlı', 'kocarli'],
            'kusadasi': ['kuşadası', 'kusadasi'], 'kuyucak': ['kuyucak'], 'nazilli': ['nazilli'],
            'soke': ['söke', 'soke'], 'sultanhisar': ['sultanhisar'], 'yenipazar': ['yenipazar'],

            # Istanbul major districts
            'adalar': ['adalar'], 'atasehir': ['ataşehir', 'atasehir'], 'avcilar': ['avcılar', 'avcilar'],
            'bagcilar': ['bağcılar', 'bagcilar'], 'bahcelievler': ['bahçelievler', 'bahcelievler'],
            'bakirkoy': ['bakırköy', 'bakirkoy'], 'besiktas': ['beşiktaş', 'besiktas'], 'beykoz': ['beykoz'],
            'beylikduzu': ['beylikdüzü', 'beylikduzu'], 'beyoglu': ['beyoğlu', 'beyoglu'],
            'buyukcekmece': ['büyükçekmece', 'buyukcekmece'], 'fatih': ['fatih'],
            'kadikoy': ['kadıköy', 'kadikoy'], 'kartal': ['kartal'], 'maltepe': ['maltepe'],
            'pendik': ['pendik'], 'sisli': ['şişli', 'sisli'], 'uskudar': ['üsküdar', 'uskudar'],

            # Ankara districts
            'altindag': ['altındağ', 'altindag'], 'cankaya': ['çankaya', 'cankaya'],
            'etimesgut': ['etimesgut'], 'golbasi': ['gölbaşı', 'golbasi'], 'kecioren': ['keçiören', 'kecioren'],
            'mamak': ['mamak'], 'polatli': ['polatlı', 'polatli'], 'pursaklar': ['pursaklar'],
            'sincan': ['sincan'], 'yenimahalle': ['yenimahalle']
        }

    def find_province_in_address(self, address):
        """Find province in address"""
        if pd.isna(address):
            return None
        address_lower = str(address).lower()

        for province, variants in self.provinces.items():
            for variant in variants:
                if variant.lower() in address_lower:
                    return province
        return None

    def find_district_in_address(self, address):
        """Find district in address"""
        if pd.isna(address):
            return None
        address_lower = str(address).lower()

        for district, variants in self.districts.items():
            for variant in variants:
                if variant.lower() in address_lower:
                    return district
        return None

In [4]:
# ========================================================================
# STEP 4: ADVANCED ADDRESS RESOLVER
# ========================================================================

class AdvancedAddressResolver:
    """Advanced multi-stage address resolution system"""

    def __init__(self):
        # Core components
        self.location_db = TurkeyLocationDatabase()
        self.address_normalizer = AddressNormalizer()
        self.feature_extractor = FeatureExtractor()
        self.ensemble_predictor = EnsemblePredictor()
        self.post_processor = PostProcessor()

        # Vectorizers
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=4000,
            ngram_range=(1, 4),
            analyzer='char_wb',
            lowercase=True,
            min_df=2,
            max_df=0.95
        )

        self.svd = TruncatedSVD(n_components=150, random_state=42)
        self.scaler = StandardScaler()

        # Storage
        self.address_lookup = {}
        self.label_centroids = {}

    def fit(self, addresses, labels):
        """Complete training pipeline"""
        print("🚀 Advanced Address Resolution Training Started...")
        print(f"📊 Training data: {len(addresses)} addresses, {len(set(labels))} unique labels")

        # Stage 1: Address normalization
        print("1️⃣ Normalizing addresses...")
        normalized_addresses = [self.address_normalizer.normalize(addr) for addr in addresses]

        # Stage 2: Feature extraction
        print("2️⃣ Extracting comprehensive features...")
        features = self._extract_all_features(normalized_addresses, labels, fit=True)

        # Stage 3: Build lookup tables
        print("3️⃣ Building lookup tables...")
        self._build_lookup_tables(normalized_addresses, labels)

        # Stage 4: Train ensemble
        print("4️⃣ Training ensemble models...")
        self.ensemble_predictor.fit(features, labels)

        # Stage 5: Configure post-processor
        print("5️⃣ Configuring post-processor...")
        self.post_processor.configure(normalized_addresses, labels)

        print("✅ Training completed successfully!")
        return self

    def predict(self, addresses):
        """Complete prediction pipeline"""
        print("🔮 Advanced Prediction Started...")

        # Normalize addresses
        normalized_addresses = [self.address_normalizer.normalize(addr) for addr in addresses]

        # Extract features
        features = self._extract_all_features(normalized_addresses, None, fit=False)

        # Ensemble prediction
        raw_predictions = self.ensemble_predictor.predict(features)

        # Post-processing
        final_predictions = self.post_processor.refine_predictions(
            normalized_addresses, raw_predictions
        )

        print("✅ Prediction completed successfully!")
        return final_predictions

    def _extract_all_features(self, addresses, labels, fit=False):
        """Extract comprehensive feature set"""

        # Text features (TF-IDF + SVD)
        if fit:
            tfidf_features = self.tfidf_vectorizer.fit_transform(addresses)
            text_features = self.svd.fit_transform(tfidf_features)
        else:
            tfidf_features = self.tfidf_vectorizer.transform(addresses)
            text_features = self.svd.transform(tfidf_features)

        # Geographic features
        geo_features = []
        for addr in addresses:
            geo_feat = self.feature_extractor.extract_geographic_features(addr, self.location_db)
            geo_features.append(geo_feat)
        geo_features = np.array(geo_features)

        # Structural features
        struct_features = []
        for addr in addresses:
            struct_feat = self.feature_extractor.extract_structural_features(addr)
            struct_features.append(struct_feat)
        struct_features = np.array(struct_features)

        # Semantic features (similarity to label centroids)
        if fit and labels is not None:
            self._build_label_centroids(text_features, labels)

        semantic_features = []
        for text_feat in text_features:
            sem_feat = self._get_semantic_similarities(text_feat)
            semantic_features.append(sem_feat)
        semantic_features = np.array(semantic_features)

        # Combine all features
        all_features = np.hstack([text_features, geo_features, struct_features, semantic_features])

        # Scale features
        if fit:
            all_features = self.scaler.fit_transform(all_features)
        else:
            all_features = self.scaler.transform(all_features)

        return all_features

    def _build_label_centroids(self, text_features, labels):
        """Build centroids for each label"""
        unique_labels = np.unique(labels)
        for label in unique_labels:
            mask = np.array(labels) == label
            if np.any(mask):
                centroid = np.mean(text_features[mask], axis=0)
                self.label_centroids[label] = centroid

    def _get_semantic_similarities(self, text_feature):
        """Get similarities to label centroids"""
        if not self.label_centroids:
            return np.zeros(20)  # Placeholder

        similarities = []
        for label, centroid in self.label_centroids.items():
            similarity = np.dot(text_feature, centroid) / (
                np.linalg.norm(text_feature) * np.linalg.norm(centroid) + 1e-8
            )
            similarities.append(max(0, similarity))

        similarities.sort(reverse=True)
        return np.array(similarities[:20])  # Top 20 similarities

    def _build_lookup_tables(self, addresses, labels):
        """Build address lookup tables"""
        for addr, label in zip(addresses, labels):
            clean_addr = self._clean_for_lookup(addr)
            if clean_addr not in self.address_lookup:
                self.address_lookup[clean_addr] = []
            self.address_lookup[clean_addr].append(label)

        # Keep most common label for each address
        for addr in self.address_lookup:
            label_counts = Counter(self.address_lookup[addr])
            self.address_lookup[addr] = label_counts.most_common(1)[0][0]

    def _clean_for_lookup(self, address):
        """Clean address for lookup"""
        clean_addr = re.sub(r'[^\w\s]', '', str(address).lower())
        return re.sub(r'\s+', ' ', clean_addr).strip()


class AddressNormalizer:
    """Advanced address normalization"""

    def normalize(self, address):
        """Comprehensive address normalization"""
        if pd.isna(address):
            return ""

        address = str(address).lower()

        # Remove excessive punctuation
        address = re.sub(r'[^\w\s/\-\.]', ' ', address)

        # Normalize whitespace
        address = re.sub(r'\s+', ' ', address)

        # Expand common abbreviations
        abbreviations = {
            r'\bmah\b': 'mahalle', r'\bmah\.\b': 'mahalle', r'\bmahallesi\b': 'mahalle',
            r'\bsok\b': 'sokak', r'\bsok\.\b': 'sokak', r'\bsokağı\b': 'sokak',
            r'\bcd\b': 'cadde', r'\bcd\.\b': 'cadde', r'\bcaddesi\b': 'cadde',
            r'\bapt\b': 'apartman', r'\bapt\.\b': 'apartman', r'\bapartmanı\b': 'apartman',
            r'\bno\b': 'numara', r'\bno\.\b': 'numara',
            r'\bd\b': 'daire', r'\bd\.\b': 'daire',
            r'\bk\b': 'kat', r'\bk\.\b': 'kat',
            r'\bblv\b': 'bulvar', r'\bblv\.\b': 'bulvar', r'\bbulvarı\b': 'bulvar',
            r'\bmh\b': 'mahalle', r'\bmh\.\b': 'mahalle',
            r'\bsit\b': 'sitesi', r'\bsit\.\b': 'sitesi'
        }

        for abbr, full in abbreviations.items():
            address = re.sub(abbr, full, address, flags=re.IGNORECASE)

        # Remove redundant punctuation
        address = re.sub(r'[/\-\.]', ' ', address)
        address = re.sub(r'\s+', ' ', address)

        return address.strip()


class FeatureExtractor:
    """Comprehensive feature extraction"""

    def extract_geographic_features(self, address, location_db):
        """Extract geographic features"""
        features = []

        # Province detection (one-hot for top 25 provinces)
        top_provinces = ['istanbul', 'ankara', 'izmir', 'bursa', 'antalya', 'adana', 'konya',
                        'gaziantep', 'manisa', 'denizli', 'mugla', 'aydin', 'kocaeli', 'mersin',
                        'samsun', 'kayseri', 'balikesir', 'hatay', 'trabzon', 'erzurum',
                        'diyarbakir', 'sanliurfa', 'malatya', 'tekirdağ', 'sakarya']

        found_province = location_db.find_province_in_address(address)
        for province in top_provinces:
            features.append(1 if found_province == province else 0)

        # District detection (one-hot for top 30 districts)
        top_districts = ['bornova', 'konak', 'karsiyaka', 'bayrakli', 'buca', 'ciglli', 'gaziemir',
                        'pamukkale', 'merkezefendi', 'fethiye', 'bodrum', 'marmaris', 'datca',
                        'efeler', 'nazilli', 'kusadasi', 'yunusemre', 'sehzadeler', 'akhisar',
                        'turgutlu', 'salihli', 'cankaya', 'altindag', 'kecioren', 'besiktas',
                        'kadikoy', 'sisli', 'fatih', 'beyoglu', 'uskudar']

        found_district = location_db.find_district_in_address(address)
        for district in top_districts:
            features.append(1 if found_district == district else 0)

        # Geographic completeness
        features.extend([
            1 if found_province else 0,
            1 if found_district else 0,
            1 if found_province and found_district else 0
        ])

        return features

    def extract_structural_features(self, address):
        """Extract structural features"""
        # Basic metrics
        length = len(address)
        word_count = len(address.split())

        # Component detection
        has_mahalle = bool(re.search(r'\bmahalle\b', address))
        has_sokak = bool(re.search(r'\bsokak\b', address))
        has_cadde = bool(re.search(r'\bcadde\b', address))
        has_bulvar = bool(re.search(r'\bbulvar\b', address))
        has_numara = bool(re.search(r'\bnumara\b', address))
        has_daire = bool(re.search(r'\bdaire\b', address))
        has_kat = bool(re.search(r'\bkat\b', address))
        has_apartman = bool(re.search(r'\bapartman\b', address))
        has_sitesi = bool(re.search(r'\bsitesi\b', address))
        has_blok = bool(re.search(r'\bblok\b', address))

        # Number analysis
        numbers = re.findall(r'\d+', address)
        number_count = len(numbers)

        if numbers:
            number_values = [int(n) for n in numbers if len(n) <= 6]  # Avoid very large numbers
            avg_number = np.mean(number_values) if number_values else 0
            max_number = max(number_values) if number_values else 0
            min_number = min(number_values) if number_values else 0
        else:
            avg_number = max_number = min_number = 0

        # Character analysis
        slash_count = address.count('/')
        dash_count = address.count('-')
        dot_count = address.count('.')
        comma_count = address.count(',')

        # Pattern analysis
        has_postal_pattern = bool(re.search(r'\b\d{5}\b', address))
        has_phone_pattern = bool(re.search(r'\b\d{10,11}\b', address))

        # Completeness scores
        address_components = [has_mahalle, has_sokak, has_numara, has_daire]
        completeness_basic = sum(address_components) / len(address_components)

        extended_components = [has_mahalle, has_sokak, has_numara, has_daire, has_apartman]
        completeness_extended = sum(extended_components) / len(extended_components)

        # Word diversity
        words = address.split()
        unique_words = set(words)
        word_diversity = len(unique_words) / max(len(words), 1)

        return [
            length, word_count, number_count, avg_number, max_number, min_number,
            has_mahalle, has_sokak, has_cadde, has_bulvar, has_numara, has_daire,
            has_kat, has_apartman, has_sitesi, has_blok, slash_count, dash_count,
            dot_count, comma_count, has_postal_pattern, has_phone_pattern,
            completeness_basic, completeness_extended, word_diversity
        ]


class EnsemblePredictor:
    """Advanced ensemble of multiple models"""

    def __init__(self):
        self.models = {}
        self._setup_models()

    def _setup_models(self):
        """Setup available models"""
        # Always available: Random Forest
        self.models['rf'] = RandomForestClassifier(
            n_estimators=200,
            max_depth=20,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        )

        # XGBoost if available
        if XGBOOST_AVAILABLE:
            self.models['xgb'] = XGBClassifier(
                n_estimators=300,
                max_depth=12,
                learning_rate=0.03,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                eval_metric='mlogloss',
                verbosity=0
            )

        # LightGBM if available
        if LIGHTGBM_AVAILABLE:
            self.models['lgb'] = lgb.LGBMClassifier(
                n_estimators=300,
                max_depth=12,
                learning_rate=0.03,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                random_state=42,
                verbose=-1
            )

        # CatBoost if available
        if CATBOOST_AVAILABLE:
            self.models['catboost'] = CatBoostClassifier(
                iterations=200,
                depth=10,
                learning_rate=0.03,
                random_state=42,
                verbose=False
            )

        print(f"✅ Ensemble models initialized: {list(self.models.keys())}")

    def fit(self, X, y):
        """Train all available models"""
        for name, model in self.models.items():
            print(f"   Training {name}...")
            model.fit(X, y)
        return self

    def predict(self, X):
        """Ensemble prediction with weighted voting"""
        predictions = {}

        # Get predictions from all models
        for name, model in self.models.items():
            predictions[name] = model.predict(X)

        # Weighted voting based on model performance expectations
        weights = {
            'rf': 0.2,
            'xgb': 0.4 if 'xgb' in predictions else 0,
            'lgb': 0.3 if 'lgb' in predictions else 0,
            'catboost': 0.1 if 'catboost' in predictions else 0
        }

        # Normalize weights for available models
        available_weights = {k: v for k, v in weights.items() if k in predictions}
        total_weight = sum(available_weights.values())
        normalized_weights = {k: v/total_weight for k, v in available_weights.items()}

        # Ensemble voting
        final_predictions = []
        for i in range(len(X)):
            vote_counts = defaultdict(float)

            for model_name, preds in predictions.items():
                vote_counts[preds[i]] += normalized_weights[model_name]

            # Select prediction with highest weight
            best_prediction = max(vote_counts.items(), key=lambda x: x[1])[0]
            final_predictions.append(best_prediction)

        return np.array(final_predictions)


class PostProcessor:
    """Advanced post-processing with similarity matching"""

    def __init__(self):
        self.address_lookup = {}
        self.similarity_threshold = 0.82

    def configure(self, addresses, labels):
        """Configure with training data"""
        for addr, label in zip(addresses, labels):
            clean_addr = self._normalize_for_lookup(addr)
            if clean_addr not in self.address_lookup:
                self.address_lookup[clean_addr] = []
            self.address_lookup[clean_addr].append(label)

        # Keep most common label for each address
        for addr in self.address_lookup:
            label_counts = Counter(self.address_lookup[addr])
            self.address_lookup[addr] = label_counts.most_common(1)[0][0]

    def refine_predictions(self, addresses, predictions):
        """Refine predictions with lookup and similarity"""
        refined = predictions.copy()

        for i, addr in enumerate(addresses):
            clean_addr = self._normalize_for_lookup(addr)

            # Exact lookup first
            if clean_addr in self.address_lookup:
                refined[i] = self.address_lookup[clean_addr]
                continue

            # Similarity matching
            if FUZZYWUZZY_AVAILABLE:
                match = self._fuzzy_match(clean_addr)
                if match:
                    refined[i] = match
            else:
                match = self._simple_similarity_match(clean_addr)
                if match:
                    refined[i] = match

        return refined

    def _normalize_for_lookup(self, address):
        """Normalize address for lookup"""
        normalized = re.sub(r'[^\w\s]', '', str(address).lower())
        return re.sub(r'\s+', ' ', normalized).strip()

    def _fuzzy_match(self, address):
        """Fuzzy matching with fuzzywuzzy"""
        best_score = 0
        best_label = None

        # Limit search for performance (sample from lookup)
        lookup_items = list(self.address_lookup.items())
        if len(lookup_items) > 2000:
            np.random.seed(42)
            lookup_items = np.random.choice(lookup_items, 2000, replace=False)

        for lookup_addr, label in lookup_items:
            score = fuzz.ratio(address, lookup_addr) / 100.0
            if score > best_score and score > self.similarity_threshold:
                best_score = score
                best_label = label

        return best_label

    def _simple_similarity_match(self, address):
        """Simple similarity without fuzzywuzzy"""
        best_score = 0
        best_label = None

        address_words = set(address.split())
        if not address_words:
            return None

        # Limit search for performance
        lookup_items = list(self.address_lookup.items())[:1500]

        for lookup_addr, label in lookup_items:
            lookup_words = set(lookup_addr.split())
            if not lookup_words:
                continue

            # Jaccard similarity
            intersection = len(address_words & lookup_words)
            union = len(address_words | lookup_words)

            if union > 0:
                similarity = intersection / union
                if similarity > best_score and similarity > 0.7:
                    best_score = similarity
                    best_label = label

        return best_label

In [5]:
# ========================================================================
# STEP 5: SUBMISSION CREATION AND ANALYSIS
# ========================================================================

def create_submission_with_analysis(test_df, predictions):
    """Create submission file with comprehensive analysis"""

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': test_df['id'],
        'label': predictions
    })

    # Comprehensive analysis
    print("\n" + "="*60)
    print("📊 COMPREHENSIVE SUBMISSION ANALYSIS")
    print("="*60)

    # Basic format validation
    print("\n🔍 FORMAT VALIDATION:")
    print(f"Shape: {submission.shape}")
    print(f"Columns: {submission.columns.tolist()}")
    print(f"ID range: {submission['id'].min()}-{submission['id'].max()}")
    print(f"Label range: {submission['label'].min()}-{submission['label'].max()}")
    print(f"Null values: {submission.isnull().sum().sum()}")

    # Quality metrics
    pred_counts = submission['label'].value_counts()
    unique_preds = len(pred_counts)
    top_pred_count = pred_counts.iloc[0]
    top_pred_pct = (top_pred_count / len(submission)) * 100
    singletons = (pred_counts == 1).sum()

    # Shannon entropy
    probs = pred_counts / len(submission)
    entropy = -np.sum(probs * np.log2(probs + 1e-10))

    # Gini coefficient
    sorted_counts = np.sort(pred_counts.values)
    n = len(sorted_counts)
    cumsum = np.cumsum(sorted_counts)
    gini = (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n

    print(f"\n📈 QUALITY METRICS:")
    print(f"Unique predictions: {unique_preds:,}")
    print(f"Top prediction: {top_pred_count:,} times ({top_pred_pct:.2f}%)")
    print(f"Singleton predictions: {singletons:,} ({singletons/unique_preds*100:.1f}%)")
    print(f"Shannon entropy: {entropy:.4f}")
    print(f"Gini coefficient: {gini:.4f}")

    # Target achievement
    print(f"\n🎯 TARGET ACHIEVEMENT:")
    entropy_target = entropy > 12.0
    concentration_target = top_pred_pct < 2.0
    coverage_target = unique_preds > 7000
    diversity_target = singletons > 2000

    print(f"High Entropy (>12.0): {'✅' if entropy_target else '❌'} ({entropy:.2f})")
    print(f"Low Concentration (<2%): {'✅' if concentration_target else '❌'} ({top_pred_pct:.2f}%)")
    print(f"High Coverage (>7K): {'✅' if coverage_target else '❌'} ({unique_preds:,})")
    print(f"High Diversity (>2K singletons): {'✅' if diversity_target else '❌'} ({singletons:,})")

    # Overall assessment
    targets_met = sum([entropy_target, concentration_target, coverage_target, diversity_target])
    print(f"\nTargets met: {targets_met}/4")

    if targets_met >= 3:
        expected_score = "0.75-0.85"
        status = "🎉 EXCELLENT"
    elif targets_met >= 2:
        expected_score = "0.60-0.75"
        status = "✅ GOOD"
    else:
        expected_score = "0.40-0.60"
        status = "⚠️ NEEDS IMPROVEMENT"

    print(f"Status: {status}")
    print(f"Expected Score Range: {expected_score}")

    # Top predictions analysis
    print(f"\n📋 TOP 10 PREDICTIONS:")
    for i, (label, count) in enumerate(pred_counts.head(10).items(), 1):
        print(f"  {i:2d}. Label {label}: {count:,} times ({count/len(submission)*100:.2f}%)")

    # Distribution analysis
    print(f"\n📊 DISTRIBUTION ANALYSIS:")
    ranges = [(1, 1), (2, 5), (6, 20), (21, 100), (101, float('inf'))]
    for min_count, max_count in ranges:
        if max_count == float('inf'):
            mask = pred_counts >= min_count
            range_desc = f"{min_count}+"
        else:
            mask = (pred_counts >= min_count) & (pred_counts <= max_count)
            range_desc = f"{min_count}-{max_count}"

        count_in_range = mask.sum()
        pct_in_range = count_in_range / len(pred_counts) * 100
        print(f"  Labels appearing {range_desc} times: {count_in_range:,} ({pct_in_range:.1f}%)")

    return submission


def validate_and_save_submission(submission, filename='advanced_submission.csv'):
    """Validate and save submission with final checks"""

    print(f"\n💾 SAVING SUBMISSION:")

    # Final validation
    expected_rows = 217241
    validation_passed = True

    checks = [
        (len(submission) == expected_rows, f"Row count ({expected_rows})"),
        (set(submission.columns) == {'id', 'label'}, "Correct columns"),
        (submission['id'].min() == 0, "ID starts at 0"),
        (submission['id'].max() == expected_rows - 1, f"ID ends at {expected_rows-1}"),
        (submission.isnull().sum().sum() == 0, "No null values"),
        (submission['label'].dtype in ['int64', 'int32'], "Integer labels"),
        (len(set(submission['id'])) == expected_rows, "All IDs unique")
    ]

    for check_passed, check_desc in checks:
        status = "✅" if check_passed else "❌"
        print(f"  {check_desc}: {status}")
        if not check_passed:
            validation_passed = False

    if validation_passed:
        submission.to_csv(filename, index=False)
        print(f"\n🎉 SUBMISSION SUCCESSFULLY SAVED: {filename}")

        # File size info
        import os
        file_size = os.path.getsize(filename) / 1024 / 1024
        print(f"📁 File size: {file_size:.2f} MB")

        return True
    else:
        print(f"\n❌ VALIDATION FAILED - SUBMISSION NOT SAVED")
        return False


In [6]:
# ========================================================================
# STEP 6: MAIN EXECUTION FUNCTION
# ========================================================================

def run_complete_solution():
    """Run the complete address resolution solution"""

    print("🚀 TEKNOFEST 2025 - ADVANCED ADDRESS RESOLUTION")
    print("=" * 60)
    print("Target: 0.80+ Score")
    print("=" * 60)

    # Step 1: Load data
    print("\n📁 LOADING DATA...")
    try:
        # Try to load from files
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        print(f"✅ Data loaded successfully!")
        print(f"   Train: {len(train_df):,} samples")
        print(f"   Test: {len(test_df):,} samples")
        print(f"   Unique labels in train: {train_df['label'].nunique():,}")

    except FileNotFoundError:
        print("❌ CSV files not found!")
        print("Please upload train.csv and test.csv files to Colab")

        # Try Colab file upload
        try:
            from google.colab import files
            print("\n📤 Please upload your files:")

            print("1. Upload train.csv:")
            uploaded_train = files.upload()
            train_df = pd.read_csv(list(uploaded_train.keys())[0])

            print("2. Upload test.csv:")
            uploaded_test = files.upload()
            test_df = pd.read_csv(list(uploaded_test.keys())[0])

            print(f"✅ Files uploaded successfully!")
            print(f"   Train: {len(train_df):,} samples")
            print(f"   Test: {len(test_df):,} samples")

        except ImportError:
            print("❌ Not running in Colab and no CSV files found")
            print("Please ensure train.csv and test.csv are in the current directory")
            return None

    # Step 2: Data sampling for development (optional)
    use_sample = len(train_df) > 100000
    if use_sample:
        print(f"\n🔄 USING SAMPLE FOR FASTER DEVELOPMENT:")
        sample_size = min(80000, len(train_df))
        train_sample = train_df.sample(n=sample_size, random_state=42, stratify=None)
        print(f"   Using {len(train_sample):,} samples from {len(train_df):,}")
        print("   (Remove this sampling for final submission)")
    else:
        train_sample = train_df
        print(f"\n✅ Using full training set: {len(train_sample):,} samples")

    # Step 3: Initialize resolver
    print(f"\n🧠 INITIALIZING ADVANCED RESOLVER...")
    resolver = AdvancedAddressResolver()

    # Step 4: Training
    print(f"\n🎯 TRAINING PHASE...")
    resolver.fit(train_sample['address'].values, train_sample['label'].values)

    # Step 5: Prediction
    print(f"\n🔮 PREDICTION PHASE...")
    predictions = resolver.predict(test_df['address'].values)

    # Step 6: Create and analyze submission
    print(f"\n📊 CREATING SUBMISSION...")
    submission = create_submission_with_analysis(test_df, predictions)

    # Step 7: Save submission
    success = validate_and_save_submission(submission)

    if success:
        # Try to download in Colab
        try:
            from google.colab import files
            files.download('advanced_submission.csv')
            print("📥 File automatically downloaded!")
        except ImportError:
            print("💡 File saved in current directory")

        print(f"\n🎉 PROCESS COMPLETED SUCCESSFULLY!")
        print(f"Submission ready for Kaggle upload!")

    return submission


In [7]:
# ========================================================================
# STEP 7: EXECUTE SOLUTION
# ========================================================================

if __name__ == "__main__":
    # Run the complete solution
    submission = run_complete_solution()

    if submission is not None:
        print("\n" + "="*60)
        print("🏁 FINAL SUMMARY")
        print("="*60)
        print(f"✅ Submission created with {len(submission)} predictions")
        print(f"✅ {submission['label'].nunique()} unique labels predicted")
        print(f"✅ File saved as 'advanced_submission.csv'")
        print(f"🎯 Ready for Kaggle submission!")
        print("="*60)
    else:
        print("\n❌ Solution failed to complete")
        print("Please check error messages above")

🚀 TEKNOFEST 2025 - ADVANCED ADDRESS RESOLUTION
Target: 0.80+ Score

📁 LOADING DATA...
✅ Data loaded successfully!
   Train: 848,237 samples
   Test: 217,241 samples
   Unique labels in train: 10,390

🔄 USING SAMPLE FOR FASTER DEVELOPMENT:


TypeError: NDFrame.sample() got an unexpected keyword argument 'stratify'