In [5]:
# Colab ortamı için kütüphane kurulumu
!pip install --upgrade pip

# Ana kütüphaneler (Colab'da genelde yüklü ama güncelleyelim)
!pip install pandas numpy matplotlib seaborn scikit-learn

# XGBoost ve LightGBM (Colab uyumlu versiyonlar)
!pip install xgboost lightgbm

# CatBoost (Colab'da genelde sorunsuz)
!pip install catboost

# Text processing
!pip install fuzzywuzzy python-levenshtein

# NetworkX (opsiyonel - eğer network analysis yaparsak)
!pip install networkx

# Alternatif eğer sorun çıkarsa:
# !pip install xgboost==1.7.6  # Stable version
# !pip install lightgbm==3.3.5  # Stable version
# !pip install catboost==1.2     # Stable version

print("✅ Tüm kütüphaneler yüklendi!")

# Test et
try:
    import pandas as pd
    import numpy as np
    import xgboost as xgb
    import lightgbm as lgb
    from catboost import CatBoostClassifier
    from fuzzywuzzy import fuzz
    print("✅ Import testleri başarılı!")
except ImportError as e:
    print(f"❌ Import hatası: {e}")
    print("Manuel yükleme gerekebilir.")

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m149.1 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein
  Downloading python_levenshtein-0.27.1-py3-n

In [8]:
import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
import networkx as nx
from scipy.spatial.distance import cosine
from fuzzywuzzy import fuzz
import pickle
import warnings
warnings.filterwarnings('ignore')

class AdvancedAddressResolver:
    def __init__(self):
        """Advanced multi-stage address resolution system"""
        # Stage 1: Hierarchical clustering
        self.geographic_clusterer = GeographicHierarchyClusterer()

        # Stage 2: Semantic similarity
        self.semantic_matcher = SemanticAddressMatcher()

        # Stage 3: Advanced ensemble
        self.ensemble_predictor = AdvancedEnsemblePredictor()

        # Stage 4: Post-processing
        self.post_processor = AddressPostProcessor()

        # Address database
        self.address_database = AddressDatabase()

    def fit(self, addresses, labels):
        """Multi-stage training process"""
        print("🚀 Advanced Address Resolution Training Started...")

        # Stage 1: Build address database and hierarchy
        print("1️⃣ Building geographic hierarchy...")
        self.address_database.build(addresses, labels)
        geographic_features = self.geographic_clusterer.fit_transform(addresses, labels)

        # Stage 2: Train semantic matcher
        print("2️⃣ Training semantic matcher...")
        semantic_features = self.semantic_matcher.fit_transform(addresses, labels)

        # Stage 3: Combine features and train ensemble
        print("3️⃣ Training advanced ensemble...")
        combined_features = np.hstack([geographic_features, semantic_features])
        self.ensemble_predictor.fit(combined_features, labels)

        # Stage 4: Prepare post-processor
        print("4️⃣ Configuring post-processor...")
        self.post_processor.configure(addresses, labels, self.address_database)

        print("✅ Training completed!")
        return self

    def predict(self, addresses):
        """Multi-stage prediction process"""
        print("🔮 Advanced prediction started...")

        # Stage 1: Extract geographic features
        geographic_features = self.geographic_clusterer.transform(addresses)

        # Stage 2: Extract semantic features
        semantic_features = self.semantic_matcher.transform(addresses)

        # Stage 3: Ensemble prediction
        combined_features = np.hstack([geographic_features, semantic_features])
        raw_predictions = self.ensemble_predictor.predict(combined_features)

        # Stage 4: Post-process with similarity and rules
        final_predictions = self.post_processor.refine_predictions(
            addresses, raw_predictions
        )

        print("✅ Prediction completed!")
        return final_predictions


class GeographicHierarchyClusterer:
    def __init__(self):
        self.province_clusters = {}
        self.district_clusters = {}
        self.neighborhood_clusters = {}
        self.vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1,3))

    def fit_transform(self, addresses, labels):
        """Create geographic hierarchy features"""
        # Parse all addresses for geographic info
        parsed_data = []
        for addr, label in zip(addresses, labels):
            parsed = self._parse_address_components(addr)
            parsed['label'] = label
            parsed_data.append(parsed)

        # Build province-level clusters
        self._build_province_clusters(parsed_data)

        # Build district-level clusters
        self._build_district_clusters(parsed_data)

        # Build neighborhood-level clusters
        self._build_neighborhood_clusters(parsed_data)

        # Extract features
        features = self._extract_hierarchy_features(addresses)
        return features

    def transform(self, addresses):
        """Transform new addresses using fitted hierarchy"""
        return self._extract_hierarchy_features(addresses)

    def _parse_address_components(self, address):
        """Enhanced address parsing with comprehensive Turkey locations"""
        if pd.isna(address):
            address = ""
        address = str(address).lower()

        # Import comprehensive location data
        from turkey_locations import TURKEY_LOCATIONS, find_location_in_address

        # All provinces (81 il)
        provinces = {}
        for prov, variants in TURKEY_LOCATIONS['provinces'].items():
            provinces[prov] = variants

        # All districts (comprehensive list)
        districts = {}
        for category in ['izmir_districts', 'manisa_districts', 'denizli_districts',
                        'mugla_districts', 'aydin_districts', 'istanbul_districts', 'ankara_districts']:
            if category in TURKEY_LOCATIONS:
                for dist, variants in TURKEY_LOCATIONS[category].items():
                    districts[dist] = variants

        # Extract components
        components = {
            'province': None,
            'district': None,
            'neighborhood': None,
            'street': None,
            'number': None
        }

        # Extract province using comprehensive list
        for prov, variants in provinces.items():
            if any(var.lower() in address for var in variants):
                components['province'] = prov
                break

        # Extract district using comprehensive list
        for dist, variants in districts.items():
            if any(var.lower() in address for var in variants):
                components['district'] = dist
                break

        # Extract neighborhood
        mah_match = re.search(r'(\w+)\s+(mah|mahalle|mahallesi)', address)
        if mah_match:
            components['neighborhood'] = mah_match.group(1)

        # Extract street
        sok_match = re.search(r'(\d+\.?\s*)(sok|sokak|sokağı)', address)
        if sok_match:
            components['street'] = sok_match.group(1).strip()

        # Extract number
        no_match = re.search(r'(no|numara)\s*:?\s*(\d+)', address)
        if no_match:
            components['number'] = no_match.group(2)

        return components

    def _build_province_clusters(self, parsed_data):
        """Build province-level label clusters"""
        province_labels = defaultdict(list)
        for item in parsed_data:
            if item['province']:
                province_labels[item['province']].append(item['label'])

        for province, labels in province_labels.items():
            self.province_clusters[province] = set(labels)

    def _build_district_clusters(self, parsed_data):
        """Build district-level label clusters"""
        district_labels = defaultdict(list)
        for item in parsed_data:
            if item['district']:
                district_labels[item['district']].append(item['label'])

        for district, labels in district_labels.items():
            self.district_clusters[district] = set(labels)

    def _build_neighborhood_clusters(self, parsed_data):
        """Build neighborhood-level label clusters"""
        neighborhood_labels = defaultdict(list)
        for item in parsed_data:
            if item['neighborhood']:
                neighborhood_labels[item['neighborhood']].append(item['label'])

        for neighborhood, labels in neighborhood_labels.items():
            self.neighborhood_clusters[neighborhood] = set(labels)

    def _extract_hierarchy_features(self, addresses):
        """Extract features based on geographic hierarchy"""
        features = []

        for address in addresses:
            parsed = self._parse_address_components(address)

            # Geographic hierarchy features
            feature_row = []

            # Province features (one-hot)
            provinces = ['izmir', 'manisa', 'denizli', 'mugla', 'aydin', 'usak']
            for prov in provinces:
                feature_row.append(1 if parsed['province'] == prov else 0)

            # District features (one-hot for top districts)
            top_districts = ['bornova', 'konak', 'karsiyaka', 'pamukkale', 'fethiye', 'bodrum']
            for dist in top_districts:
                feature_row.append(1 if parsed['district'] == dist else 0)

            # Completeness features
            feature_row.append(1 if parsed['province'] else 0)
            feature_row.append(1 if parsed['district'] else 0)
            feature_row.append(1 if parsed['neighborhood'] else 0)
            feature_row.append(1 if parsed['street'] else 0)
            feature_row.append(1 if parsed['number'] else 0)

            # Cluster size features (proxy for label frequency)
            prov_cluster_size = len(self.province_clusters.get(parsed['province'], set()))
            dist_cluster_size = len(self.district_clusters.get(parsed['district'], set()))
            neigh_cluster_size = len(self.neighborhood_clusters.get(parsed['neighborhood'], set()))

            feature_row.extend([
                np.log1p(prov_cluster_size),
                np.log1p(dist_cluster_size),
                np.log1p(neigh_cluster_size)
            ])

            features.append(feature_row)

        return np.array(features)


class SemanticAddressMatcher:
    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 4),
            analyzer='char_wb'
        )
        self.svd = TruncatedSVD(n_components=100, random_state=42)
        self.address_embeddings = None
        self.label_centroids = {}

    def fit_transform(self, addresses, labels):
        """Create semantic embeddings and label centroids"""
        # Clean addresses
        cleaned = [self._clean_address(addr) for addr in addresses]

        # Create TF-IDF features
        tfidf_features = self.tfidf_vectorizer.fit_transform(cleaned)

        # Reduce dimensionality
        reduced_features = self.svd.fit_transform(tfidf_features)
        self.address_embeddings = reduced_features

        # Calculate label centroids
        unique_labels = np.unique(labels)
        for label in unique_labels:
            mask = np.array(labels) == label
            if np.any(mask):
                centroid = np.mean(reduced_features[mask], axis=0)
                self.label_centroids[label] = centroid

        # Generate similarity features
        similarity_features = self._generate_similarity_features(reduced_features, labels)

        return np.hstack([reduced_features, similarity_features])

    def transform(self, addresses):
        """Transform new addresses to semantic features"""
        cleaned = [self._clean_address(addr) for addr in addresses]
        tfidf_features = self.tfidf_vectorizer.transform(cleaned)
        reduced_features = self.svd.transform(tfidf_features)

        similarity_features = self._generate_similarity_features(reduced_features, None)

        return np.hstack([reduced_features, similarity_features])

    def _clean_address(self, address):
        """Advanced address cleaning"""
        if pd.isna(address):
            return ""

        address = str(address).lower()

        # Remove noise
        address = re.sub(r'[^\w\s]', ' ', address)
        address = re.sub(r'\s+', ' ', address)

        # Expand abbreviations
        replacements = {
            'mah': 'mahalle', 'sok': 'sokak', 'cd': 'cadde',
            'apt': 'apartman', 'no': 'numara', 'bl': 'blok'
        }

        for abbr, full in replacements.items():
            address = re.sub(rf'\b{abbr}\b', full, address)

        return address.strip()

    def _generate_similarity_features(self, embeddings, labels=None):
        """Generate features based on similarity to label centroids"""
        if not self.label_centroids:
            return np.zeros((len(embeddings), 20))  # Placeholder

        similarity_features = []
        centroid_list = list(self.label_centroids.values())

        for embedding in embeddings:
            similarities = []
            for centroid in centroid_list:
                sim = 1 - cosine(embedding, centroid)
                similarities.append(max(0, sim))  # Ensure non-negative

            # Top-K similarities as features
            similarities.sort(reverse=True)
            top_similarities = similarities[:20]  # Top 20

            # Pad if needed
            while len(top_similarities) < 20:
                top_similarities.append(0.0)

            similarity_features.append(top_similarities)

        return np.array(similarity_features)


class AdvancedEnsemblePredictor:
    def __init__(self):
        # Multiple diverse models
        self.models = {
            'xgb': XGBClassifier(
                n_estimators=500,
                max_depth=15,
                learning_rate=0.02,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42
            ),
            'lgb': lgb.LGBMClassifier(
                n_estimators=500,
                max_depth=15,
                learning_rate=0.02,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                random_state=42,
                verbose=-1
            ),
            'catboost': CatBoostClassifier(
                iterations=300,
                depth=12,
                learning_rate=0.02,
                random_state=42,
                verbose=False
            ),
            'rf': RandomForestClassifier(
                n_estimators=300,
                max_depth=20,
                min_samples_split=5,
                min_samples_leaf=2,
                random_state=42,
                n_jobs=-1
            )
        }

        self.scaler = StandardScaler()

    def fit(self, X, y):
        """Train ensemble of models"""
        X_scaled = self.scaler.fit_transform(X)

        for name, model in self.models.items():
            print(f"Training {name}...")
            model.fit(X_scaled, y)

        return self

    def predict(self, X):
        """Ensemble prediction with weighted voting"""
        X_scaled = self.scaler.transform(X)

        predictions = {}
        for name, model in self.models.items():
            predictions[name] = model.predict(X_scaled)

        # Weighted ensemble (based on expected performance)
        weights = {'xgb': 0.35, 'lgb': 0.3, 'catboost': 0.25, 'rf': 0.1}

        final_predictions = []
        for i in range(len(X_scaled)):
            # Weighted voting
            vote_counts = defaultdict(float)
            for name, preds in predictions.items():
                vote_counts[preds[i]] += weights[name]

            # Select prediction with highest weight
            best_prediction = max(vote_counts.items(), key=lambda x: x[1])[0]
            final_predictions.append(best_prediction)

        return np.array(final_predictions)


class AddressPostProcessor:
    def __init__(self):
        self.address_similarity_threshold = 0.85
        self.label_lookup = {}

    def configure(self, addresses, labels, address_database):
        """Configure post-processor with training data"""
        # Build exact and fuzzy lookup
        for addr, label in zip(addresses, labels):
            clean_addr = self._normalize_for_lookup(addr)
            if clean_addr not in self.label_lookup:
                self.label_lookup[clean_addr] = []
            self.label_lookup[clean_addr].append(label)

        # Keep most common label for each address
        for addr in self.label_lookup:
            self.label_lookup[addr] = Counter(self.label_lookup[addr]).most_common(1)[0][0]

    def refine_predictions(self, addresses, predictions):
        """Post-process predictions with similarity matching"""
        refined = predictions.copy()

        for i, addr in enumerate(addresses):
            clean_addr = self._normalize_for_lookup(addr)

            # Exact match lookup
            if clean_addr in self.label_lookup:
                refined[i] = self.label_lookup[clean_addr]
                continue

            # Fuzzy matching
            best_match = None
            best_score = 0

            for lookup_addr, lookup_label in list(self.label_lookup.items())[:1000]:  # Limit for performance
                score = fuzz.ratio(clean_addr, lookup_addr) / 100.0
                if score > best_score and score > self.address_similarity_threshold:
                    best_score = score
                    best_match = lookup_label

            if best_match is not None:
                refined[i] = best_match

        return refined

    def _normalize_for_lookup(self, address):
        """Normalize address for lookup"""
        if pd.isna(address):
            return ""

        address = str(address).lower()
        address = re.sub(r'[^\w\s]', '', address)
        address = re.sub(r'\s+', ' ', address)
        return address.strip()


class AddressDatabase:
    def __init__(self):
        self.address_to_labels = defaultdict(set)
        self.label_to_addresses = defaultdict(set)

    def build(self, addresses, labels):
        """Build bidirectional address-label database"""
        for addr, label in zip(addresses, labels):
            clean_addr = str(addr).lower().strip()
            self.address_to_labels[clean_addr].add(label)
            self.label_to_addresses[label].add(clean_addr)


# Main execution
def main():
    print("🚀 Advanced Address Resolution System")
    print("Target: 0.80+ score")

    # Load data
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')

    print(f"📊 Data: {len(train_df)} train, {len(test_df)} test")
    print(f"📊 Unique labels in train: {train_df['label'].nunique()}")

    # Initialize advanced resolver
    resolver = AdvancedAddressResolver()

    # Train with full dataset
    print("🎯 Training advanced resolver...")
    resolver.fit(train_df['address'].values, train_df['label'].values)

    # Predict on test set
    print("🔮 Generating advanced predictions...")
    predictions = resolver.predict(test_df['address'].values)

    # Create submission
    submission = pd.DataFrame({
        'id': test_df['id'],
        'label': predictions
    })

    # Analysis
    print(f"\n📊 Prediction Analysis:")
    print(f"Unique predictions: {len(set(predictions)):,}")
    print(f"Prediction distribution:")
    pred_counts = pd.Series(predictions).value_counts()
    print(f"  Top prediction: {pred_counts.iloc[0]:,} ({pred_counts.iloc[0]/len(predictions)*100:.2f}%)")
    print(f"  Singletons: {(pred_counts == 1).sum():,}")

    # Save
    submission.to_csv('advanced_submission.csv', index=False)
    print("✅ Advanced submission saved!")

    return submission

if __name__ == "__main__":
    submission = main()

🚀 Advanced Address Resolution System
Target: 0.80+ score
📊 Data: 848237 train, 217241 test
📊 Unique labels in train: 10390
🎯 Training advanced resolver...
🚀 Advanced Address Resolution Training Started...
1️⃣ Building geographic hierarchy...


ModuleNotFoundError: No module named 'turkey_locations'