In [None]:
!pip install faiss-cpu



In [None]:
# ================================
# STEP 1: INSTALL REQUIREMENTS
# ================================
print("üì¶ Installing packages...")

!pip install pandas numpy scikit-learn xgboost lightgbm catboost \
    fuzzywuzzy python-Levenshtein \
    transformers sentence-transformers

print("‚úÖ All packages installed!")

import pandas as pd, numpy as np, re, os
from fuzzywuzzy import fuzz
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer
import faiss
from collections import Counter, defaultdict
import warnings
warnings.filterwarnings("ignore")

üì¶ Installing packages...
‚úÖ All packages installed!


In [None]:
# ================================
# STEP 2: TURKISH NLP + GEO DB
# ================================

class TurkeyLocationDatabase:
    """Static gazetteer of Turkish provinces/districts with simple address lookups.

    Attributes:
        turkish_administrative_units: province name -> list of district names.
        city_variations: abbreviation / misspelling -> canonical name.
        postal_prefixes: first two postal-code digits -> province name.
        provinces: province name -> list of accepted spellings.
        districts: district name -> list of accepted spellings.
    """

    # Turkish letters mapped to their ASCII look-alikes so that "kadıköy" and
    # "kadikoy" compare equal; U+0307 is the combining dot left behind when
    # "İ" is lower-cased by Python.
    _ASCII_FOLD = str.maketrans({
        "\u00e7": "c", "\u011f": "g", "\u0131": "i", "\u00f6": "o",
        "\u015f": "s", "\u00fc": "u", "\u00e2": "a", "\u00ee": "i",
        "\u00fb": "u", "\u0307": None,
    })

    def __init__(self):
        # Provinces with their major districts.
        # NOTE(review): aksaray, ardahan and duzce appear in postal_prefixes
        # below but have no entry of their own here.
        self.turkish_administrative_units = {
            'adana': ['seyhan', 'yuregir', 'cukurova', 'sari√ßam', 'karaisali', 'karatas'],
            'adiyaman': ['merkez', 'besni', '√ßelikhan', 'gerger', 'golbasi', 'kahta', 'samsat', 'sincik', 'tut'],
            'afyonkarahisar': ['merkez', 'sandikli', 'dinar', 'bolvadin', '√ßay', 'dazkiri', 'emirdaƒü'],
            'agri': ['merkez', 'dogubayazit', 'patnos', 'tutak', 'diyadin', 'eleskirt', 'hamur', 'taslicay'],
            'amasya': ['merkez', 'merzifon', 'suluova', 'tasova', 'gokdere', 'hamam√∂z√º'],
            'ankara': ['cankaya', 'kecioren', 'yenimahalle', 'mamak', 'altindag', 'sincan', 'etimesgut',
                      'golbasi', 'polatli', 'pursaklar', 'akyurt', 'aya≈ü', 'bala', 'beypazari'],
            'antalya': ['muratpasa', 'kepez', 'konyaalti', 'alanya', 'manavgat', 'serik', 'kas', 'kemer',
                       'elmali', 'finike', 'gazipa≈üa', 'demre', 'kumluca', 'akseki'],
            'artvin': ['merkez', 'hopa', 'bor√ßka', 'arhavi', 'yusufeli', '≈üav≈üat', 'ardanu√ß', 'murgul'],
            'aydin': ['efeler', 'nazilli', 's√∂ke', 'ku≈üadasi', 'didim', '√ßine', 'bozdogan', 'germencik'],
            'balikesir': ['karesi', 'altieylul', 'bandirma', 'edremit', 'ayvalik', 'burhaniye', 'erdek'],
            'bartin': ['merkez', 'amasra', 'kurucasile', 'ulus'],
            'batman': ['merkez', 'kozluk', 'besiri', 'gercus', 'hasankeyf', 'sason'],
            'bayburt': ['merkez', 'aydƒ±ntepe', 'demirozu'],
            'bilecik': ['merkez', 'bozuyuk', 'sogut', 'osmaneli', 'golpazari', 'inhisar', 'pazaryeri', 'yenipazar'],
            'bingol': ['merkez', 'genc', 'karliova', 'solhan', 'adakli', 'kigi', 'yayladere', 'yedisu'],
            'bitlis': ['merkez', 'tatvan', 'g√ºroymak', 'hizan', 'mutki', 'adilcevaz', 'ahlat'],
            'bolu': ['merkez', 'd√ºzce', 'goynuk', 'mudurnu', 'mengen', 'gerede', 'kibriscik', 'seben'],
            'burdur': ['merkez', 'bucak', 'golhisar', 'yesilova', 'karamanli', 'aglasun', 'altinyayla'],
            'bursa': ['osmangazi', 'nilufer', 'yildirim', 'mudanya', 'gemlik', 'inegol', 'orhaneli',
                     'buyukorhan', 'harmancik', 'iznik', 'karacabey', 'keles', 'kestel', 'mustafakemalpasa'],
            'canakkale': ['merkez', 'gelibolu', 'biga', '√ßanakkale', 'ayvacik', 'bayrami√ß', 'bozcaada'],
            'cankiri': ['merkez', '√ßerkes', 'ilgaz', 'kur≈üunlu', 'orta', 'atkaracalar', 'bayram√∂ren'],
            'corum': ['merkez', 'osmancik', 'iskilip', 'kargi', 'dodurga', 'alaca', 'bayat', 'bogazkale'],
            'denizli': ['pamukkale', 'merkezefendi', 'honaz', 'tavas', '√ßal', 'acipayam', 'buldan', '√ßameli'],
            'diyarbakir': ['baglar', 'kayapinar', 'sur', 'yenisehir', 'bismil', '√ßermik', '√ßinar', 'dicle'],
            'edirne': ['merkez', 'kesan', 'uzunkopru', 'ipsala', 'havsa', 'enez', 'lalapa≈üa', 'meri√ß', 's√ºloglu'],
            'elazig': ['merkez', 'karakocan', 'keban', 'palu', 'sivrice', 'aƒüƒ±n', 'alacakaya', 'arƒ±cak', 'baskil'],
            'erzincan': ['merkez', '√ºz√ºml√º', 'refahiye', 'tercan', 'kemah', '√ßayƒ±rlƒ±', 'ili√ß', 'kemaliye', 'otlukbeli'],
            'erzurum': ['yakutiye', 'palandoken', 'aziziye', 'hinis', 'pasinler', 'a≈ükale', '√ßat', 'horasan'],
            'eskisehir': ['tepebasi', 'odunpazari', 'sivrihisar', '√ßifteler', 'alpu', 'beylikova', '√ßifteler'],
            'gaziantep': ['≈üahinbey', '≈üehitkamil', 'nizip', 'islahiye', 'nurdagi', 'araban', 'karkamƒ±≈ü'],
            'giresun': ['merkez', 'bulancak', 'espiye', 'gorele', 'tirebolu', 'alucra', '√ßamoluk', '√ßanak√ßƒ±'],
            'g√ºm√º≈ühane': ['merkez', 'kelkit', '≈üiran', 'torul', 'kose', 'k√ºrt√ºn'],
            'hakkari': ['merkez', 'y√ºksekova', '≈üemdinli', '√ßukurca'],
            'hatay': ['antakya', 'defne', 'arsuz', 'dortyol', 'iskenderun', 'kirikhan', 'payas', 'reyhanli', 'samandaƒü'],
            'igdir': ['merkez', 'tuzluca', 'karakoyunlu', 'aralik'],
            'isparta': ['merkez', 'yalva√ß', 'ke√ßiborlu', '≈üarkikaraaƒüa√ß', 'atabey', 'eƒüirdir', 'gelendost'],
            'istanbul': ['fatih', 'beyoglu', 'uskudar', 'kadikoy', 'besiktas', 'sisli', 'bakirkoy', 'zeytinburnu',
                        'esenler', 'gaziosmanpasa', 'kagithane', 'sariyer', 'maltepe', 'pendik', '√ºmraniye',
                        'beykoz', 'beylikd√ºz√º', 'esenyurt', 'avcƒ±lar', 'k√º√ß√ºk√ßekmece', 'b√ºy√ºk√ßekmece',
                        'bah√ßelievler', 'baƒücƒ±lar', 'g√ºng√∂ren', 'sultangazi', 'arnavutk√∂y', 'ba≈üak≈üehir',
                        'bayrampa≈üa', 'be≈üikta≈ü', 'ey√ºpsultan', 'kaƒüƒ±thane', 'kartal', 'silivri', 'sultanbeyli'],
            'izmir': ['konak', 'bornova', '√ßigli', 'kar≈üƒ±yaka', 'bayrakli', 'gaziemir', 'bal√ßova', 'narlidere',
                     'buca', 'menderes', 'torbalƒ±', 'menemen', 'fo√ßa', 'urla', '√ße≈üme', '√∂demi≈ü', 'tire',
                     'bergama', 'dikili', 'karaburun', 'kƒ±nƒ±k', 'kiraz', 'sel√ßuk', 'aliaƒüa'],
            'kahramanmaras': ['oniki≈üubat', 'dulkadiroƒülu', 'pazarcƒ±k', 'elbistan', 'af≈üin', 'andƒ±rƒ±n'],
            'karab√ºk': ['merkez', 'safranbolu', 'yenice', 'eskipazar', 'ovacƒ±k', 'eflani'],
            'karaman': ['merkez', 'ermenek', 'kazƒ±mkarabekir', 'ba≈üyayla', 'ayrancƒ±', 'sarƒ±veliler'],
            'kars': ['merkez', 'kagizman', 'ardahan', 'arpa√ßay', 'selim', 'akyaka', 'digor', 'susuz'],
            'kastamonu': ['merkez', 'ta≈ük√∂pr√º', 'sinop', 'boyabat', '√ßankƒ±rƒ±', 'abana', 'aƒülƒ±', 'ara√ß'],
            'kayseri': ['melikgazi', 'kocasinan', 'talas', 'develi', 'yahyalƒ±', 'b√ºnyan', 'felahiye', 'hacƒ±lar'],
            'kƒ±rƒ±kkale': ['merkez', 'yah≈üihan', 'keskin', 'sulakyurt', 'bah≈üƒ±lƒ±', 'balƒ±≈üeyh', '√ßelebi', 'delice'],
            'kirklareli': ['merkez', 'babaeski', 'luleburgaz', 'pƒ±narhisar', 'demirk√∂y', 'kof√ßaz', 'pehlivank√∂y', 'vize'],
            'kirsehir': ['merkez', 'kaman', 'mucur', '√ßi√ßekdaƒü', 'ak√ßakent', 'akpƒ±nar', 'boztepe'],
            'kilis': ['merkez', 'musabeyli', 'polateli', 'elbeyli'],
            'kocaeli': ['izmit', 'gebze', 'darƒ±ca', 'k√∂rfez', 'g√∂lc√ºk', 'kandƒ±ra', 'ba≈üiskele', '√ßayƒ±rova', 'derince', 'dilovasƒ±', 'kartepe'],
            'konya': ['sel√ßuklu', 'meram', 'karatay', 'eregli', 'ak≈üehir', 'bey≈üehir', '√ßumra', 'ilgƒ±n', 'kulu'],
            'k√ºtahya': ['merkez', 'tav≈üanlƒ±', 'gediz', 'simav', 'emet', 'altƒ±nta≈ü', 'aslanapa', '√ßavdarhisar'],
            'malatya': ['ye≈üilyurt', 'battalgazi', 'ak√ßadaƒü', 'darende', 'doƒüan≈üehir', 'hekimhan', 'kuluncak', 'p√ºt√ºrge'],
            'manisa': ['yunusemre', '≈üehzadeler', 'turgutlu', 'akhisar', 'salihli', 'ala≈üehir', 'demirci', 'g√∂rdes'],
            'mardin': ['artuklu', 'midyat', 'kƒ±zƒ±ltepe', 'nusaybin', '√∂merli', 'darge√ßit', 'derik', 'mazƒ±daƒüƒ±'],
            'mersin': ['yeni≈üehir', 'mezitli', 'toroslar', 'akdeniz', 'tarsus', 'erdemli', 'anamur', 'aydƒ±ncƒ±k'],
            'muƒüla': ['mente≈üe', 'bodrum', 'marmaris', 'fethiye', 'milas', 'ortaca', '√ßine', 'dat√ßa', 'k√∂yceƒüiz'],
            'mu≈ü': ['merkez', 'bulanƒ±k', 'malazgirt', 'varto', 'hask√∂y', 'korkut'],
            'nev≈üehir': ['merkez', '√ºrg√ºp', 'avanos', 'g√∂reme', 'derinkuyu', 'acƒ±g√∂l', '√ßat', 'hacƒ±bekta≈ü'],
            'niƒüde': ['merkez', 'bor', 'ulukƒ±≈üla', '√ßamardƒ±', 'altunhisar', '√ßiftlik'],
            'ordu': ['altƒ±nordu', '√ºnye', 'fatsa', 'per≈üembe', '√ßayba≈üƒ±', 'akku≈ü', 'aybasti', '√ßama≈ü'],
            'osmaniye': ['merkez', 'kadirli', 'd√ºzi√ßi', 'bah√ße', 'hasanbeyli', 'sumbas', 'toprakkale'],
            'rize': ['merkez', '√ßayeli', 'arde≈üen', 'pazar', 'fƒ±ndƒ±klƒ±', 'g√ºneysu', 'hem≈üin', 'ikizdere'],
            'sakarya': ['adapazarƒ±', 'serdivan', 'akyazƒ±', 'karasu', 'hendek', 'arifiye', 'erenler', 'ferizli'],
            'samsun': ['ilkadƒ±m', 'canik', 'atakum', 'bafra', '√ßar≈üamba', 'terme', 'ala√ßam', 'asarcƒ±k'],
            '≈üanlƒ±urfa': ['eyy√ºbiye', 'haliliye', 'karak√∂pr√º', 'viran≈üehir', 'suru√ß', 'ak√ßakale', 'birecik'],
            'siirt': ['merkez', 'kurtalan', '≈üirvan', 'baykan', 'pervari', 'aydƒ±nlar', 'eruh'],
            'sinop': ['merkez', 'boyabat', 'ayancƒ±k', 't√ºrkeli', 'erfelek', 'duraƒüan', 'gerze'],
            '≈üƒ±rnak': ['merkez', 'cizre', 'silopi', 'idil', 'g√º√ßl√ºkonak', 'beyt√º≈ü≈üebap', 'uludere'],
            'sivas': ['merkez', '≈üarkƒ±≈üla', 'yƒ±ldƒ±zeli', 'su≈üehri', 'divriƒüi', 'gemerek', 'g√ºr√ºn', 'hafik'],
            'tekirdaƒü': ['s√ºleymanpa≈üa', '√ßorlu', '√ßerkezk√∂y', 'hayrabolu', 'kapaklƒ±', 'malkara', 'marmaraereƒülisi', 'muratlƒ±', 'saray', '≈üark√∂y'],
            'tokat': ['merkez', 'turhal', 'erbaa', 'niksar', 're≈üadiye', 'almus', 'artova', 'ba≈ü√ßiftlik'],
            'trabzon': ['ortahisar', 'ak√ßaabat', 'vakfƒ±kebir', 'of', 'araklƒ±', 'arsin', 'be≈üikd√ºz√º', '√ßaykara'],
            'tunceli': ['merkez', '√ßemi≈ügezek', 'hozat', 'mazgirt', 'ovacƒ±k', 'pertek', 'p√ºl√ºm√ºr'],
            'u≈üak': ['merkez', 'banaz', 'e≈üme', 'karahallƒ±', 'sivaslƒ±', 'ulubey'],
            'van': ['ipekyolu', 'tu≈üba', 'edremit', 'geva≈ü', 'muradiye', 'bah√ßesaray', 'ba≈ükale', '√ßaldƒ±ran'],
            'yalova': ['merkez', '√ßiftlikk√∂y', 'altƒ±nova', 'armutlu', '√ßƒ±narcƒ±k', 'termal'],
            'yozgat': ['merkez', 'sorgun', 'boƒüazlƒ±yan', 'yerk√∂y', '≈üefaatli', 'akdaƒümadeni', 'aydƒ±ncƒ±k'],
            'zonguldak': ['merkez', 'ereƒüli', '√ßaycuma', 'devrek', 'g√∂k√ßebey', 'alaplƒ±', 'kilimli']
        }

        # City abbreviations and common variations.
        # Not consulted by find_province / find_district: as whole words the
        # three-letter codes ('ada', 'den', 'man', ...) are too ambiguous.
        self.city_variations = {
            'ist': 'istanbul', 'izm': 'izmir', 'ank': 'ankara', 'bur': 'bursa',
            'ada': 'adana', 'ant': 'antalya', 'mer': 'mersin', 'gaz': 'gaziantep',
            'kon': 'konya', 'kay': 'kayseri', 'den': 'denizli', 'man': 'manisa',
            'tra': 'trabzon', 'sam': 'samsun', 'esk': 'eskisehir', 'mal': 'malatya',
            # Common misspellings
            'uskuar': 'uskudar', 'kadkoy': 'kadikoy', 'besiktas': 'besiktas',
            'galtsaray': 'galatasaray', 'taksm': 'taksim', 'sisli': 'sisli',
            'bagcilar': 'bagcilar', 'umranye': 'umraniye', 'pendik': 'pendik'
        }

        # Postal code prefixes (first 2 digits)
        self.postal_prefixes = {
            '01': 'adana', '02': 'adiyaman', '03': 'afyonkarahisar', '04': 'agri',
            '05': 'amasya', '06': 'ankara', '07': 'antalya', '08': 'artvin',
            '09': 'aydin', '10': 'balikesir', '11': 'bilecik', '12': 'bingol',
            '13': 'bitlis', '14': 'bolu', '15': 'burdur', '16': 'bursa',
            '17': 'canakkale', '18': 'cankiri', '19': 'corum', '20': 'denizli',
            '21': 'diyarbakir', '22': 'edirne', '23': 'elazig', '24': 'erzincan',
            '25': 'erzurum', '26': 'eskisehir', '27': 'gaziantep', '28': 'giresun',
            '29': 'g√ºm√º≈ühane', '30': 'hakkari', '31': 'hatay', '32': 'isparta',
            '33': 'mersin', '34': 'istanbul', '35': 'izmir', '36': 'kars',
            '37': 'kastamonu', '38': 'kayseri', '39': 'kirklareli', '40': 'kirsehir',
            '41': 'kocaeli', '42': 'konya', '43': 'k√ºtahya', '44': 'malatya',
            # '46' was missing from the sequence.
            '45': 'manisa', '46': 'kahramanmaras', '47': 'mardin', '48': 'mugla', '49': 'mu≈ü',
            '50': 'nev≈üehir', '51': 'niƒüde', '52': 'ordu', '53': 'rize',
            '54': 'sakarya', '55': 'samsun', '56': 'siirt', '57': 'sinop',
            '58': 'sivas', '59': 'tekirdaƒü', '60': 'tokat', '61': 'trabzon',
            '62': 'tunceli', '63': '≈üanlƒ±urfa', '64': 'u≈üak', '65': 'van',
            '66': 'yozgat', '67': 'zonguldak', '68': 'aksaray', '69': 'bayburt',
            '70': 'karaman', '71': 'kƒ±rƒ±kkale', '72': 'batman', '73': '≈üƒ±rnak',
            '74': 'bartin', '75': 'ardahan', '76': 'igdir', '77': 'yalova',
            '78': 'karab√ºk', '79': 'kilis', '80': 'osmaniye', '81': 'd√ºzce'
        }

        # Name -> accepted spellings; these are the tables the lookup methods use.
        self.provinces = {
            name: self._spellings(name) for name in self.turkish_administrative_units
        }
        self.districts = {}
        for district_names in self.turkish_administrative_units.values():
            for name in district_names:
                # 'merkez' is listed under most provinces, so it cannot
                # identify a district on its own.
                if name != 'merkez':
                    self.districts.setdefault(name, self._spellings(name))

        # Folded spelling -> canonical name, for O(1) lookups per address token.
        self._province_lookup = self._build_lookup(self.provinces)
        self._district_lookup = self._build_lookup(self.districts)

    @classmethod
    def _fold(cls, text):
        """Lower-case ``text`` and replace Turkish letters with ASCII look-alikes."""
        return str(text).replace("\u0130", "i").lower().translate(cls._ASCII_FOLD)

    @classmethod
    def _spellings(cls, name):
        """Return ``name`` followed by its ASCII-folded form when that differs."""
        return list(dict.fromkeys([name, cls._fold(name)]))

    @classmethod
    def _build_lookup(cls, table):
        """Invert ``{name: [spellings]}`` into ``{folded spelling: name}`` (first wins)."""
        lookup = {}
        for name, spellings in table.items():
            for spelling in spellings:
                lookup.setdefault(cls._fold(spelling), name)
        return lookup

    def _last_mention(self, address, lookup):
        """Return the known name whose whole-word mention comes last in ``address``."""
        if not isinstance(address, str):
            return None
        # Whole-word matching: substring tests would find 'van' in 'avanos'
        # or 'of' in 'ofis'.
        for token in reversed(re.findall(r"\w+", self._fold(address))):
            if token in lookup:
                return lookup[token]
        return None

    def find_province(self, address):
        """Return the province named in ``address`` or ``None``.

        Matching is case-insensitive, ignores Turkish diacritics and works on
        whole words. If several provinces are mentioned, the last one wins.
        NOTE(review): "last mention wins" assumes addresses name the province
        towards the end; confirm against the data.
        """
        return self._last_mention(address, self._province_lookup)

    def find_district(self, address):
        """Return the district named in ``address`` or ``None``.

        Same matching rules as :meth:`find_province`; the generic district
        name 'merkez' is never returned.
        """
        return self._last_mention(address, self._district_lookup)

In [None]:
# ================================
# STEP 3: NORMALIZER
# ================================
class AddressNormalizer:
    """Canonicalises free-text addresses before feature extraction."""

    # (compiled pattern, replacement) pairs, compiled once instead of per call.
    _ABBREVIATIONS = tuple(
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"\bmah\b", "mahalle"),
            (r"\bsok\b", "sokak"),
            (r"\bcd\b", "cadde"),
            (r"\bapt\b", "apartman"),
            (r"\bno\b", "numara"),
            (r"\bd\b", "daire"),
        )
    )

    def normalize(self, address: str):
        """Return a lower-cased, punctuation-free address with abbreviations expanded.

        Args:
            address: raw address; missing values (``None`` / NaN) are accepted.

        Returns:
            The normalised string, or ``""`` for a missing value.
        """
        if pd.isna(address):
            return ""
        # Python lower-cases "İ" (U+0130) to "i" + U+0307. The combining dot is
        # not a \w character, so the punctuation regex below would turn
        # "İstanbul" into "i stanbul". Map it to a plain "i" instead.
        text = str(address).replace("\u0130", "i").lower().replace("\u0307", "")
        text = re.sub(r"[^\w\s]", " ", text)
        text = re.sub(r"\s+", " ", text)
        for pattern, replacement in self._ABBREVIATIONS:
            text = pattern.sub(replacement, text)
        return text.strip()

In [None]:
# ================================
# STEP 3: NORMALIZER
# ================================
# NOTE(review): this cell repeats the AddressNormalizer cell directly above it;
# one of the two cells should be deleted.
class AddressNormalizer:
    """Canonicalises free-text addresses before feature extraction."""

    # (compiled pattern, replacement) pairs, compiled once instead of per call.
    _ABBREVIATIONS = tuple(
        (re.compile(pattern), replacement)
        for pattern, replacement in (
            (r"\bmah\b", "mahalle"),
            (r"\bsok\b", "sokak"),
            (r"\bcd\b", "cadde"),
            (r"\bapt\b", "apartman"),
            (r"\bno\b", "numara"),
            (r"\bd\b", "daire"),
        )
    )

    def normalize(self, address: str):
        """Return a lower-cased, punctuation-free address with abbreviations expanded.

        Args:
            address: raw address; missing values (``None`` / NaN) are accepted.

        Returns:
            The normalised string, or ``""`` for a missing value.
        """
        if pd.isna(address):
            return ""
        # Python lower-cases "İ" (U+0130) to "i" + U+0307. The combining dot is
        # not a \w character, so the punctuation regex below would turn
        # "İstanbul" into "i stanbul". Map it to a plain "i" instead.
        text = str(address).replace("\u0130", "i").lower().replace("\u0307", "")
        text = re.sub(r"[^\w\s]", " ", text)
        text = re.sub(r"\s+", " ", text)
        for pattern, replacement in self._ABBREVIATIONS:
            text = pattern.sub(replacement, text)
        return text.strip()

In [None]:
# ================================
# STEP 4: FEATURE EXTRACTOR
#  - TF-IDF + SVD
#  - BERT Embeddings
#  - Geographic one-hot
# ================================

class FeatureExtractor:
    """Turns raw address strings into one scaled feature matrix.

    Three blocks are concatenated column-wise: character n-gram TF-IDF reduced
    by truncated SVD, sentence-transformer embeddings, and a one-hot
    geographic block (see :meth:`_geo`).
    """

    # Vocabulary of the geographic one-hot block; the order fixes the columns.
    GEO_PROVINCES = ("istanbul", "ankara", "izmir", "bursa", "antalya",
                     "manisa", "mugla", "aydin", "denizli")
    GEO_DISTRICTS = ("bornova", "konak", "kar≈üƒ±yaka", "bayraklƒ±", "buca",
                     "fatih", "kadƒ±k√∂y")

    # Turkish letters -> ASCII look-alikes. The location database mixes both
    # spellings, so names are compared in folded form.
    _ASCII_FOLD = str.maketrans({
        "\u00e7": "c", "\u011f": "g", "\u0131": "i", "\u00f6": "o",
        "\u015f": "s", "\u00fc": "u", "\u0307": None,
    })

    def __init__(self):
        self.tfidf = TfidfVectorizer(max_features=4000, ngram_range=(1, 4), analyzer="char_wb")
        self.svd = TruncatedSVD(n_components=150, random_state=42)
        self.scaler = StandardScaler()
        # Multilingual Sentence-BERT model, used here for Turkish text.
        self.bert = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
        self.geo = TurkeyLocationDatabase()

    def fit_transform(self, addresses):
        """Fit TF-IDF, SVD and the scaler on ``addresses`` and return their features."""
        return self.scaler.fit_transform(self._stack(addresses, fit=True))

    def transform(self, addresses):
        """Return features for ``addresses`` using the already fitted transformers."""
        return self.scaler.transform(self._stack(addresses, fit=False))

    def _stack(self, addresses, fit):
        """Build the unscaled feature matrix shared by fit_transform / transform."""
        normalizer = AddressNormalizer()  # one instance for the whole batch
        norm = [normalizer.normalize(a) for a in addresses]
        if fit:
            tfidf_svd = self.svd.fit_transform(self.tfidf.fit_transform(norm))
        else:
            tfidf_svd = self.svd.transform(self.tfidf.transform(norm))
        bert_emb = self.bert.encode(norm, show_progress_bar=True)
        geo_feats = [self._geo(addr) for addr in norm]
        return np.hstack([tfidf_svd, bert_emb, geo_feats])

    @classmethod
    def _fold(cls, name):
        """ASCII-fold ``name`` for comparison; ``None`` stays ``None``."""
        if name is None:
            return None
        return str(name).replace("\u0130", "i").lower().translate(cls._ASCII_FOLD)

    def _geo(self, address):
        """Return the one-hot list: 9 province flags followed by 7 district flags.

        The names returned by the location database are compared after ASCII
        folding, so e.g. a database entry spelled with Turkish letters still
        hits the ASCII ``"mugla"`` column (an exact ``==`` never matched).
        """
        province = self._fold(self.geo.find_province(address))
        district = self._fold(self.geo.find_district(address))
        province_flags = [
            1 if province is not None and province == self._fold(candidate) else 0
            for candidate in self.GEO_PROVINCES
        ]
        district_flags = [
            1 if district is not None and district == self._fold(candidate) else 0
            for candidate in self.GEO_DISTRICTS
        ]
        return province_flags + district_flags

In [None]:
# ================================
# STEP 5: ENSEMBLE MODEL
# ================================
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier

class EnsemblePredictor:
    """Weighted hard-voting ensemble of RandomForest, XGBoost, LightGBM and CatBoost."""

    # Vote weight per model, keyed like ``self.models``.
    WEIGHTS = {"rf": 0.2, "xgb": 0.3, "lgb": 0.3, "cat": 0.2}

    def __init__(self):
        self.models = {
            "rf": RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42),
            "xgb": XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.05, eval_metric="mlogloss"),
            "lgb": lgb.LGBMClassifier(n_estimators=200, max_depth=10, learning_rate=0.05),
            "cat": CatBoostClassifier(iterations=200, depth=8, learning_rate=0.05, verbose=0)
        }
        # Sorted original labels; position i is the label encoded as i.
        self.classes_ = None

    def fit(self, X, y):
        """Fit every model on ``X`` with labels ``y``.

        Labels are encoded to ``0..K-1`` first: XGBClassifier rejects label
        sets that are not exactly that range. ``predict`` decodes them again.
        """
        self.classes_, encoded = np.unique(np.asarray(y).ravel(), return_inverse=True)
        encoded = np.asarray(encoded).ravel()
        for model in self.models.values():
            model.fit(X, encoded)

    def predict(self, X):
        """Return the weighted majority label for each row of ``X``.

        Ties go to the label voted first in ``self.models`` order.
        """
        names = list(self.models)
        # ravel(): CatBoost returns multiclass predictions with shape (n, 1);
        # indexing that row-wise yields unhashable arrays.
        columns = [np.asarray(self.models[name].predict(X)).ravel().tolist() for name in names]

        final = []
        for row in zip(*columns):
            votes = defaultdict(float)
            for name, label in zip(names, row):
                votes[label] += self.WEIGHTS[name]
            final.append(max(votes.items(), key=lambda item: item[1])[0])

        classes = getattr(self, "classes_", None)
        if classes is None:
            # Models were fitted outside fit(); nothing to decode.
            return np.array(final)
        return classes[np.asarray(final, dtype=int)]

In [None]:
# ================================
# STEP 6: ADVANCED DEDUP (FAISS)
# ================================
class Deduplicator:
    """Near-duplicate address lookup over normalised sentence embeddings (FAISS)."""

    MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

    def __init__(self, emb_dim=384):
        # Inner product on L2-normalised vectors equals cosine similarity.
        self.index = faiss.IndexFlatIP(emb_dim)
        self.embeddings = None
        self.ids = []
        self._encoder = None  # loaded on first use, then reused

    def _encode(self, addresses):
        """Normalise ``addresses`` and return unit-length float32 embeddings."""
        if getattr(self, "_encoder", None) is None:
            # Loading the transformer is expensive; do it once per instance
            # rather than on every build_index / find_duplicates call.
            self._encoder = SentenceTransformer(self.MODEL_NAME)
        normalizer = AddressNormalizer()
        norm = [normalizer.normalize(a) for a in addresses]
        emb = self._encoder.encode(norm, normalize_embeddings=True)
        return np.ascontiguousarray(emb, dtype=np.float32)

    def build_index(self, addresses, ids):
        """Index ``addresses``; ``ids[i]`` is the identifier reported for ``addresses[i]``.

        Raises:
            ValueError: if ``addresses`` and ``ids`` differ in length.
        """
        if len(addresses) != len(ids):
            raise ValueError("addresses and ids must have the same length")
        # Start from an empty index so a rebuild cannot leave vectors from an
        # earlier call misaligned with the new ``ids``.
        self.index.reset()
        self.embeddings = self._encode(addresses)
        self.index.add(self.embeddings)
        self.ids = ids

    def find_duplicates(self, query_addresses, th=0.90, skip_self=True):
        """Map query positions to the id of a sufficiently similar indexed address.

        Args:
            query_addresses: addresses to look up.
            th: similarity a neighbour must exceed to count as a duplicate.
            skip_self: when True (default, original behaviour) the best hit is
                ignored and the runner-up is compared with ``th`` - appropriate
                when the queries themselves are in the index. Pass False to use
                the best hit.
                NOTE(review): AdvancedHybridResolver.predict queries with
                addresses that were not indexed; confirm which mode it needs.

        Returns:
            dict ``{query position: indexed id}``; positions without a
            duplicate are absent.
        """
        if len(query_addresses) == 0 or self.index.ntotal == 0:
            return {}
        q_emb = self._encode(query_addresses)
        sims, neighbours = self.index.search(q_emb, 2)  # top-2
        rank = 1 if skip_self else 0
        dup_map = {}
        for i, (sim, idx) in enumerate(zip(sims, neighbours)):
            # FAISS pads missing neighbours with index -1.
            if idx[rank] >= 0 and sim[rank] > th:
                dup_map[i] = self.ids[idx[rank]]
        return dup_map

In [None]:
# ================================
# STEP 7: PIPELINE
# ================================

class AdvancedHybridResolver:
    """End-to-end pipeline: feature extraction, ensemble prediction, duplicate correction."""

    def __init__(self):
        self.fe = FeatureExtractor()
        self.ensemble = EnsemblePredictor()
        self.dedup = Deduplicator()
        # Training id -> training label, filled by fit().
        self.label_by_id = {}

    def fit(self, X, y, ids):
        """Fit the features and the ensemble on ``X``/``y`` and index ``X`` under ``ids``."""
        feats = self.fe.fit_transform(X)
        self.ensemble.fit(feats, y)
        self.dedup.build_index(X, ids)
        self.label_by_id = dict(zip(np.asarray(ids).tolist(), y))

    def predict(self, X):
        """Predict a label per address in ``X``.

        Where the deduplicator reports a near-duplicate training address, the
        prediction is replaced by that training address's known label.
        """
        feats = self.fe.transform(X)
        preds = np.asarray(self.ensemble.predict(feats))
        if preds.dtype.kind in "US":
            # Fixed-width string arrays would truncate longer labels on assignment.
            preds = preds.astype(object)
        dup_map = self.dedup.find_duplicates(X)
        labels = getattr(self, "label_by_id", {})
        missing = object()
        for i, dup_id in dup_map.items():
            # dup_id is a *training* id, so it must be resolved through the
            # training labels - not used as a position in the test predictions.
            label = labels.get(dup_id, missing)
            if label is not missing:
                preds[i] = label  # duplicate correction
        return preds

In [None]:
# ================================
# STEP 8: EXECUTION (SAMPLE)
# ================================
def run_solution(train_path="train.csv", test_path="test.csv",
                 output_path="hybrid_submission.csv"):
    """Train on ``train_path``, predict ``test_path`` and write the submission CSV.

    Both input files need ``address`` columns and the training file a
    ``label`` column. A missing ``id`` column is filled with the row number,
    as the ``__main__`` cell does.

    Returns:
        The submission DataFrame with columns ``id`` and ``label``.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    for frame in (train, test):
        if "id" not in frame.columns:
            frame["id"] = np.arange(len(frame))

    resolver = AdvancedHybridResolver()
    resolver.fit(train["address"].values, train["label"].values, train["id"].values)
    preds = resolver.predict(test["address"].values)

    sub = pd.DataFrame({"id": test["id"], "label": preds})
    sub.to_csv(output_path, index=False)
    print(f"‚úÖ Submission saved {output_path}")
    return sub

# Run
# submission=run_solution()

In [None]:
if __name__ == "__main__":
    print("üöÄ Teknofest Hybrid Address Resolver Ba≈üladƒ±")

    # Load the data. Start from None so a partial load is detected below
    # instead of surfacing later as a NameError.
    train = test = None
    try:
        train = pd.read_csv("train.csv")
        test = pd.read_csv("test.csv")
    except FileNotFoundError:
        print("‚ùå train.csv ve test.csv dosyalarƒ±nƒ± y√ºklemen gerekiyor!")
        # Fallback: ask for an upload through the google.colab module.
        from google.colab import files
        print("üì§ L√ºtfen train.csv ve test.csv dosyalarƒ±nƒ± y√ºkleyin:")
        uploaded = files.upload()
        for k in uploaded.keys():
            if "train" in k: train=pd.read_csv(k)
            if "test" in k: test=pd.read_csv(k)

    if train is None or test is None:
        raise FileNotFoundError(
            "Both train.csv and test.csv are required; at least one is still missing."
        )

    # Add an id column when the file does not provide one
    if "id" not in train.columns:
        train["id"] = np.arange(len(train))
    if "id" not in test.columns:
        test["id"] = np.arange(len(test))

    # Build the model
    resolver = AdvancedHybridResolver()

    # Train
    print("üéØ Model eƒüitiliyor...")
    resolver.fit(train["address"].values, train["label"].values, train["id"].values)

    # Predict
    print("üîÆ Test verisi tahmin ediliyor...")
    predictions = resolver.predict(test["address"].values)

    # Save the submission
    submission = pd.DataFrame({"id": test["id"], "label": predictions})
    submission.to_csv("hybrid_submission.csv", index=False)
    print("‚úÖ hybrid_submission.csv dosyasƒ± kaydedildi")

    # Look at the first rows
    print(submission.head())

üöÄ Teknofest Hybrid Address Resolver Ba≈üladƒ±
üéØ Model eƒüitiliyor...


Batches:   0%|          | 0/26508 [00:00<?, ?it/s]