In [None]:
import pandas as pd
import numpy as np
from thefuzz import fuzz
from rapidfuzz import fuzz as rfuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import jellyfish
import re
import unicodedata
from collections import defaultdict
from tqdm import tqdm

In [56]:
# Foreign country codes which are not canton abbreviations at the same time or "CH"
foreign_country_codes = [
    'AD', 'AE', 'AF', 'AL', 'AM', 'AO', 'AQ', 'AS', 'AT', 'AU', 'AW', 'AX', 'AZ',
    'BA', 'BB', 'BD', 'BF', 'BG', 'BH', 'BI', 'BJ', 'BM', 'BN', 'BO', 'BQ', 'BR', 'BT', 'BV', 'BW', 'BY', 'BZ',
    'CA', 'CC', 'CD', 'CF', 'CG', 'CI', 'CK', 'CL', 'CM', 'CN', 'CO', 'CR', 'CU', 'CV', 'CW', 'CX', 'CY', 'CZ',
    'DE', 'DJ', 'DK', 'DM', 'DO', 'DZ',
    'EC', 'EE', 'EG', 'EH', 'ER', 'ES', 'ET',
    'FI', 'FJ', 'FK', 'FM', 'FO',
    'GA', 'GB', 'GD', 'GF', 'GG', 'GH', 'GI', 'GM', 'GN', 'GP', 'GQ', 'GS', 'GT', 'GU', 'GW', 'GY',
    'HK', 'HM', 'HN', 'HR', 'HT', 'HU',
    'ID', 'IE', 'IL', 'IM', 'IN', 'IO', 'IQ', 'IR', 'IS', 'IT',
    'JE', 'JM', 'JO', 'JP',
    'KE', 'KG', 'KH', 'KI', 'KM', 'KN', 'KP', 'KR', 'KW', 'KY', 'KZ',
    'LA', 'LB', 'LC', 'LI', 'LK', 'LR', 'LS', 'LT', 'LV', 'LY',
    'MA', 'MC', 'MD', 'ME', 'MF', 'MG', 'MH', 'MK', 'ML', 'MM', 'MN', 'MO', 'MP', 'MQ', 'MR', 'MS', 'MT', 'MU', 'MV', 'MW', 'MX', 'MY', 'MZ',
    'NC', 'NF', 'NG', 'NI', 'NL', 'NO', 'NP', 'NR', 'NU', 'NZ',
    'OM',
    'PA', 'PE', 'PF', 'PG', 'PH', 'PK', 'PL', 'PM', 'PN', 'PR', 'PS', 'PT', 'PW', 'PY',
    'QA',
    'RE', 'RO', 'RS', 'RU', 'RW',
    'SA', 'SB', 'SC', 'SD', 'SE', 'SI', 'SJ', 'SK', 'SL', 'SM', 'SN', 'SR', 'SS', 'ST', 'SV', 'SX', 'SY',
    'TC', 'TD', 'TF', 'TH', 'TJ', 'TK', 'TL', 'TM', 'TN', 'TO', 'TR', 'TT', 'TV', 'TW', 'TZ',
    'UA', 'UG', 'UM', 'US', 'UY', 'UZ',
    'VA', 'VC', 'VE', 'VG', 'VI', 'VN', 'VU',
    'WF', 'WS',
    'YE', 'YT',
    'ZA', 'ZM', 'ZW',
    'XK', 'FRA', 'USA', 'UK', 'BGR', 'BIH', 'NA', 'XZ', 'CHN', 'DEU'
]

additional_foreign_indicators = [
    'Afrique du Sud', 'Albanie', 'Algérie', 'Allemagne', 'Andorre', 'Angola', 'Arabie saoudite',
    'Argentine', 'Arménie', 'Australie', 'Austriche', 'Azerbaïdjan', 'Bahamas', 'Bahreïn',
    'Bangladesh', 'Barbade', 'Belgique', 'Bénin', 'Bhoutan', 'Biélorussie', 'Birmanie', 'Bolivie',
    'Bosnie-Herzégovine', 'Botswana', 'Brésil', 'Brunei', 'Bulgarie', 'Burkina Faso', 'Burundi',
    'Cambodge', 'Cameroun', 'Canada', 'Cap-Vert', 'Chili', 'Chine', 'Chypre', 'Colombie',
    'Comores', 'Congo-Brazzaville', 'Congo-Kinshasa', 'Corée du Nord', 'Corée du Sud',
    'Costa Rica', "Côte d'Ivoire", 'Croatie', 'Cuba', 'Danemark', 'Djibouti', 'Dominique',
    'Égypte', 'Émirats arabes unis', 'Équateur', 'Érythrée', 'Espagne', 'Estonie', 'États-Unis',
    'Éthiopie', 'Finlande', 'France', 'Gabon', 'Gambie', 'Géorgie', 'Ghana', 'Grèce',
    'Grenade', 'Guatemala', 'Guinée', 'Guinée-Bissau', 'Guinée équatoriale', 'Guyana', 'Haïti',
    'Honduras', 'Hongrie', 'Inde', 'Indonésie', 'Irak', 'Iran', 'Irlande', 'Islande', 'Israël',
    'Italie', 'Jamaïque', 'Japon', 'Jordanie', 'Kazakhstan', 'Kenya', 'Kirghizistan', 'Kiribati',
    'Kosovo', 'Koweït', 'Laos', 'Lesotho', 'Lettonie', 'Liban', 'Libéria', 'Libye',
    'Liechtenstein', 'Lituanie', 'Luxembourg', 'Macédoine du Nord', 'Madagascar', 'Malaisie',
    'Malawi', 'Maldives', 'Mali', 'Malte', 'Maroc', 'Marshall', 'Maurice', 'Mauritanie',
    'Mexique', 'Micronésie', 'Moldavie', 'Monaco', 'Mongolie', 'Monténégro', 'Mozambique',
    'Namibie', 'Nauru', 'Népal', 'Nicaragua', 'Niger', 'Nigeria', 'Norvège', 'Nouvelle-Zélande',
    'Oman', 'Ouganda', 'Ouzbékistan', 'Pakistan', 'Palaos', 'Palestine', 'Panama',
    'Papouasie-Nouvelle-Guinée', 'Paraguay', 'Pays-Bas', 'Pérou', 'Philippines', 'Pologne',
    'Portugal', 'Qatar', 'République centrafricaine', 'République démocratique du Congo',
    'République dominicaine', 'République tchèque', 'Roumanie', 'Royaume-Uni', 'Russie', 'Rwanda',
    'Saint-Kitts-et-Nevis', 'Saint-Vincent-et-les-Grenadines', 'Sainte-Lucie', 'Saint-Marin',
    'Salomon', 'Salvador', 'Samoa', 'São Tomé-et-Principe', 'Sénégal', 'Serbie', 'Seychelles',
    'Sierra Leone', 'Singapour', 'Slovaquie', 'Slovénie', 'Somalie', 'Soudan', 'Soudan du Sud',
    'Sri Lanka', 'Suède', 'Suisse', 'Suriname', 'Syrie', 'Tadjikistan', 'Tanzanie', 'Tchad',
    'Thaïlande', 'Timor oriental', 'Togo', 'Tonga', 'Trinité-et-Tobago', 'Tunisie', 'Turkménistan',
    'Turquie', 'Tuvalu', 'Ukraine', 'Uruguay', 'Vanuatu', 'Vatican', 'Venezuela', 'Viêt Nam',
    'Yémen', 'Zambie', 'Zimbabwe'
]

false_positives = [
    'Taggia',
    'Eaubonne',
    'Montanay',
    'Avermes',
    'Serraval',
    'Fegersheim',
    'Bassens',
    'Ecully',
    'Lutterbach',
    'Eschen',
    'La Rochelle',
    'Porto',
    'Châtel',
    'Buc',
    'Alès',
    'Sindelfingen',
    'Champagnole',
    'Berlin',
    'Saint-Ismier',
    'Vers',
    'Luze',
    'Les Clefs',
]

common_mistakes = {
    'Guemligen': 'Muri bei Bern',
    'Sagno': 'Breggia',
    'Edingen': 'Endingen',
    'Ittingen': 'Ittigen',
    'Ebmatingen': 'Maur',
}

In [57]:
config = {
    'foreign_country_codes': foreign_country_codes,
    'additional_foreign_indicators': additional_foreign_indicators,
    'false_positives': false_positives,
    'common_mistakes': common_mistakes
}

In [None]:
class MunicipalityMatcher:
    def __init__(self, official_municipalities: pd.DataFrame, id_col: str, name_col: str, config: dict):
        """
        Initialize with official municipality data
        
        Args:
            official_municipalities: DataFrame with columns ['id', 'name']
        """
        assert id_col in official_municipalities.columns, "The id columns is invalide, please provide the correct column name!"
        assert name_col in official_municipalities.columns, "The name columns is invalide, please provide the correct column name!"

        self.officials = official_municipalities.rename(columns={id_col: 'matched_id', name_col: 'matched_name'})
        self.foreign_country_codes = {code.lower(): (-1, None, 1.0) for code in config.get('foreign_country_codes', [])}
        self.additional_foreign_indicators = {self.normalize_text(name): (-1, None, 1.0)  for name in config.get('additional_foreign_indicators', [])}
        self.false_positives = set([self.normalize_text(name) for name in config.get('false_positives', [])])
        self.common_mistakes = {self.normalize_text(k): self.normalize_text(v) for k, v in config.get('common_mistakes', {}).items()}
        
        self.preprocess_officials()
        
    def preprocess_officials(self):
        """Preprocess official names for better matching"""
        # Create normalized versions
        self.officials['normalized'] = self.officials['matched_name'].apply(self.normalize_text)
        self.officials['no_brackets'] = self.officials['normalized'].str.replace(r'\(|\)', '', regex=True)
        self.officials['confidence'] = 1.0
        
        # Create exact match lookup dictionaries
        self.exact_matches = dict(zip(self.officials['no_brackets'], zip(self.officials['matched_id'], self.officials['matched_name'], self.officials['confidence'])))

        # Create TF-IDF matrix for official names
        self.tfidf = TfidfVectorizer(analyzer='char', ngram_range=(2, 3))
        self.tfidf_matrix = self.tfidf.fit_transform(self.officials['normalized'])
        
        # Create ngram index
        self.ngram_index = self.create_ngram_index(self.officials['normalized'])

    @staticmethod
    def normalize_text(text):
        """
        Enhanced text normalization for German, French, and Italian characters
        """
        while text.count('(') > text.count(')'):  # Check if there is a missing closing bracket
            text += ')'
        text = re.sub(r'(?<!\s)\(', ' (', text)  # Adds whitespace before '(' if there is none
        text = re.sub(r'\b([A-Z]{2})\b', r'(\1)', text)  # ZH -> (ZH), (ZH ...)-> ((ZH) ...), (... ZH) -> (... (ZH)), (ZH) -> ((ZH)), 
        text = re.sub(r'\((\([A-Z]{2}\))\)', r'\1', text)  # Turns ((ZH)) back into (ZH)
        text = re.sub(r'^à\s+', '', text)  # Removes à at the beginning of a string
        text = re.sub(r'[^)\w\s]+$', '', text)  # Removes any trailing punctuation
        text = re.sub(r'\s+b$', '', text)  # Removes any trailing b

        # Convert to lowercase
        text = text.lower()

        # Replace German umlaute
        text = text.replace('ä', 'ae').replace('ö', 'oe').replace('ü', 'ue')

        # Remove accents and special characters (except those handled manually above)
        text = ''.join(
            c for c in unicodedata.normalize('NFD', text)
            if unicodedata.category(c) != 'Mn'
        )

        # Replace Abbreviations and hyphen
        replacements = {            
            ' b.': ' bei', 's.': 'san', ' v. d.': ' von der', ' a. d.': ' an der', ' a.': ' am',
            ' u.': ' und', ' z.': ' zur', 'st-': 'saint-', 'dev-': 'devant-', ' avec': '',
            "'": " ", "-": " ",
        }

        # Apply replacements
        for old, new in replacements.items():
            text = text.replace(old, new)

        # Remove any remaining non-English letters and special characters
        text = re.sub(r'[^a-z\s()]', '', text)
        
        # Remove multiple spaces
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()
    
    @staticmethod
    def create_ngram_index(texts, n=3):
        """Create ngram index for faster initial filtering"""
        index = defaultdict(set)
        for idx, text in enumerate(texts):
            ngrams = {text[i:i+n] for i in range(len(text)-n+1)}
            for ngram in ngrams:
                index[ngram].add(idx)
        return index
    
    def get_candidates(self, query_normalized, threshold=0.3):
        """Get candidate matches using ngram filtering"""
        query_ngrams = {query_normalized[i:i+3] for i in range(len(query_normalized)-2)}
        
        # Get indices of candidates that share ngrams
        candidate_indices = set()
        for ngram in query_ngrams:
            candidate_indices.update(self.ngram_index.get(ngram, set()))
            
        # Filter candidates by quick ratio threshold
        candidates = self.officials.iloc[list(candidate_indices)]
        candidates = candidates[
            candidates['normalized'].apply(
                lambda x: rfuzz.QRatio(query_normalized, x) > threshold
            )
        ]
        return candidates
    
    def match_name(self, query_normalized, threshold=0.85):
        """
        Match a single name against official municipalities using multiple techniques
        
        Returns:
            tuple: (best_match_id, best_match_name, confidence_score)
        """
        # Fix common mistakes
        query_normalized = self.common_mistakes.get(query_normalized, query_normalized)

        # If there are brackets, try to find an exact match
        matches = re.findall(r'\((.*?)\)', query_normalized)
        if matches:
            # First try to find a match from the values contained in brackets
            for match in matches:
                stripped_match = match.replace('(', '').replace(')', '')
                for lookup in [self.foreign_country_codes, self.additional_foreign_indicators, self.exact_matches]:
                    result = lookup.get(stripped_match)
                    if result:
                        return result

            # Else try to find exact match for the parts not in brackets
            remaining_part = re.sub(r'\(.*?\)', '', query_normalized).strip()
            result = self.exact_matches.get(remaining_part)
            if result:
                return result

        # If no exact match, proceed with fuzzy matching
        candidates = self.get_candidates(query_normalized)
        if len(candidates) == 0:
            return None, None, 0.0

        # Calculate various similarity scores
        scores = []
        for _, candidate in candidates.iterrows():
            # TF-IDF cosine similarity
            query_tfidf = self.tfidf.transform([query_normalized])
            tfidf_sim = cosine_similarity(
                query_tfidf, 
                self.tfidf_matrix[candidate.name]
            )[0][0]
            
            # Levenshtein ratio
            lev_ratio = fuzz.ratio(query_normalized, candidate['normalized']) / 100
            
            # Partial ratio
            part_ratio = fuzz.partial_ratio(query_normalized, candidate['normalized']) / 100

            # Token sort ratio (handles word reordering)
            token_sort = fuzz.token_sort_ratio(
                query_normalized, 
                candidate['normalized']
            ) / 100
            
            # Jaro-Winkler similarity (gives more weight to matching prefixes)
            jaro_sim = jellyfish.jaro_winkler_similarity(
                query_normalized, 
                candidate['normalized']
            )
            
            # Combine scores with weights
            combined_score = (
                0.4 * tfidf_sim +
                0.3 * lev_ratio +
                0.1 * part_ratio +
                0.1 * token_sort +
                0.1 * jaro_sim
            )
            
            scores.append((
                candidate['matched_id'],
                candidate['matched_name'],
                combined_score
            ))
        
        # Get best match
        best_match = max(scores, key=lambda x: x[2])

        # Return None if below threshold
        if best_match[2] < threshold:
            if any(code in query_normalized for code in ['(fr)', '(lu)', '(be)', '(gr)', '(ar)', '(ge)', '(sg)']):
                return -1, None, 1.0
            else:
                return None, None, 0.0
        elif (best_match[2] >= threshold) and (best_match[1] in self.false_positives):
            return -1, None, 1.0
        else:
            return best_match

    def match_dataframe(self, query_df, query_column, threshold=0.85):
        """Match multiple names in parallel"""
        # Try exact matches via normalized names
        query_df['normalized'] = query_df[query_column].apply(self.normalize_text)
        query_df = query_df.drop_duplicates(subset=['normalized'])

        merged_df = query_df.merge(self.officials, on='normalized', how='left')

        exact_matches = merged_df[~merged_df.matched_id.isna()].copy()
        exact_matches['matched_name'] = exact_matches.normalized
        print(f"Found {len(exact_matches)} exact matches!")

        # Apply advanced matching to unmatched entries
        no_matches = merged_df[merged_df.matched_id.isna()][[query_column, 'normalized']].copy()

        # Apply the function
        def match_apply(row):
            match = self.match_name(row.normalized, threshold)
            return pd.Series({'matched_id': match[0], 'matched_name': match[1], 'confidence': match[2]})
    
        tqdm.pandas(desc=f'Matching {len(no_matches)} names')
        no_matches[['matched_id', 'matched_name', 'confidence']] = no_matches.progress_apply(match_apply, axis=1)

        return pd.concat([exact_matches, no_matches], axis=0)

In [63]:
official_data = pd.read_csv('./official_bfs_gemeinden_2010-2024.csv')

residence_names = pd.read_csv('./data-1730366425179.csv', encoding='utf-8')
residence_names['residence'] = residence_names.residence.str.split(r'\bund\b|\bet\b|\be\b').explode('residence').reset_index(drop=True)

hometown_names = pd.read_csv('./data-1730366364994.csv', encoding='utf-8')
hometown_names['hometown'] = hometown_names.hometown.str.split(r'\bund\b|\bet\b|\be\b').explode('hometown').reset_index(drop=True)

In [64]:
matcher = MunicipalityMatcher(official_data, id_col='bfs_nr', name_col='gmde_name', config=config)

In [65]:
results_df = matcher.match_dataframe(residence_names, 'residence')

Found 2420 exact matches!


Matching 16596 names: 100%|██████████| 16596/16596 [12:48<00:00, 21.61it/s]


In [66]:
results_df.sort_values('confidence').to_excel('residence_test.xlsx', index=False)

In [144]:
test1 = 'Sulz ZH (Rickenbach ZH)'
test2 = 'Muehledorf BE (Kirchdorf (BE))'

In [146]:
norm = matcher.normalize_text(test2)
print(norm)
matcher.match_name(norm)

muehledorf (be) (kirchdorf (be))
be
kirchdorf be


('872', 'Kirchdorf (BE)', 1.0)

In [68]:
gmdstde = [
    'gemeindestand_2010-01-01.csv', 'gemeindestand_2010-04-25.csv', 'gemeindestand_2010-11-21.csv',
    'gemeindestand_2011-01-01.csv', 'gemeindestand_2011-07-01.csv',
    'gemeindestand_2012-01-01.csv', 'gemeindestand_2012-04-01.csv',
    'gemeindestand_2013-01-01.csv', 'gemeindestand_2013-04-14.csv',
    'gemeindestand_2014-01-01.csv', 'gemeindestand_2014-05-01.csv',
    'gemeindestand_2015-01-01.csv',
    'gemeindestand_2016-01-01.csv', 'gemeindestand_2016-04-10.csv', 'gemeindestand_2016-07-01.csv', 
    'gemeindestand_2017-01-01.csv', 'gemeindestand_2017-04-02.csv',
    'gemeindestand_2018-01-01.csv', 'gemeindestand_2018-04-01.csv',
    'gemeindestand_2019-01-01.csv',
    'gemeindestand_2020-01-01.csv', 'gemeindestand_2020-10-01.csv', 'gemeindestand_2020-10-17.csv', 'gemeindestand_2020-10-18.csv',
    'gemeindestand_2021-01-01.csv', 'gemeindestand_2021-04-18.csv', 'gemeindestand_2021-07-01.csv', 
    'gemeindestand_2022-01-01.csv', 'gemeindestand_2022-04-10.csv', 'gemeindestand_2022-05-01.csv',
    'gemeindestand_2023-01-01.csv', 'gemeindestand_2024-01-01.csv', 'gemeindestand_2024-03-01.csv'
]

base_df = pd.read_csv('../utils/data/snapshots/gemeindestand_2010-01-01.csv')
base_df['stand'] = '2010-01-01'

for s in gmdstde[1:]:
    df = pd.read_csv(f'../utils/data/snapshots/{s}')
    df['stand'] = s[14:-4]
    base_df = pd.concat([base_df, df])
    base_df = base_df.sort_values(['Name_de', 'stand'])
    base_df = base_df.drop_duplicates(subset=['Name_de'], keep='last')

In [69]:
base_df = base_df[['Name_de', 'Identifier', 'stand']].rename(columns={'Identifier': 'bfs_nr', 'Name_de': 'gmde_name'})

In [70]:
ambiguous = base_df[base_df.gmde_name.str.contains(r'\(')].sort_values('gmde_name')
ambiguous['gmde_name'] = ambiguous.gmde_name.apply(lambda x: re.sub(r'\(.*?\)', '', x).strip())
ambiguous['bfs_nr'] = ambiguous['bfs_nr'].astype(str)

grouped_df = ambiguous.groupby('gmde_name').agg({'bfs_nr': lambda x: ', '.join(x.tolist()), 'stand': lambda x: ', '.join(x.tolist())}).reset_index()

In [71]:
base_df['bfs_nr'] = base_df['bfs_nr'].astype(str)
base_df = pd.concat([base_df, grouped_df])

In [77]:
base_df.to_csv('official_bfs_gemeinden_2010-2024.csv', index=False)

In [75]:
base_df.gmde_name.nunique()

2826

In [73]:
base_df[base_df.duplicated('gmde_name', keep=False)]

Unnamed: 0,gmde_name,bfs_nr,stand


In [74]:
base_df

Unnamed: 0,gmde_name,bfs_nr,stand
1455,Aadorf,4551,2024-03-01
1400,Aarau,4001,2024-03-01
219,Aarberg,301,2024-03-01
1225,Aarburg,4271,2024-03-01
261,Aarwangen,321,2024-03-01
...,...,...,...
139,Wiler,6202,2024-03-01
140,Wisen,2502,2024-03-01
141,Wohlen,4082,2024-03-01
142,Zell,"1150, 231","2024-03-01, 2024-03-01"


In [39]:
base_df

Unnamed: 0,Identifier,Level,Parent,Name_en,Name_fr,Name_de,Name_it,stand
13,1,3,101.0,Aeugst am Albis,Aeugst am Albis,Aeugst am Albis,Aeugst am Albis,2024-03-01
1,2,3,101.0,Affoltern am Albis,Affoltern am Albis,Affoltern am Albis,Affoltern am Albis,2024-03-01
2,3,3,101.0,Bonstetten,Bonstetten,Bonstetten,Bonstetten,2024-03-01
3,4,3,101.0,Hausen am Albis,Hausen am Albis,Hausen am Albis,Hausen am Albis,2024-03-01
4,5,3,101.0,Hedingen,Hedingen,Hedingen,Hedingen,2024-03-01
...,...,...,...,...,...,...,...,...
2124,6808,3,2603.0,Clos du Doubs,Clos du Doubs,Clos du Doubs,Clos du Doubs,2024-03-01
2128,6809,3,2603.0,Haute-Ajoie,Haute-Ajoie,Haute-Ajoie,Haute-Ajoie,2024-03-01
2125,6810,3,2603.0,La Baroche,La Baroche,La Baroche,La Baroche,2024-03-01
2129,6811,3,2603.0,Damphreux-Lugnez,Damphreux-Lugnez,Damphreux-Lugnez,Damphreux-Lugnez,2024-03-01


In [38]:
df

Unnamed: 0,bfs_nr,name,stand
0,1,Aeugst am Albis,2024-03-01
1,2,Affoltern am Albis,2024-03-01
2,3,Bonstetten,2024-03-01
3,4,Hausen am Albis,2024-03-01
4,5,Hedingen,2024-03-01
...,...,...,...
2684,6808,Clos du Doubs,2024-03-01
2685,6809,Haute-Ajoie,2024-03-01
2686,6810,La Baroche,2024-03-01
2687,6811,Damphreux-Lugnez,2024-03-01


In [19]:
results_df = matcher.match_dataframe(residence_names, 'residence', threshold=0.5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exact_matches['matched_name'] = exact_matches.normalized


Found 2384 exact matches!


Matching 16750 names: 100%|██████████| 16750/16750 [27:42<00:00, 10.07it/s] 


In [20]:
name = "Ruethi (Rheintal) (Ruethi (SG))"
matcher.match_name(matcher.normalize_text(name), 0)

(3256, 'Rüthi (Rheintal)', np.float64(0.864693185848787))

In [21]:
results_df.sort_values('confidence').to_excel('residence_test.xlsx', index=False)

In [60]:
results_df.to_excel('residence_test.xlsx', index=False)

In [62]:
results_df[(results_df.confidence < 1.0) & (results_df.confidence > 0.0)]

Unnamed: 0,query,matched_id,matched_name,confidence
152,Gurbrue,665.0,Gurbrü,0.933121
168,Estvayer,2054.0,Estavayer,0.871452
252,Stuesslingen,2499.0,Stüsslingen,0.910276
282,Grueningen,116.0,Grüningen,0.856148
314,Niederoenz,982.0,Niederönz,0.882293
...,...,...,...,...
19760,Kleinboesingen,2266.0,Kleinbösingen,0.897775
19856,Stocken-Hoefen,770.0,Stocken-Höfen,0.935990
19858,Roemerswil,1039.0,Römerswil,0.859681
19859,Muenchenstein,2810.0,Münchenstein,0.876409


In [63]:
results_df[(results_df.confidence == 1.0)]

Unnamed: 0,query,matched_id,matched_name,confidence
0,Bottens,5514.0,Bottens,1.0
7,Payerne,5822.0,Payerne,1.0
9,Vevey,5890.0,Vevey,1.0
11,Obergerlafingen,2528.0,Obergerlafingen,1.0
17,Hausen AG,4100.0,Hausen (AG),1.0
...,...,...,...,...
20136,Bussigny-près-Lausanne,5624.0,Bussigny-près-Lausanne,1.0
20162,Wenslingen,2865.0,Wenslingen,1.0
20169,Heremence,6084.0,Hérémence,1.0
20170,Prilly,5589.0,Prilly,1.0


In [64]:
results_df[(results_df.confidence == 0.0)]

Unnamed: 0,query,matched_id,matched_name,confidence
1,Gotha (DE),,,0.0
2,Villorba (IT),,,0.0
3,Frages,,,0.0
4,Tschlin (Valsot),,,0.0
5,Ruemingen (DE),,,0.0
...,...,...,...,...
20171,Curbar (GB),,,0.0
20172,Monguzzo (IT),,,0.0
20173,Tumbaco (Pichincha-EC),,,0.0
20174,Dettighofen (DE),,,0.0
