In [1]:
import os
import json
import base64
import numpy as np
import pandas as pd
import geopandas as gpd
from typing import Optional
from pyiceberg.expressions import And, GreaterThanOrEqual, LessThanOrEqual
from pyiceberg.catalog import load_catalog

In [3]:
import numpy as np
import geopandas as gpd
from scipy.spatial import cKDTree

def search_spatial_candidates(
    reference_gdf: gpd.GeoDataFrame,
    compared_gdf: gpd.GeoDataFrame,
    k: int = 100,
    max_dist: float = 1000, 
    id_col: str = "id",
    crs_for_distance: int = 3857,
):
    """
    Attach k nearest compared POI ids & distances to reference_gdf.

    Returns
    -------
    GeoDataFrame with two new columns:
    - cand_ids   : list of compared ids
    - cand_dist_m: list of distances (meters)
    """

    ref_proj = reference_gdf.to_crs(crs_for_distance)
    cmp_proj = compared_gdf.to_crs(crs_for_distance)

    ref_xy = np.column_stack([ref_proj.geometry.x, ref_proj.geometry.y])
    cmp_xy = np.column_stack([cmp_proj.geometry.x, cmp_proj.geometry.y])

    tree = cKDTree(cmp_xy)
    k_eff = min(k, len(compared_gdf))

    dist, idx = tree.query(ref_xy, k=k_eff)

    if k_eff == 1:
        dist = dist.reshape(-1, 1)
        idx = idx.reshape(-1, 1)

    cmp_ids = compared_gdf[id_col].to_numpy()

    cand_ids = []
    cand_dists = []

    for row_idx, row_dist in zip(idx, dist):
        ids = []
        dists = []

        for j, d in zip(row_idx, row_dist):
            if d <= max_dist:
                ids.append(cmp_ids[j])
                dists.append(d)

        cand_ids.append(ids)
        cand_dists.append(dists)

    result = reference_gdf.copy()
    result["cand_ids"] = cand_ids
    result["cand_dist_m"] = cand_dists

    return result

In [4]:
FOOD_WORDS = {
    "RESTAURANT","RESTURANT","RESTARAUNT",
    "CAFE","CAFÉ","COFFEE","BAR","BISTRO","GRILL",
    "KITCHEN","DINER","EATERY","STEAKHOUSE",
    "SEAFOOD","BUFFET","BBQ","PIZZA","SUSHI","RAMEN",
    "NOODLE","NOODLES","BURGER","BURGERS","TACO","TACOS",
    "CHICKEN","WINGS","BAKERY","DELI","DELICATESSEN",
    "COURT","FOOD","EXPRESS","HOUSE","SHOP"
}

PLACE_WORDS = {
    "HALL","CENTER","CENTRE","PLAZA","MARKET","MALL",
    "GARDEN","GARDENS","PARK","SQUARE","TOWER","TOWERS",
    "STATION","TERMINAL","BUILDING","GALLERY","THEATER","SCHOOL","COURT","INN",
    "HOTEL","MOTEL","INN","SUITES","SUITE",
    "SPA","SALON","STUDIO","CENTER","CENTRE",
    "CLUB","LOUNGE","STATION","STOP"
}

LEGAL_WORDS = {
    "LLC","INC","CORP","CORPORATION","CO","COMPANY",
    "LTD","LIMITED","GROUP","HOLDINGS","OFFICE"
}

GRAMMAR = {
    "THE","OF","AT","AND","FOR","IN","ON","BY","WITH","TO","FROM"
}

NON_PRIMARY_TOKENS = FOOD_WORDS | PLACE_WORDS | LEGAL_WORDS | GRAMMAR

In [5]:
from rapidfuzz import process, fuzz
import pandas as pd
import re
import unicodedata


def clean_name(s):
    if not isinstance(s, str):
        return ""

    s = unicodedata.normalize("NFKD", s)
    s = "".join(c for c in s if not unicodedata.combining(c)) # 1. unicode normalize (remove accents)
    s = s.upper() # 2. uppercase
    s = re.sub(r"\([^)]*\)", "", s) 
    s = re.sub(r"\b'S\b", "", s) # new change
    s = re.sub(r"\bS\b", "", s) # new change
    s = s.encode("ascii", errors="ignore").decode() # 4. remove emoji / non ascii
    s = re.sub(r"[^\w\s]", " ", s) # 5. replace special chars with space
    s = re.sub(r"\s+", " ", s) # 6. collapse spaces

    return s.strip()

def extract_prinmary_str(name):

    tokens = name.split()
    core = [t for t in tokens if t not in NON_PRIMARY_TOKENS]
    if len(core) == 1 and len(core[0]) < 3:
        return name
    if core:
        return " ".join(core)
    return name

def match_by_name(
    reference_gdf: gpd.GeoDataFrame,
    compared_gdf: gpd.GeoDataFrame,
    re_name_col: str = "name",
    comp_name_col: str = "name",
    comp_id: str = "id",
    comp_id_col: str="cat_main",
    threshold: int = 80,
):
    """
    Perform WRatio name matching within spatial candidates.

    Returns
    -------
    GeoDataFrame with:
    - matched_id_name
    - name_score
    """

    # clean names for matching
    id_to_name_clean = compared_gdf.set_index(comp_id)[comp_name_col].apply(clean_name).apply(extract_prinmary_str).to_dict()
    # raw names for storage
    id_to_name_raw = compared_gdf.set_index(comp_id)[comp_name_col].to_dict()
    # raw compared df category
    id_to_cat = compared_gdf.set_index(comp_id)[comp_id_col].to_dict()

    matched_ids = []
    scores = []
    loc_dists = []
    matched_names = []
    matched_cats = []

    for _, row in reference_gdf.iterrows():
        query = extract_prinmary_str(clean_name(row.get(re_name_col)))

        if not isinstance(query, str) or not row["cand_ids"]:
            matched_ids.append(pd.NA)
            scores.append(pd.NA)
            loc_dists.append(pd.NA)
            matched_names.append(pd.NA)
            matched_cats.append(pd.NA)
            continue

        cand_names = [id_to_name_clean.get(cid, "") for cid in row["cand_ids"]]

        top_matches = process.extract(
            query,
            cand_names,
            scorer=fuzz.WRatio,
            limit=5
        )

        best_score = -1
        best_pos = None

        for name, wr, pos in top_matches:

            pr = fuzz.partial_ratio(query, name)
            ts = fuzz.token_sort_ratio(query, name)
            tset = fuzz.token_set_ratio(query, name)

            combined = max(wr, pr, ts, tset)

            if combined > best_score:
                best_score = combined
                best_pos = pos

        score = best_score

        if score >= threshold:
            matched_ids.append(row["cand_ids"][best_pos])
            scores.append(score)
            loc_dists.append(row["cand_dist_m"][best_pos])
            matched_names.append(id_to_name_raw.get(row["cand_ids"][best_pos]))
            matched_cats.append(id_to_cat.get(row["cand_ids"][best_pos]))
        else:
            matched_ids.append(pd.NA)
            scores.append(score)
            loc_dists.append(pd.NA)
            matched_names.append(pd.NA)
            matched_cats.append(pd.NA)


    result = reference_gdf.copy()
    result["matched_id"] = matched_ids
    result["name_score"] = scores
    result["location_distance"] = loc_dists
    result["matched_name"] = matched_names
    result["matched_cat_main"] = matched_cats

    return result

In [6]:
import pandas as pd
import geopandas as gpd
from rapidfuzz import fuzz
import re
import unicodedata

def clean_name(s):
    if not isinstance(s, str):
        return ""

    s = unicodedata.normalize("NFKD", s)
    s = "".join(c for c in s if not unicodedata.combining(c)) # 1. unicode normalize (remove accents)
    s = s.upper() # 2. uppercase
    s = re.sub(r"\([^)]*\)", "", s) 
    s = s.encode("ascii", errors="ignore").decode() # 4. remove emoji / non ascii
    s = re.sub(r"[^\w\s]", " ", s) # 5. replace special chars with space
    s = re.sub(r"\s+", " ", s) # 6. collapse spaces

    return s.strip()

def address_score_check(
    reference_gdf: gpd.GeoDataFrame,
    compared_gdf: gpd.GeoDataFrame,
    addr_col_ref: str = "addr_simple",
    addr_col_cmp: str = "address",
    matched_id_col: str = "matched_id",
    id_col: str = "id",
    out_col: str = "address_score",
    out_addr_col: str = "matched_address", 
):
    """
    Compute address similarity score (0-100) for already-matched rows.

    Only rows with non-null `matched_id_col` will be scored.
    Others will have NaN.

    Returns
    -------
    GeoDataFrame with a new column `out_col`.
    """

    # map: compared id -> compared address
    id_to_addr_clean = compared_gdf.set_index(id_col)[addr_col_cmp].apply(clean_name).to_dict()
    id_to_addr_raw = compared_gdf.set_index(id_col)[addr_col_cmp].to_dict()

    scores = []
    matched_addrs = []

    for _, row in reference_gdf.iterrows():
        matched_id = row.get(matched_id_col)

        # no matched id -> no score
        if pd.isna(matched_id):
            scores.append(pd.NA)
            matched_addrs.append(pd.NA)
            continue

        addr_ref = clean_name(row.get(addr_col_ref))
        addr_cmp = id_to_addr_clean.get(matched_id)

        if isinstance(addr_cmp, str) and addr_cmp.strip():
            matched_addrs.append(id_to_addr_raw.get(matched_id))
        else:
            matched_addrs.append(pd.NA)

        # missing address on either side -> no score
        if not isinstance(addr_ref, str) or not addr_ref.strip():
            scores.append(pd.NA)
            continue
        if not isinstance(addr_cmp, str) or not addr_cmp.strip():
            scores.append(pd.NA)
            continue

        wr = fuzz.WRatio(addr_ref, addr_cmp)
        pr = fuzz.partial_ratio(addr_ref, addr_cmp)
        ts = fuzz.token_sort_ratio(addr_ref, addr_cmp)
        tset = fuzz.token_set_ratio(addr_ref, addr_cmp)

        scores.append(int(max(wr, pr, ts, tset)))

    result = reference_gdf.copy()
    result[out_col] = scores
    result[out_addr_col] = matched_addrs
    return result

In [7]:
import pandas as pd
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def calculate_similarity_check(
    df, 
    cat_col_ref: str = "primary_type", 
    cat_col_cmp: str = "matched_cat_main", 
    id_col: str = "matched_id", 
    result_col: str =  "category_sim",
):

    # 1. Setup Device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    model = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    
    # 2. Primary Gatekeeper: matched_id must be present
    mask = df[id_col].notna() & (df[id_col].astype(str).str.strip() != "")
    
    # 3. Create a helper to handle potential Nulls in text columns
    temp_df = df[mask].copy()
    
    # Identify where text is actually missing within the masked rows
    text_missing_mask = temp_df[cat_col_ref].isna() | temp_df[cat_col_cmp].isna()
    
    # Fill NaNs with empty strings just for the encoding step
    texts_1 = temp_df[cat_col_ref].fillna("").astype(str).tolist()
    texts_2 = temp_df[cat_col_cmp].fillna("").astype(str).tolist()

    print(f"Encoding {len(temp_df)} rows...")

    # 4. Batch Encoding
    emb1 = model.encode(texts_1, batch_size=256, show_progress_bar=True, convert_to_tensor=True)
    emb2 = model.encode(texts_2, batch_size=256, show_progress_bar=True, convert_to_tensor=True)

    # 5. Calculate Similarity
    scores = torch.nn.functional.cosine_similarity(emb1, emb2, dim=1).cpu().numpy()
    
    # 6. Apply NaN to rows where text was missing
    # Even if we encoded an empty string, the result isn't "real" if data was missing
    scores[text_missing_mask.values] = np.nan

    # 7. Map back to original dataframe
    df[result_col] = np.nan
    df.loc[mask, result_col] = scores
    
    return df

In [8]:
bsop_gplc = gpd.read_file('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/BSOP_MSA/bspo_google_place_5000/google_placescat_5000_clean.geojson')
google_naics_mapping = pd.read_csv('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/mapping_google_naics.csv')
bsop_gplc = bsop_gplc.merge(google_naics_mapping[['SubCategory','naics_code','naics_definition']], left_on = 'primary_type', right_on='SubCategory', how="left")
bsop_gplc['addr_simple'] = bsop_gplc['address'].str.split(',', n=1).str[0]
bsop_gplc = bsop_gplc.drop(columns=['SubCategory'])
bsop_gplc

Unnamed: 0,circle_id,id,name,address,primary_type,lat,lon,primary_cat,geometry,naics_code,naics_definition,addr_simple
0,0,ChIJwTqOxGO1kVQR3ffcwUqPDTc,Safeway Fuel Station,"23961 WA-3, Belfair, WA 98528, USA",gas_station,47.453613,-122.824365,automotive,POINT (-122.82436 47.45361),44.0,Retail Trade,23961 WA-3
1,0,ChIJpZyerHu1kVQRAc6e7nFRTOA,Local Wrench,"23530 WA-3 suite a, Belfair, WA 98528, USA",car_repair,47.447925,-122.828294,automotive,POINT (-122.82829 47.44793),81.0,Other Services (except Public Administration),23530 WA-3 suite a
2,0,ChIJ8SIqjWO1kVQR2blnwrvt0C4,Valvoline Instant Oil Change,"23970 N WA-3, Belfair, WA 98528, USA",car_repair,47.453070,-122.822240,automotive,POINT (-122.82224 47.45307),81.0,Other Services (except Public Administration),23970 N WA-3
3,0,ChIJ4ZaLfGO1kVQRKZIPKm91JmA,Chevron Belfair,"23880 WA-3, Belfair, WA 98528, USA",gas_station,47.452338,-122.824376,automotive,POINT (-122.82438 47.45234),44.0,Retail Trade,23880 WA-3
4,0,ChIJOYTHKWO1kVQRRXwvopTyBVw,QFC Fuel Center,"201 WA-300, Belfair, WA 98528, USA",gas_station,47.453683,-122.827552,automotive,POINT (-122.82755 47.45368),44.0,Retail Trade,201 WA-300
...,...,...,...,...,...,...,...,...,...,...,...,...
4256,42,ChIJD6bA4-Xgj1QRg1RuvTgcXe0,Paradise Bay Rd & S Bay Lane,"Port Ludlow, WA 98365, USA",bus_stop,47.912121,-122.692577,transportation,POINT (-122.69258 47.91212),48.0,Transportation and Warehousing,Port Ludlow
4257,42,ChIJLah42szmj1QRZpfU3Vo2xTY,Oak Bay Rd & Opp Olympus Blvd,"Washington 98365, USA",bus_stop,47.941977,-122.688553,transportation,POINT (-122.68855 47.94198),48.0,Transportation and Warehousing,Washington 98365
4258,44,ChIJ0Y_p64MCkFQRyYXYqJflZI8,Scatchet Head Rd at Bailey Rd (SB),"Washington 98236, USA",bus_stop,47.936720,-122.410862,transportation,POINT (-122.41086 47.93672),48.0,Transportation and Warehousing,Washington 98236
4259,44,ChIJKW2_64MCkFQR4vXk6q4IizQ,Scatchet Head Rd at Bailey Rd (NB),"Washington 98236, USA",bus_stop,47.936706,-122.410775,transportation,POINT (-122.41078 47.93671),48.0,Transportation and Warehousing,Washington 98236


In [2]:
bspo_fsq_msa = gpd.read_file('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/BSOP_MSA/bspo msa/bspo_fsq.geojson')
bspo_fsq_msa["cat_str"] = bspo_fsq_msa["fsq_category_labels"].str[0]
cats = bspo_fsq_msa["cat_str"].str.split(" > ", expand=True)
bspo_fsq_msa["cat_main"] = cats[0]
bspo_fsq_msa["cat_alt"] = cats[1]
bspo_fsq_msa

Unnamed: 0,fsq_place_id,name,latitude,longitude,address,locality,region,postcode,admin_region,post_town,...,fsq_category_ids,fsq_category_labels,placemaker_url,unresolved_flags,geom,bbox,geometry,cat_str,cat_main,cat_alt
0,724a9faacd2047365e05d1a4,Olsons Landscaping,47.405761,-123.007494,1531 NE Tahuya River Rd,Tahuya,WA,98588,,,...,[63be6904847c3692a84b9b5b],[Business and Professional Services > Home Imp...,https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXsB6yWgpsEBHs+/41T1c,"{'xmin': '-123.0074943082061', 'ymin': '47.405...",POINT (-123.00749 47.40576),Business and Professional Services > Home Impr...,Business and Professional Services,Home Improvement Service
1,3ab619b0055d1f88bcc75630,Dennis A Bogue,47.406155,-123.007050,1521 NE Tahuya River Rd,Tahuya,WA,98588,,,...,[63be6904847c3692a84b9b35],[Business and Professional Services > Computer...,https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXsBzgYZuF0BHs/ziQMBV,"{'xmin': '-123.00704992416318', 'ymin': '47.40...",POINT (-123.00705 47.40615),Business and Professional Services > Computer ...,Business and Professional Services,Computer Repair Service
2,a63b48dacf584a090498750d,Tahuya Trails,47.428412,-123.023500,350 NE Dewatto Rd,Tahuya,WA,98588,,,...,[4f4528bc4b90abdf24c9de85],[Sports and Recreation],https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXsGBBkLUD0BHttY2s84U,"{'xmin': '-123.02350002790648', 'ymin': '47.42...",POINT (-123.02350 47.42841),Sports and Recreation,Sports and Recreation,
3,7cfba25f760e49a6b52327f5,Jamason Designs,47.436970,-123.027358,2021 NE Tee Lake Rd,Tahuya,WA,98588,,,...,[63be6904847c3692a84b9b90],[Business and Professional Services > Technolo...,https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXsHAOmpWaEBHt+6hiGez,"{'xmin': '-123.02735767731212', 'ymin': '47.43...",POINT (-123.02736 47.43697),Business and Professional Services > Technolog...,Business and Professional Services,Technology Business
4,51fdeb7b498e523e9fdb4602,Tee Lake,47.437026,-123.028631,2020 NE Tee Lake Rd,Tahuya,WA,98588,,,...,[4bf58dd8d48988d161941735],[Landmarks and Outdoors > Lake],https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXsHVGIn4DkBHt/B4VaB8,"{'xmin': '-123.02863133882076', 'ymin': '47.43...",POINT (-123.02863 47.43703),Landmarks and Outdoors > Lake,Landmarks and Outdoors,Lake
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19184,4f321c3c19836c91c7b6e940,Casual Gourmet,47.960930,-122.434539,3315 Wildes Rd,Clinton,WA,98236,,,...,[4d4b7105d754a06374d81259],[Dining and Drinking > Restaurant],https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXpvPfMroL0BH+v/Ajfud,"{'xmin': '-122.4345390302194', 'ymin': '47.960...",POINT (-122.43454 47.96093),Dining and Drinking > Restaurant,Dining and Drinking,Restaurant
19185,2440191f034842d39e37a640,Harper Design,47.970125,-122.451078,6616 Sills Rd,Clinton,WA,98236,,,...,[5fac002599ce226e27fe72e5],[Business and Professional Services > Architec...,https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXpzedz4a5UBH/C0Ms2U4,"{'xmin': '-122.45107823433825', 'ymin': '47.97...",POINT (-122.45108 47.97012),Business and Professional Services > Architect...,Business and Professional Services,Architecture Firm
19186,57a8a95b498e5952a80411bb,bayview pet resort,47.970645,-122.446232,2266 sills rd,Clinton,WA,98236,,,...,[4e52d2d203646f7c19daa8ae],[Community and Government > Animal Shelter],https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXpyPD2cCoUBH/D4VM7vV,"{'xmin': '-122.44623169955823', 'ymin': '47.97...",POINT (-122.44623 47.97064),Community and Government > Animal Shelter,Community and Government,Animal Shelter
19187,239f04a54ad54a253dfc45c1,Whidbey South Construction,47.972301,-122.445634,6574 Longwood Ln,Clinton,WA,98236,,,...,[5454144b498ec1f095bff2f2],[Retail > Construction Supplies Store],https://foursquare.com/placemakers/review-plac...,,AAAAAAHAXpyFRIh2F0BH/HRc0mot,"{'xmin': '-122.44563401533411', 'ymin': '47.97...",POINT (-122.44563 47.97230),Retail > Construction Supplies Store,Retail,Construction Supplies Store


In [36]:
bspo_gplc_fsq = search_spatial_candidates(reference_gdf=bsop_gplc, compared_gdf=bspo_fsq_msa, id_col = "fsq_place_id", k=100)

In [37]:
bspo_gplc_fsq = match_by_name(reference_gdf=bspo_gplc_fsq, compared_gdf=bspo_fsq_msa, re_name_col = "name", comp_name_col = "name", comp_id = "fsq_place_id", comp_id_col ="cat_alt",  threshold=80)

In [38]:
bspo_gplc_fsq = address_score_check(reference_gdf=bspo_gplc_fsq, compared_gdf=bspo_fsq_msa, addr_col_ref = "addr_simple", addr_col_cmp = "address", id_col = "fsq_place_id")

In [39]:
bspo_gplc_fsq = calculate_similarity_check(bspo_gplc_fsq, cat_col_ref = "primary_type", cat_col_cmp = "matched_cat_main", id_col = "matched_id", result_col =  "category_sim")

Encoding 2746 rows...


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [40]:
# transfer the X and Y into float type and deal with the address score
cols_to_fix = ['name_score', 'location_distance', 'address_score', 'category_sim']
for col in cols_to_fix:
    bspo_gplc_fsq[col] = pd.to_numeric(bspo_gplc_fsq[col], errors='coerce')

In [18]:
df = bspo_gplc_fsq[bspo_gplc_fsq["matched_id"].notna()].copy()

N = 2000

weights = df["primary_cat"].map(
    df["primary_cat"].value_counts(normalize=True)
)

df_sample = df.sample(
    n=N,
    weights=weights,
    random_state=42
)
df_sample[['id','name','addr_simple','naics_definition', 'matched_id','matched_name','matched_address','matched_cat_main','location_distance']].to_csv('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/bspo_gplc_fsq_2000_sample.csv', index=False)

In [42]:
df_sample_check = pd.read_csv('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/BSOP_MSA/bspo_google_comparison/bspo_gplc_fsq_2000_sample.csv')
df_sample_check = df_sample_check.drop(columns='location_distance')
df_sample_check = df_sample_check.merge(bspo_gplc_fsq[['id','name_score','location_distance','address_score','category_sim']], left_on="id", right_on="id", how="left")

In [43]:
df_sample_check['is_true'].value_counts()

is_true
1    1640
0     360
Name: count, dtype: int64

In [44]:
df = df_sample_check.copy()
from sklearn.preprocessing import StandardScaler

X = df[['name_score', 'location_distance', 'address_score', 'category_sim']]
y = df['is_true']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.25,
    random_state=42,
    stratify=y  # keep the same proportion of True vs False in training set and test set
)

In [46]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

xgb_clf = XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    enable_categorical=False,
    eval_metric='auc',
    random_state=42
)

xgb_clf.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=False  
)

xgb_y_pred = xgb_clf.predict(X_test)
xgb_y_prob = xgb_clf.predict_proba(X_test)[:, 1]

xgb_cr = classification_report(y_test, xgb_y_pred)
xgb_auc = roc_auc_score(y_test, xgb_y_prob)

print(xgb_cr)
print("XGBoost AUC:", xgb_auc)

              precision    recall  f1-score   support

           0       0.77      0.83      0.80        90
           1       0.96      0.95      0.95       410

    accuracy                           0.93       500
   macro avg       0.87      0.89      0.88       500
weighted avg       0.93      0.93      0.93       500

XGBoost AUC: 0.9549864498644987


In [None]:
mask = bspo_gplc_fsq['matched_id'].notnull()
df_pred = bspo_gplc_fsq.loc[mask].copy()

feature_cols = ['name_score', 'location_distance', 'address_score', 'category_sim']

X_new = scaler.transform(df_pred[feature_cols])
df_pred['true_match_prob'] = xgb_clf.predict_proba(X_new)[:, 1]
df_pred['is_true_match'] = df_pred['true_match_prob'] >= 0.5

bspo_gplc_fsq.loc[mask, 'is_true_match'] = df_pred['is_true_match']
bspo_gplc_fsq.loc[mask, 'true_match_prob'] = df_pred['true_match_prob']

In [48]:
bspo_gplc_fsq['is_true_match'].value_counts()

is_true_match
True     2242
False     504
Name: count, dtype: int64

In [49]:
bspo_gplc_fsq.drop(columns=['cand_ids','cand_dist_m']).to_file('/Users/houpuli/Redlining Lab Dropbox/HOUPU LI/POI research/BSOP_MSA/bspo_google_comparison/bspo_gplc_fsq.geojson')