參考 link :https://www.kaggle.com/code/ryotayoshinobu/foursquare-lightgbm-baseline
This notebook shows how to solve the problem as a multi-class classification by finding candidate points based on geographic location.<br>
Similarity as a string, such as edit distance and LCS (Longest Common Subsequence), was used for the features of the candidate points.<br>
<br>
Inference is made on test data only, but the code for training is left commented out.<br>
<br>
In addition, making the matches bidirectional as a post-processing step improved the score by about 1%.<br>
<br>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
import warnings
import seaborn as sns
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
# from sentence_transformers import SentenceTransformer, util
# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

# Single place that holds every tunable setting used by the notebook.
class CFG:
    """Notebook-wide configuration constants."""

    # Seed applied to `random`, NumPy and PYTHONHASHSEED below.
    seed = 46
    # Name of the label column.
    target = "point_of_interest"
    # Number of nearest neighbours gathered per row.
    n_neighbors = 10
    # Number of per-fold model files loaded at inference time.
    n_splits = 3

    # On Colab, query the local sessions endpoint and keep the part of the
    # first session's name before the first "."; empty everywhere else.
    # NOTE(review): presumably this is the notebook file name — confirm.
    expID = (
        get("http://172.28.0.2:9000/api/sessions").json()[0]["name"].split(".")[0]
        if "google.colab" in sys.modules
        else ""
    )

# Seed Python's and NumPy's global RNGs with the configured seed.
random.seed(CFG.seed)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here likely only affects child processes, not this kernel — confirm intent.
os.environ["PYTHONHASHSEED"] = str(CFG.seed)
np.random.seed(CFG.seed)

# Default plot font size, and blanket suppression of every warning.
plt.rcParams["font.size"] = 13
warnings.filterwarnings('ignore')

# %cd /content/drive/MyDrive/kaggle/foursquare-location-matching/{CFG.expID}

In [2]:
# Loading the training data is disabled; only the test set is read here.
# train = pd.read_csv("./train.csv")
test = pd.read_csv("../input/foursquare-location-matching/test.csv")
# Fill the label column with a placeholder so code that reads CFG.target
# (e.g. add_neighbor_features) also works on the test set.
test[CFG.target] = "TEST"

# test.head()

# Divide Train Data into about 600K×2

In [3]:
# kf = GroupKFold(n_splits=2)
# for i, (trn_idx, val_idx) in enumerate(kf.split(train, train[CFG.target], train[CFG.target])):
#     train.loc[val_idx, "set"] = i
# train["set"].value_counts()

In [4]:
# train.head()

# 資料代入 / 插補 / 填補 Data Imputation

In [5]:
from sklearn.neighbors import KNeighborsRegressor
# Attach to every place the attributes of its nearest places in the same country.
def add_neighbor_features(df):
    """Return ``df`` extended with nearest-neighbour columns, built per country.

    For every rank ``i`` in ``range(CFG.n_neighbors)`` the result gains
    ``d_near_{i}`` (distance reported by the KNN search), ``near_target_{i}``
    (the neighbour's ``CFG.target`` value) and ``near_{col}_{i}`` for each of
    the attribute columns listed below. Ranks beyond the number of rows in a
    country are filled with NaN.

    Side effect: the text columns of the *input* frame are converted in place
    to lower-case strings (missing values become the string ``"nan"``).

    The returned frame is the concatenation of the per-country frames with a
    fresh 0..n-1 index, so row order follows the country grouping.
    """
    # `id` must be carried along: it is needed later to build the match sets
    # and to compute the score.
    columns = ['id', 'name', 'address', 'city', 'state',
               'zip', 'country', 'url', 'phone', 'categories']
    # Everything except the leading `id` is normalised to lower-case text.
    for col in columns[1:]:
        df[col] = df[col].astype(str).str.lower()

    coord_cols = ['latitude', 'longitude']
    per_country = []
    # Neighbours are only searched among rows sharing the same country.
    for _, group in tqdm(df.groupby("country")):
        group = group.reset_index(drop=True)
        # A country with fewer rows than requested can only supply that many
        # neighbours.
        k = min(len(group), CFG.n_neighbors)
        # NOTE(review): scikit-learn's haversine metric expects (lat, lon) in
        # radians, while the coordinates are passed as read — confirm this
        # matches how the trained models' features were produced.
        knn = KNeighborsRegressor(n_neighbors=k,
                                  metric='haversine', n_jobs=-1)
        # The regression target is irrelevant; the model is only used as a
        # neighbour index over this country's coordinates.
        knn.fit(group[coord_cols], group.index)

        # For every row: distances to, and positions of, its k nearest rows.
        dists, nears = knn.kneighbors(group[coord_cols], return_distance=True)

        targets = group[CFG.target].values
        for rank in range(CFG.n_neighbors):
            if rank < k:
                picked = nears[:, rank]
                group[f"d_near_{rank}"] = dists[:, rank]
                group[f"near_target_{rank}"] = targets[picked]
                for col in columns:
                    group[f"near_{col}_{rank}"] = group[col].values[picked]
            else:
                # Not enough rows in this country: pad the missing ranks.
                group[f"d_near_{rank}"] = np.nan
                group[f"near_target_{rank}"] = np.nan
                for col in columns:
                    group[f"near_{col}_{rank}"] = np.nan

        per_country.append(group)

    return pd.concat(per_country).reset_index(drop=True)


In [6]:
# Attach the per-country nearest-neighbour columns to the test set.
test = add_neighbor_features(test)

  0%|          | 0/4 [00:00<?, ?it/s]

## Two-way Hash
用雜湊表來將正確答案產出

In [7]:
# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics

## Build an id -> point_of_interest lookup table from the input frame.
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping every ``id`` to its ``point_of_interest``.

    If an id occurs more than once, the last occurrence wins.
    """
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {row_id: poi for row_id, poi in zip(ids, pois)}

## Build a point_of_interest -> {ids belonging to that POI} lookup table.
def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Return a dict mapping every ``point_of_interest`` to the set of its ids."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    return {poi: set(members) for poi, members in ids_by_poi}

# id2poi = get_id2poi(train)
# poi2ids = get_poi2ids(train)

## 比對
利用 KNN 的結果做出預測，再與正確答案做比對

In [8]:
## Score a frame that already carries a `matches` column.
def get_score(input_df: pd.DataFrame):
    """Return the mean intersection-over-union between predicted and true matches.

    For every row, the true set is every id sharing the row's POI and the
    predicted set is the whitespace-separated ids in ``matches``.

    Relies on the module-level lookups ``id2poi`` and ``poi2ids``; their
    construction is commented out in this notebook, so they must be defined
    before calling this function.
    """
    row_ids = input_df['id'].to_numpy()
    row_matches = input_df['matches'].to_numpy()

    ious = []
    for row_id, match_str in zip(row_ids, row_matches):
        # Every id that shares this row's POI.
        truth = poi2ids[id2poi[row_id]]
        # Predicted ids, parsed from the space-separated string.
        predicted = set(match_str.split())
        ious.append(len(truth & predicted) / len(truth | predicted))

    return np.array(ious).mean()

In [9]:
# Release memory held by unreachable objects (dropping `train` is disabled
# because the training frame is never loaded in this notebook).
# del train
gc.collect()

182

# Feature Engineering

In [10]:
if "google.colab" in sys.modules:
    !pip install Levenshtein

In [11]:
%load_ext Cython

In [12]:
%%cython
def LCS(str S, str T):
    """Return the length of the longest common subsequence of S and T.

    Classic dynamic programme: table[r][c] holds the LCS length of the first
    r characters of S and the first c characters of T.
    """
    cdef int row, col
    cdef list table = [[0] * (len(T) + 1) for _ in range(len(S) + 1)]
    for row in range(len(S)):
        for col in range(len(T)):
            if S[row] == T[col]:
                # Matching characters extend the diagonal subsequence by one.
                table[row + 1][col + 1] = table[row][col] + 1
            else:
                # Otherwise carry over the better of "skip S[row]" / "skip T[col]".
                table[row + 1][col + 1] = max(table[row][col + 1], table[row + 1][col])
    return table[len(S)][len(T)]

In [13]:
# 利用 Levenshtein、difflib 算出兩字串的各種 features 相異程度數值，並把數值加入到 features 內
import Levenshtein
import difflib

def _add_distance_features(args):
    """Add string-similarity features between neighbour 0 and every neighbour.

    Args:
        args: either a DataFrame, or a ``(key, DataFrame)`` tuple as yielded
            by ``DataFrame.groupby`` (the form the multiprocessing wrapper
            ``add_distance_features`` may hand to this worker).

    Returns:
        The same DataFrame, modified in place, with these columns added for
        every attribute ``c`` and neighbour rank ``i``:

        * ``near_{c}_{i}_gesh``  - ``difflib.SequenceMatcher`` ratio
        * ``near_{c}_{i}_leven`` - Levenshtein distance
        * ``near_{c}_{i}_jaro``  - Jaro-Winkler similarity
        * ``near_{c}_{i}_lcs``   - longest-common-subsequence length

        and, for every attribute except country / phone / zip:

        * ``near_{c}_{i}_len``    - string length of the rank-i value
        * ``near_{c}_{i}_nleven`` - leven / max(len of rank i, len of rank 0)
        * ``near_{c}_{i}_nlcsi``  - lcs / len of rank i
        * ``near_{c}_{i}_nlcs0``  - lcs / len of rank 0

    Each comparison is between ``near_{c}_0`` and ``near_{c}_{i}``.
    """
    # Accept the (key, frame) pairs produced by groupby as well as a bare
    # frame, so the function works both directly and as a pool worker.
    df = args[1] if isinstance(args, tuple) else args
    columns = ['name', 'address', 'city', 'state',
               'zip', 'country', 'url', 'phone', 'categories']
    # Attributes that do not get the length-normalised features.
    raw_only_columns = ('country', 'phone', 'zip')

    for i in tqdm(range(CFG.n_neighbors)):
        for c in columns:
            geshs = []
            levens = []
            jaros = []
            lcss = []
            # astype(str) turns every value into text, so missing values are
            # compared as the literal string "nan".
            pairs = df[[f"near_{c}_0", f"near_{c}_{i}"]].values.astype(str)
            for str1, str2 in pairs:
                # NaN guard (x == x is False only for NaN). After the string
                # cast above it always passes; it is kept so the sentinel path
                # stays correct if the cast is ever removed.
                if str1 == str1 and str2 == str2:
                    geshs.append(difflib.SequenceMatcher(None, str1, str2).ratio())
                    levens.append(Levenshtein.distance(str1, str2))
                    jaros.append(Levenshtein.jaro_winkler(str1, str2))
                    lcss.append(LCS(str(str1), str(str2)))
                else:
                    # Keep all four lists the same length as the frame,
                    # otherwise the column assignments below would fail.
                    geshs.append(-1)
                    levens.append(-1)
                    jaros.append(-1)
                    lcss.append(-1)
            df[f"near_{c}_{i}_gesh"] = geshs
            df[f"near_{c}_{i}_leven"] = levens
            df[f"near_{c}_{i}_jaro"] = jaros
            df[f"near_{c}_{i}_lcs"] = lcss

            # For name, address, city, state, url and categories also add the
            # length and the length-normalised variants. The rank-0 length
            # column exists by now because i == 0 is processed first.
            if c not in raw_only_columns:
                df[f"near_{c}_{i}_len"] = df[f"near_{c}_{i}"].astype(str).map(len)
                df[f"near_{c}_{i}_nleven"] = df[f"near_{c}_{i}_leven"] / df[[f"near_{c}_{i}_len", f"near_{c}_0_len"]].max(axis=1)
                df[f"near_{c}_{i}_nlcsi"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_{i}_len"]
                df[f"near_{c}_{i}_nlcs0"] = df[f"near_{c}_{i}_lcs"] / df[f"near_{c}_0_len"]
    return df

# Multiprocessing wrapper around _add_distance_features.
def add_distance_features(df):
    """Compute the distance features per country in a pool of worker processes.

    The frame is split by ``country``; each part is processed by
    ``_add_distance_features`` in a worker and the results are concatenated.
    Results arrive in completion order, so the row order of the returned
    frame is not guaranteed to match the input; the original index labels
    are kept.
    """
    processes = multiprocessing.cpu_count()
    # groupby yields (country, frame) pairs; the worker needs only the frame.
    country_frames = [group for _, group in df.groupby('country')]
    with multiprocessing.Pool(processes=processes) as pool:
        results = pool.imap_unordered(_add_distance_features, country_frames)
        # Consume inside the `with` block: leaving it terminates the pool.
        dfs = list(tqdm(results, total=len(country_frames)))
    return pd.concat(dfs)

## stop plz

In [14]:
# Single-process call on the whole frame; the multiprocessing wrapper
# add_distance_features is not used here.
test = _add_distance_features(test)

  0%|          | 0/10 [00:00<?, ?it/s]

# Delete Unused Columns for avoiding OOM (out of memory)

In [15]:
# The derived features were added to the frame in the cells above. `features`
# lists, in a fixed order, the columns that are kept as model inputs; every
# other column is dropped in the next cell.
features = []

columns = ['name', 'address', 'city', 'state',
       'zip', 'country', 'url', 'phone', 'categories']
for i in tqdm(range(CFG.n_neighbors)):
    # Distance to the rank-i neighbour.
    features.append(f"d_near_{i}")
    for c in columns:        
        features += [f"near_{c}_{i}_gesh", f"near_{c}_{i}_jaro", f"near_{c}_{i}_lcs"]
        # country / phone / zip keep the raw edit distance; the other
        # attributes use the length and length-normalised variants instead.
        if c in ['country', "phone", "zip"]:
            features += [f"near_{c}_{i}_leven"]
        else:
            features += [f"near_{c}_{i}_len", f"near_{c}_{i}_nleven", f"near_{c}_{i}_nlcsi", f"near_{c}_{i}_nlcs0"]

# Any expected feature that is missing from the frame is created as all-NaN
# instead of failing (the stricter assert is left disabled).
for f in features:
#     assert f in test.columns
    if f not in test.columns:
        test[f] = np.nan

# print(features)

  0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
# Keep only the model inputs plus the row id and the neighbour ids, which are
# needed later to turn predictions into match strings.
test = test[features + ["id"] + [f"near_id_{i}" for i in range(CFG.n_neighbors)]]

# Down-cast the features to float16 to reduce memory use.
# NOTE(review): float16 has limited range and precision — presumably this
# matches how the models were trained; confirm.
test[features] = test[features].astype(np.float16)

test.reset_index(drop=True, inplace=True)

# Several collection passes after dropping the unused columns.
for _ in range(5):
    gc.collect()


**訓練與測試資料的格式**
- X 包含 
    - CFG 的資訊
    - n_neighbors = 10 的所有資訊
    - n_neighbors 跟 CFG.target 相關的所有字串配對分數
- y 包含
    - 相同 POI 之中，最大的 neighbor

# Model Running
使用 lightGBM 演算法

In [17]:
# Run every fold model's predict_proba and average the results.
def inference_lgbm(models, feat_df):
    """Return the element-wise mean of ``predict_proba`` over all ``models``.

    Args:
        models: iterable of fitted models exposing ``predict_proba``.
        feat_df: feature matrix passed unchanged to each model.

    Returns:
        Array with the same shape as a single model's ``predict_proba`` output.
    """
    fold_probs = []
    for model in models:
        fold_probs.append(model.predict_proba(feat_df))
    # Stack to (n_models, ...) and average over the model axis.
    return np.array(fold_probs).mean(axis=0)

In [18]:
# Load one saved model per fold from the "continue_target" model set.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
models = [joblib.load(f'../input/continue-target/continue_target/lgbm_fold{i}.pkl') for i in range(CFG.n_splits)]
# Predict on the test features and average the class probabilities over folds.
pred_con = inference_lgbm(models, test[features])

FileNotFoundError: [Errno 2] No such file or directory: '../input/continue-target/continue_target/lgbm_fold0.pkl'

In [None]:
# Load one saved model per fold from the "80percent" model set.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
models = [joblib.load(f'../input/80percent/80percent/lgbm_fold{i}.pkl') for i in range(CFG.n_splits)]
# Predict on the test features and average the class probabilities over folds.
pred_far = inference_lgbm(models, test[features])

# Check CV

In [None]:
# Ids of the n_neighbors nearest rows of every test row
# (shape: number of rows x CFG.n_neighbors).
near_ids = test[[f"near_id_{i}" for i in range(CFG.n_neighbors)]].values

matches = []
# Walk over every test row together with both models' class probabilities and
# the row's neighbour ids. The loop variable is deliberately not called `id`,
# so the builtin is not shadowed at module scope.
for row_id, ps_con, ps_far, ids in tqdm(zip(test["id"], pred_con, pred_far, near_ids)):
    # The most probable class of each model is used as a neighbour rank.
    # NOTE(review): presumably idx_con is the number of consecutive nearest
    # neighbours that match and idx_far the rank of the farthest matching
    # neighbour — confirm against the training code.
    idx_con = np.argmax(ps_con)
    idx_far = np.argmax(ps_far)

    # A row always matches itself.
    matched = [row_id]
    # Take neighbours 1..idx_con; NaN padding (x != x) is skipped.
    for rank in range(1, idx_con + 1):
        neighbor_id = ids[rank]
        if neighbor_id == neighbor_id:
            matched.append(neighbor_id)
    # If the second model points beyond that run, add that single neighbour.
    if idx_far > idx_con:
        far_id = ids[idx_far]
        if far_id == far_id:
            matched.append(far_id)

    matches.append(" ".join(matched))
test["matches"] = matches
# Scoring is disabled: the test set carries no ground-truth labels.
# print(f"CV: {get_score(test):.6f}")

# Simple Post-Processing

In [None]:
def postprocess(df):
    """Make the predicted matches bidirectional.

    Whenever row A lists B among its matches, A is appended to B's matches
    unless it is already there. ``df["matches"]`` is overwritten in place
    and ``df`` is returned.

    Every id that appears in a ``matches`` string must also be present in
    ``df["id"]``; otherwise the lookup raises ``KeyError``.
    """
    split_matches = df["matches"].str.split()
    id2match = dict(zip(df["id"].values, split_matches))

    # Iterate the original strings; the lists in id2match grow as we go.
    for raw in tqdm(df["matches"]):
        tokens = raw.split()
        # A row that matches only itself has nothing to propagate.
        if len(tokens) != 1:
            owner = tokens[0]
            for other in tokens[1:]:
                partner_matches = id2match[other]
                if owner not in partner_matches:
                    partner_matches.append(owner)

    joined = df["id"].map(id2match).map(" ".join)
    df["matches"] = joined
    return df

# Make the matches bidirectional before building the submission.
test = postprocess(test)
# print(f"CV: {get_score(test):.6f}")

In [None]:
test

# Submit

In [None]:
# Take the ids from the sample submission, replace its placeholder `matches`
# column with the predicted one (inner join on `id`), and write the result.
ssub = (
    pd.read_csv("../input/foursquare-location-matching/sample_submission.csv")
    .drop(columns="matches")
    .merge(test[["id", "matches"]], on="id")
)
ssub.to_csv("submission.csv", index=False)

ssub.head()