In [1]:
# Debug switch and the country codes associated with it.
# NOTE(review): the code that reads these two flags is not visible in this
# cell; presumably DEBUG restricts processing to DEBUG_COUNTRY - confirm.
DEBUG = False
DEBUG_COUNTRY = ["GB"]

In [2]:
# =========================
# Library
# =========================
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import os
import gc
import random
from glob import glob
import warnings
import pickle
import json
import re
import time
import sys
from requests import get
import multiprocessing
import joblib
from joblib import Parallel, delayed
import Levenshtein
import difflib
from contextlib import contextmanager
from sklearn.neighbors import KNeighborsRegressor
import unicodedata
from transformers import AutoModel,AutoTokenizer
from cuml import ForestInference
from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=true
import torch.nn.functional as F
import torch.nn as nn
import torch
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast
from catboost import CatBoostClassifier
warnings.filterwarnings('ignore')

env: TOKENIZERS_PARALLELISM=true


In [3]:
# =========================
# Constant
# =========================
# Input CSV locations, relative to the notebook working directory.
TRAIN_PATH = "../input/foursquare-fold/fold_train.csv"
TRAIN_RAW_PATH = "../input/foursquare-location-matching/train.csv"
TEST_PATH = "../input/foursquare-location-matching/test.csv"
SUB_PATH = "../input/foursquare-location-matching/sample_submission.csv"
# Name of the label column (also the column read by get_id2poi / get_poi2ids).
TARGET = "point_of_interest"

In [4]:
# =========================
# Settings
# =========================
# Stage-level knobs. NOTE(review): their consumers are outside this cell; the
# descriptions below are inferred from the names and should be confirmed.
# Presumably the number of nearest neighbours generated per record.
n_neighbors_first_stage = 100
# Presumably the per-id prediction-rank cut-off passed to remove_low_rank_*.
second_stage_rank = 20
# Presumably probability thresholds for dropping pairs before stages 3 and 4.
# NOTE(review): "thrid" is a typo of "third"; the name is left unchanged
# because other cells may reference it.
thrid_stage_blocking = 0.01
fourth_stage_blocking = 0.02

# ================================
# model
# ================================
# place
# Each entry is a list of model files (one element per fold in use).
# Location-based first-stage model (LightGBM text dump).
first_stage_place_lgb_path = ["../input/foursquare-ex073/lgb_fold0.txt"]

# name
# Name-embedding-based first-stage model.
first_stage_name_lgb_path = ["../input/foursquare-ex074/lgb_fold0.txt"]

# Second-stage LightGBM model and third-stage CatBoost model.
second_stage_lgb_path = ["../input/foursquare-ex075/lgb_fold0.txt"]
third_stage_cat_path = ["../input/foursquare-ex090/model"]


# ================================
# fe 
# ================================
# Experiment id used to build the pickled encoder paths below.
# NOTE(review): the variable names hard-code "fe045", so changing
# category_encoder changes the paths but not the names.
category_encoder = "045"
fe045_categories_path = f"../input/foursquare-fe{category_encoder}/fe{category_encoder}_categories.pkl"
fe045_city_path = f"../input/foursquare-fe{category_encoder}/fe{category_encoder}_city.pkl"
fe045_country_path = f"../input/foursquare-fe{category_encoder}/fe{category_encoder}_country.pkl"
fe045_state_path = f"../input/foursquare-fe{category_encoder}/fe{category_encoder}_state.pkl"
# Pickled SVD object (the f-prefix is redundant here: no placeholders).
fe046_svd_path = f"../input/foursquare-fe046/fe046_svd.pkl"

# Column order of the first-stage (location) feature matrix; the position of
# each name becomes its column index in first_stage_place_cols2num_dict.
first_stage_place_features = ['latitude', 'longitude', 'rank', 'd_near', 
                             'near_latitude', 'near_longitude', 'name_jaro', 
                             'categories_jaro']

# Column order of the first-stage (name) feature matrix; 'distance' is the
# column written by make_place_distance.
first_stage_name_features = ['latitude', 'longitude', 'rank', 'd_near', 'near_latitude', 'near_longitude', 'name_jaro', 'distance']


# Column order of the second-stage feature matrix. The position of each name
# becomes its column index in second_stage_cols2num_dict, so the order must
# not change. concat_name_emb writes name_emb_svd0..9 and
# near_name_emb_svd0..9 with slices, so those two runs must stay contiguous.
second_stage_features= ['latitude',
 'longitude',
 'rank',
 'd_near',
 'near_latitude',
 'near_longitude',
 # per-field string similarities (gesh / leven / jaro), see calc_distance_second_stage
 'name_gesh',
 'name_leven',
 'name_jaro',
 'address_gesh',
 'address_leven',
 'address_jaro',
 'city_gesh',
 'city_leven',
 'city_jaro',
 'state_gesh',
 'state_leven',
 'state_jaro',
 'zip_gesh',
 'zip_leven',
 'zip_jaro',
 'url_gesh',
 'url_leven',
 'url_jaro',
 'phone_gesh',
 'phone_leven',
 'phone_jaro',
 'categories_gesh',
 'categories_leven',
 'categories_jaro',
 'distance',
 'distance_rank',
 # per-id aggregates and ratios written by distance_agg
 'name_gesh_mean',
 'name_gesh_max',
 'near_name_gesh_mean',
 'near_name_gesh_max',
 'name_gesh_mean_rate',
 'name_gesh_max_rate',
 'near_name_gesh_mean_rate',
 'near_name_gesh_max_rate',
 'name_leven_mean',
 'name_leven_min',
 'near_name_leven_mean',
 'near_name_leven_min',
 'name_leven_mean_rate',
 'name_leven_min_rate',
 'near_name_leven_mean_rate',
 'near_name_leven_min_rate',
 'name_jaro_mean',
 'name_jaro_max',
 'near_name_jaro_mean',
 'near_name_jaro_max',
 'name_jaro_mean_rate',
 'name_jaro_max_rate',
 'near_name_jaro_mean_rate',
 'near_name_jaro_max_rate',
 'categories_gesh_mean',
 'categories_gesh_max',
 'near_categories_gesh_mean',
 'near_categories_gesh_max',
 'categories_gesh_mean_rate',
 'categories_gesh_max_rate',
 'near_categories_gesh_mean_rate',
 'near_categories_gesh_max_rate',
 'categories_leven_mean',
 'categories_leven_min',
 'near_categories_leven_mean',
 'near_categories_leven_min',
 'categories_leven_mean_rate',
 'categories_leven_min_rate',
 'near_categories_leven_mean_rate',
 'near_categories_leven_min_rate',
 'categories_jaro_mean',
 'categories_jaro_max',
 'near_categories_jaro_mean',
 'near_categories_jaro_max',
 'categories_jaro_mean_rate',
 'categories_jaro_max_rate',
 'near_categories_jaro_mean_rate',
 'near_categories_jaro_max_rate',
 'd_near_mean',
 'd_near_min',
 'near_d_near_mean',
 'near_d_near_min',
 'd_near_mean_rate',
 'd_near_min_rate',
 'near_d_near_mean_rate',
 'near_d_near_min_rate',
 'distance_mean',
 'distance_min',
 'near_distance_mean',
 'near_distance_min',
 'distance_mean_rate',
 'distance_min_rate',
 'near_distance_mean_rate',
 'near_distance_min_rate',
 # label-encoded categoricals written by make_cat_features
 'city_label',
 'near_city_label',
 'state_label',
 'near_state_label',
 'country_label',
 'near_country_label',
 'categories_label',
 'near_categories_label',
 # 10-dimensional name-embedding blocks written by concat_name_emb
 'name_emb_svd0',
 'name_emb_svd1',
 'name_emb_svd2',
 'name_emb_svd3',
 'name_emb_svd4',
 'name_emb_svd5',
 'name_emb_svd6',
 'name_emb_svd7',
 'name_emb_svd8',
 'name_emb_svd9',
 'near_name_emb_svd0',
 'near_name_emb_svd1',
 'near_name_emb_svd2',
 'near_name_emb_svd3',
 'near_name_emb_svd4',
 'near_name_emb_svd5',
 'near_name_emb_svd6',
 'near_name_emb_svd7',
 'near_name_emb_svd8',
 'near_name_emb_svd9']

# ================================
# bert
# ================================
# Numeric column names for the first BERT model: 16 geometric columns followed
# by the string-similarity and aggregate columns.
# NOTE(review): the consumer is not visible here; presumably these columns are
# fed to the model alongside the text - confirm.
bert_num_cols1 = ["latitude", "longitude", "near_latitude", "near_longitude",
           "latdiff", "londiff", "manhattan", "euclidean", "haversine",
           "x", "y", "z", "near_x", "near_y", "near_z", "dot",
           'name_gesh', 'name_leven', 'name_jaro',
           'address_gesh', 'address_leven', 'address_jaro', 'city_gesh',
           'city_leven', 'city_jaro', 'state_gesh', 'state_leven', 'state_jaro',
           'zip_gesh', 'zip_leven', 'zip_jaro', 'url_gesh', 'url_leven',
           'url_jaro', 'phone_gesh', 'phone_leven','phone_jaro', 'categories_gesh', 'categories_leven', 'categories_jaro',
           'distance', 'distance_rank', 'name_gesh_mean', 'name_gesh_max',
           'near_name_gesh_mean', 'near_name_gesh_max', 'name_gesh_mean_rate',
           'name_gesh_max_rate', 'near_name_gesh_mean_rate',
           'near_name_gesh_max_rate', 'name_leven_mean', 'name_leven_min',
           'near_name_leven_mean', 'near_name_leven_min', 'name_leven_mean_rate',
           'name_leven_min_rate', 'near_name_leven_mean_rate',
           'near_name_leven_min_rate', 'name_jaro_mean', 'name_jaro_max',
           'near_name_jaro_mean', 'near_name_jaro_max', 'name_jaro_mean_rate',
           'name_jaro_max_rate', 'near_name_jaro_mean_rate',
           'near_name_jaro_max_rate', 'categories_gesh_mean',
           'categories_gesh_max', 'near_categories_gesh_mean',
           'near_categories_gesh_max', 'categories_gesh_mean_rate',
           'categories_gesh_max_rate', 'near_categories_gesh_mean_rate',
           'near_categories_gesh_max_rate', 'categories_leven_mean',
           'categories_leven_min', 'near_categories_leven_mean',
           'near_categories_leven_min', 'categories_leven_mean_rate',
           'categories_leven_min_rate', 'near_categories_leven_mean_rate',
           'near_categories_leven_min_rate', 'categories_jaro_mean',
           'categories_jaro_max', 'near_categories_jaro_mean',
           'near_categories_jaro_max','categories_jaro_mean_rate', 'categories_jaro_max_rate',
           'near_categories_jaro_mean_rate', 'near_categories_jaro_max_rate']


# Numeric column names for the second BERT model: only latitude/longitude from
# the geometric group, followed by the string-similarity and aggregate columns.
# NOTE(review): the consumer is not visible here - confirm how it is used.
bert_num_cols2 = ["latitude","longitude",'name_gesh', 'name_leven', 'name_jaro',
       'address_gesh', 'address_leven', 'address_jaro', 'city_gesh',
       'city_leven', 'city_jaro', 'state_gesh', 'state_leven', 'state_jaro',
       'zip_gesh', 'zip_leven', 'zip_jaro', 'url_gesh', 'url_leven',
       'url_jaro', 'phone_gesh', 'phone_leven','phone_jaro', 'categories_gesh', 'categories_leven', 'categories_jaro',
       'distance', 'distance_rank', 'name_gesh_mean', 'name_gesh_max',
       'near_name_gesh_mean', 'near_name_gesh_max', 'name_gesh_mean_rate',
       'name_gesh_max_rate', 'near_name_gesh_mean_rate',
       'near_name_gesh_max_rate', 'name_leven_mean', 'name_leven_min',
       'near_name_leven_mean', 'near_name_leven_min', 'name_leven_mean_rate',
       'name_leven_min_rate', 'near_name_leven_mean_rate',
       'near_name_leven_min_rate', 'name_jaro_mean', 'name_jaro_max',
       'near_name_jaro_mean', 'near_name_jaro_max', 'name_jaro_mean_rate',
       'name_jaro_max_rate', 'near_name_jaro_mean_rate',
       'near_name_jaro_max_rate', 'categories_gesh_mean',
       'categories_gesh_max', 'near_categories_gesh_mean',
       'near_categories_gesh_max', 'categories_gesh_mean_rate',
       'categories_gesh_max_rate', 'near_categories_gesh_mean_rate',
       'near_categories_gesh_max_rate', 'categories_leven_mean',
       'categories_leven_min', 'near_categories_leven_mean',
       'near_categories_leven_min', 'categories_leven_mean_rate',
       'categories_leven_min_rate', 'near_categories_leven_mean_rate',
       'near_categories_leven_min_rate', 'categories_jaro_mean',
       'categories_jaro_max', 'near_categories_jaro_mean',
       'near_categories_jaro_max','categories_jaro_mean_rate', 'categories_jaro_max_rate',
       'near_categories_jaro_mean_rate', 'near_categories_jaro_max_rate']

# Two numbers per numeric feature name.
# NOTE(review): presumably [mean, std] used to standardise the BERT numeric
# inputs (entries of [0, 1] would then leave a feature unchanged) - confirm
# against the code that consumes sc_dict, which is not visible here.
sc_dict = {'latitude': [26.87459868745177, 23.144740576788625],
 'longitude': [20.70497351331466, 82.6778436146614],
 'near_latitude': [22.377329, 23.80125],
 'near_longitude': [47.604324, 72.81156],
 'latdiff': [0.0026245795, 0.60088676],
 'londiff': [-0.004257245, 1.7946571],
 'manhattan': [0.21769507, 2.1573222],
 'euclidean': [0.17776342, 1.8842192],
 'haversine': [0.002634386, 0.023196151],
 'x': [0, 1],
 'y': [0, 1],
 'z': [0, 1],
 'near_x': [0, 1],
 'near_y': [0, 1],
 'near_z': [0, 1],
 'dot': [0, 1],
 'name_gesh': [0.535247, 0.28312334],
 'name_leven': [12.289453, 8.717725],
 'name_jaro': [0.6814999, 0.278118],
 'address_gesh': [0.55750847, 0.32904968],
 'address_leven': [11.519652, 11.330601],
 'address_jaro': [0.68748957, 0.28175715],
 'city_gesh': [0.78845024, 0.33762273],
 'city_leven': [2.639476, 4.342476],
 'city_jaro': [0.8420279, 0.29122102],
 'state_gesh': [0.7989922, 0.3356451],
 'state_leven': [2.618497, 4.6914506],
 'state_jaro': [0.8407704, 0.2916656],
 'zip_gesh': [0.90403444, 0.18740492],
 'zip_leven': [0.6094352, 1.2436553],
 'zip_jaro': [0.9485774, 0.11972204],
 'url_gesh': [0.8163289, 0.22502218],
 'url_leven': [15.599948, 23.641768],
 'url_jaro': [0.9568382, 0.07457281],
 'phone_gesh': [0.7717348, 0.24688902],
 'phone_leven': [3.4015062, 3.4060066],
 'phone_jaro': [0.85937643, 0.16305733],
 'categories_gesh': [0.5930225, 0.32405168],
 'categories_leven': [10.481613, 10.402995],
 'categories_jaro': [0.72280174, 0.2595957],
 'distance': [3.5818813, 170.23347],
 'distance_rank': [11.811793, 10.082427],
 'name_gesh_mean': [0.40428534, 0.14291741],
 'name_gesh_max': [0.8063714, 0.17690912],
 'near_name_gesh_mean': [0.40961915, 0.14795418],
 'near_name_gesh_max': [0.8105544, 0.17704241],
 'name_gesh_mean_rate': [1.3989464, 1.0484291],
 'name_gesh_max_rate': [0.6530004, 0.29792878],
 'near_name_gesh_mean_rate': [1.3906603, 1.0571884],
 'near_name_gesh_max_rate': [0.65022796, 0.298543],
 'name_leven_mean': [14.080113, 5.990362],
 'name_leven_min': [5.0102377, 5.371386],
 'near_name_leven_mean': [13.955088, 6.017419],
 'near_name_leven_min': [4.898045, 5.3486223],
 'name_leven_mean_rate': [0.8712333, 0.6081466],
 'name_leven_min_rate': [3.0417712, 3.5432434],
 'near_name_leven_mean_rate': [0.88712335, 0.6753299],
 'near_name_leven_min_rate': [3.103235, 3.6311984],
 'name_jaro_mean': [0.59393793, 0.14204046],
 'name_jaro_max': [0.922182, 0.113545366],
 'near_name_jaro_mean': [0.5990131, 0.14566767],
 'near_name_jaro_max': [0.9246227, 0.112311274],
 'name_jaro_mean_rate': [1.17415, 0.75928885],
 'name_jaro_max_rate': [0.73229, 0.27749673],
 'near_name_jaro_mean_rate': [1.1688758, 0.76395464],
 'near_name_jaro_max_rate': [0.73066276, 0.2782531],
 'categories_gesh_mean': [0.45021054, 0.16041645],
 'categories_gesh_max': [0.8934652, 0.18464169],
 'near_categories_gesh_mean': [0.45295003, 0.1621628],
 'near_categories_gesh_max': [0.89459765, 0.18372972],
 'categories_gesh_mean_rate': [1.3444637, 0.71897376],
 'categories_gesh_max_rate': [0.65741974, 0.31307998],
 'near_categories_gesh_mean_rate': [1.3444862, 0.7269207],
 'near_categories_gesh_max_rate': [0.65774196, 0.31624898],
 'categories_leven_mean': [13.444308, 6.6503487],
 'categories_leven_min': [2.9128094, 5.704073],
 'near_categories_leven_mean': [13.410719, 6.695578],
 'near_categories_leven_min': [2.9045722, 5.7120137],
 'categories_leven_mean_rate': [0.7467077, 0.7704559],
 'categories_leven_min_rate': [2.024397, 1.5368462],
 'near_categories_leven_mean_rate': [0.75811625, 0.9181195],
 'near_categories_leven_min_rate': [2.0135462, 1.5323894],
 'categories_jaro_mean': [0.6192549, 0.1293145],
 'categories_jaro_max': [0.94835114, 0.116735004],
 'near_categories_jaro_mean': [0.6217563, 0.13072947],
 'near_categories_jaro_max': [0.94931644, 0.11579985],
 'categories_jaro_mean_rate': [1.1727746, 0.40448013],
 'categories_jaro_max_rate': [0.7595728, 0.24773462],
 'near_categories_jaro_mean_rate': [1.1711832, 0.4081076],
 'near_categories_jaro_max_rate': [0.75936586, 0.24919231]}

# Settings for the name-embedding BERT (bert_model loads BERT_MODEL).
# NOTE(review): MAX_LEN / BS are presumably the tokenizer length and the
# DataLoader batch size - their use is outside this cell.
MAX_LEN = 32
BS = 64
# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BERT_MODEL = "../input/bert-base-multilingual/bert-base-multilingual-uncased"

# ================================
# third bert
# ================================
# Third-stage model 1: backbone directory, fine-tuned weights and
# (presumably) tokenizer length / batch size - consumers are not visible here.
THIRD_MAX_LEN = 128
THIRD_BS = 32
THIRD_BERT_MODEL1 = "../input/xlm-roberta-large/xlm-roberta-large"
# third stage
third_stage_model1_path = "../input/foursquare-ex104/ex104_2.pth"

# ================================
# third bert2
# ================================
# Third-stage model 2: backbone directory, fine-tuned weights and
# (presumably) tokenizer length / batch size - consumers are not visible here.
THIRD_MAX_LEN2 = 128
THIRD_BS2 = 48
THIRD_BERT_MODEL2 = "../input/mdeberta-base/mdeberta-v3-base"
# third stage
third_stage_model2_path = "../input/foursquare-ex115/ex115_3_ema.pth"

# ================================
# ensemble
# ================================
# Ensemble weights; the four values sum to 1 up to float rounding.
# NOTE(review): which model each weight belongs to is decided by the blending
# code, which is not visible here.
w1 = 0.013225176172449034
w2 = 0.28751060139700985
w3 = 0.3813813593361608
w4 = 0.31788286309438024

# ================================
# fourth bert
# ================================
# Fourth-stage model: same backbone directory as third-stage model 1, with
# its own fine-tuned weights. MAX_LEN / BS consumers are not visible here.
FOURTH_MAX_LEN = 128
FOURTH_BS = 32
FOURTH_BERT_MODEL1 = "../input/xlm-roberta-large/xlm-roberta-large"
# fourth stage
fourth_stage_model1_path = "../input/foursquare-ex101/ex101_2.pth"

In [5]:
# Convert each feature-name list into a {feature name: column index} dict
first_stage_place_cols2num_dict = {
    feature: position
    for position, feature in enumerate(first_stage_place_features)
}

first_stage_name_cols2num_dict = {
    feature: position
    for position, feature in enumerate(first_stage_name_features)
}

second_stage_cols2num_dict = {
    feature: position
    for position, feature in enumerate(second_stage_features)
}

In [6]:
# ================================
# functions utils
# ================================
def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Integer columns (dtype name starting with ``int``) are cast to the
    smallest of int8 / int16 / int32 whose open range ``(min, max)`` contains
    every value. Columns that do not fit int32 are left untouched; the
    previous version cast them to int32 anyway, which overflowed large values.
    Every other non-object column is cast to float32 (float16 is deliberately
    never used). Object columns, and columns whose min/max cannot be computed
    or compared with a number, are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for col in df.columns:
        try:
            col_type = df[col].dtype
            if col_type == object:
                continue

            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                for int_type in (np.int8, np.int16, np.int32):
                    limits = np.iinfo(int_type)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(int_type)
                        break
                # no break: the values need 64 bits, keep the column as it is
            else:
                # The comparison is evaluated only because it raises for
                # non-numeric values (strings, timestamps, NA); such columns
                # are skipped by the except clause, as before.
                bool(c_min > np.finfo(np.float16).min)
                df[col] = df[col].astype(np.float32)
        except Exception:
            # Best effort: a column that cannot be analysed or cast is left
            # unchanged. KeyboardInterrupt / SystemExit are no longer swallowed.
            continue

    return df


def join(df):
    """Return the items of ``df`` converted to ``str`` and joined by single spaces."""
    return " ".join(map(str, list(df)))

# https://www.kaggle.com/code/columbia2131/foursquare-iou-metrics
def get_id2poi(input_df: pd.DataFrame) -> dict:
    """Map each value of the ``id`` column to the ``point_of_interest`` of the same row."""
    ids = input_df['id']
    pois = input_df['point_of_interest']
    return {place_id: poi for place_id, poi in zip(ids, pois)}

def get_poi2ids(input_df: pd.DataFrame) -> dict:
    """Map each ``point_of_interest`` value to the set of ``id`` values that share it."""
    ids_by_poi = input_df.groupby('point_of_interest')['id']
    id_sets = ids_by_poi.apply(set)
    return id_sets.to_dict()

def get_score(input_df: pd.DataFrame):
    """Return the mean intersection-over-union between true and predicted matches.

    For every row the true set is ``poi2ids[id2poi[id]]`` and the predicted
    set is the whitespace-separated ids in the ``matches`` column.
    ``poi2ids`` and ``id2poi`` are read from module scope and must be defined
    before this function is called.
    """
    def row_iou(id_str, matches):
        targets = poi2ids[id2poi[id_str]]
        preds = set(matches.split())
        return len(targets & preds) / len(targets | preds)

    ids = input_df['id'].to_numpy()
    match_strings = input_df['matches'].to_numpy()
    scores = np.array([row_iou(id_str, matches)
                       for id_str, matches in zip(ids, match_strings)])
    return scores.mean()

In [7]:
# ================================
# functions first stage
# ================================
def make_candidate_first_stage(country_df, n_neighbors,columns):
    """Build candidate pairs from the nearest neighbours in latitude/longitude.

    A euclidean k-NN is fitted on the ``latitude`` / ``longitude`` columns of
    ``country_df`` with ``k = min(len(country_df), n_neighbors)``. For each
    neighbour rank a copy of ``country_df[columns]`` is produced with a
    ``rank`` column, the neighbour distance in ``d_near`` and one ``near_<c>``
    column per entry of ``columns`` holding the neighbour's value. The copies
    are concatenated with a fresh index.
    """
    k = min(len(country_df), n_neighbors)
    coords = country_df[['latitude','longitude']]
    knn = KNeighborsRegressor(n_neighbors=k, metric="euclidean", n_jobs=-1)
    knn.fit(coords, country_df.index)
    dists, nears = knn.kneighbors(coords, return_distance=True)

    frames = []
    for rank in range(k):
        candidates = country_df[columns].copy()
        candidates["rank"] = rank
        candidates["d_near"] = dists[:, rank]
        neighbor_rows = nears[:, rank]  # positional row numbers of the neighbours
        for c in columns:
            candidates[f"near_{c}"] = candidates[c].values[neighbor_rows]
        frames.append(candidates)
    return pd.concat(frames).reset_index(drop=True)


def delete_match_id(concat_df):
    """Drop the pairs whose ``id`` equals ``near_id`` and downcast the result.

    An integer ``id_match`` flag column is added to ``concat_df`` (the input
    frame is modified) and is kept in the returned frame, where every
    remaining row has the value 0. The filtered frame gets a fresh index and
    is passed through ``reduce_mem_usage``.
    """
    is_self_pair = concat_df["id"] == concat_df["near_id"]
    concat_df["id_match"] = is_self_pair.astype(int)
    distinct_pairs = concat_df[concat_df["id_match"] == 0].reset_index(drop=True)
    return reduce_mem_usage(distinct_pairs)


def df2numpy(concat_npy,concat_df,columns,cols2num_dict):
    """Move ``columns`` from a DataFrame into a pre-allocated feature matrix.

    Each column is written as float32 into column ``cols2num_dict[c]`` of
    ``concat_npy`` (modified in place) and then dropped from the frame.
    Returns the matrix and a frame without the moved columns; the caller's
    frame object is not modified.
    """
    remaining = concat_df
    for c in columns:
        values = remaining[c].values.astype(np.float32)
        concat_npy[:, cols2num_dict[c]] = values
        remaining = remaining.drop(columns=c)
    return concat_npy, remaining

def text_preprocess(text):
    """Normalise a value for string comparison.

    The value is converted to ``str``, ASCII spaces are removed, the text is
    lower-cased and finally NFKC-normalised (in that order).
    """
    compact = str(text).replace(" ","")
    return unicodedata.normalize("NFKC", compact.lower())

# Feature engineering
def calc_distance_first_stage(c1_array,c2_array):
    """Return the Jaro-Winkler similarity of paired values as an (n, 1) float32 array.

    Both values of a pair are passed through ``text_preprocess``; if either
    one becomes the string "nan" the result for that pair is NaN.
    """
    similarity = np.zeros(len(c1_array),dtype=np.float32)
    for row, pair in enumerate(zip(c1_array,c2_array)):
        left, right = (text_preprocess(value) for value in pair)
        if "nan" in (left, right):
            similarity[row] = np.nan
        else:
            similarity[row] = Levenshtein.jaro_winkler(left, right)
    return similarity.reshape([-1,1])

# Build ensemble predictions
def make_pred(npy,model_list,batch_size=50000):
    """Average the predictions of ``model_list`` on the feature matrix ``npy``.

    Parameters
    ----------
    npy : numpy.ndarray
        Feature matrix with one row per sample; cast to float32 before every
        ``predict`` call.
    model_list : sequence
        Objects exposing ``predict(array)``. Each model contributes
        ``predict(...) / len(model_list)``.
    batch_size : int, default 50000
        Inputs with more rows than this are predicted in chunks of at most
        ``batch_size`` rows and accumulated into a 1-D float64 array. Smaller
        inputs are predicted in one call and the result keeps the type
        returned by the first model.

    Returns
    -------
    array
        The averaged prediction, one value per row of ``npy``.

    Raises
    ------
    ValueError
        If ``model_list`` is empty. The previous version failed with
        UnboundLocalError for small inputs and returned zeros for large ones.
    """
    n_models = len(model_list)
    if n_models == 0:
        raise ValueError("model_list must contain at least one model")

    len_npy = len(npy)
    if len_npy > batch_size:
        pred = np.zeros(len_npy)
        # range() never yields an empty trailing chunk. The previous
        # arithmetic called predict() on zero rows whenever len_npy was an
        # exact multiple of the batch size.
        for start in range(0, len_npy, batch_size):
            stop = start + batch_size
            chunk = npy[start:stop].astype(np.float32)
            for m in model_list:
                pred[start:stop] += m.predict(chunk) / n_models
    else:
        features = npy.astype(np.float32)
        pred = None
        for m in model_list:
            part = m.predict(features) / n_models
            if pred is None:
                pred = part
            else:
                pred += part
    return pred


def remove_low_rank_place(concat_df, concat_npy, second_stage_rank, first_stage_place_cols2num_dict):
    """Keep, for every ``id``, the rows whose prediction rank is within ``second_stage_rank``.

    ``concat_df`` gets ``id`` converted to category and a ``pred_rank`` column
    (rank of ``pred`` within each ``id``, highest first). The same row filter
    is applied to the frame and to ``concat_npy``, and the coordinate columns
    are copied from the filtered matrix back into the filtered frame.
    """
    concat_df["id"] = concat_df["id"].astype("category")
    concat_df["pred_rank"] = concat_df.groupby(by="id")["pred"].rank(ascending=False)

    keep = concat_df["pred_rank"] <= second_stage_rank
    kept_npy = concat_npy[keep]
    kept_df = concat_df[keep].reset_index(drop=True)

    for c in ("longitude","near_longitude","latitude","near_latitude"):
        kept_df[c] = kept_npy[:, first_stage_place_cols2num_dict[c]]
    return kept_df, kept_npy

In [8]:
# ================================
# functions first stage name
# ================================
def make_candidate_name_first_stage(country_df, country_name_emb, n_neighbors,columns):
    """Build candidate pairs from the nearest neighbours in name-embedding space.

    A cosine k-NN is fitted on ``country_name_emb`` with
    ``k = min(len(country_df), n_neighbors)``. For each neighbour rank a copy
    of ``country_df[columns]`` is produced with a ``rank`` column, the
    neighbour distance in ``d_near`` and one ``near_<c>`` column per entry of
    ``columns`` holding the neighbour's value. The copies are concatenated
    with a fresh index.
    """
    k = min(len(country_df), n_neighbors)
    knn = NearestNeighbors(n_neighbors=k,metric="cosine")
    knn.fit(country_name_emb)
    dists, nears = knn.kneighbors(country_name_emb)
    del knn  # release the index before building the candidate frames

    frames = []
    for rank in range(k):
        candidates = country_df[columns].copy()
        candidates["rank"] = rank
        candidates["d_near"] = dists[:, rank]
        neighbor_rows = nears[:, rank]
        for c in columns:
            candidates[f"near_{c}"] = candidates[c].values[neighbor_rows]
        frames.append(candidates)
    return pd.concat(frames).reset_index(drop=True)

def make_place_distance(concat_npy,first_stage_name_cols2num_dict):
    """Write the squared latitude/longitude difference into the ``distance`` column.

    ``concat_npy`` is modified in place and returned. The value is
    ``(latitude - near_latitude)**2 + (longitude - near_longitude)**2``
    (no square root is taken).
    """
    idx = first_stage_name_cols2num_dict
    lat_diff = concat_npy[:, idx["latitude"]] - concat_npy[:, idx["near_latitude"]]
    lon_diff = concat_npy[:, idx["longitude"]] - concat_npy[:, idx["near_longitude"]]
    concat_npy[:, idx["distance"]] = lat_diff**2 + lon_diff**2
    return concat_npy

def remove_low_rank_name(concat_df, concat_npy, second_stage_rank, first_stage_name_cols2num_dict):
    """Keep, for every ``id``, the rows whose prediction rank is within ``second_stage_rank``.

    ``concat_df`` gets ``id`` converted to category and a ``pred_rank`` column
    (rank of ``pred`` within each ``id``, highest first). The same row filter
    is applied to the frame and to ``concat_npy``, and the coordinate,
    ``rank`` and ``d_near`` columns are copied from the filtered matrix back
    into the filtered frame.
    """
    concat_df["id"] = concat_df["id"].astype("category")
    concat_df["pred_rank"] = concat_df.groupby(by="id")["pred"].rank(ascending=False)

    keep = concat_df["pred_rank"] <= second_stage_rank
    kept_npy = concat_npy[keep]
    kept_df = concat_df[keep].reset_index(drop=True)

    for c in ("longitude","near_longitude","latitude","near_latitude","rank","d_near"):
        kept_df[c] = kept_npy[:, first_stage_name_cols2num_dict[c]]
    return kept_df, kept_npy

In [9]:
## ================================
# functions second stage
# ================================
def move_features(concat_npy2, concat_df, move_features):
    """Copy columns of ``concat_df`` into the second-stage matrix, then drop them.

    Each column named in ``move_features`` is written as float32 into column
    ``second_stage_cols2num_dict[name]`` of ``concat_npy2`` (the dict is read
    from module scope). Both ``concat_npy2`` and ``concat_df`` are modified in
    place and returned.
    """
    for feature in move_features:
        values = concat_df[feature].values.astype(np.float32)
        concat_npy2[:, second_stage_cols2num_dict[feature]] = values
    concat_df.drop(columns = move_features,inplace=True)
    return concat_npy2, concat_df
        
def merge_raw_data(concat_df, country_df,use_cols):
    """Attach raw attributes for both sides of every candidate pair.

    ``country_df[use_cols]`` (which must include ``id``) is left-joined on
    ``id``; the same columns prefixed with ``near_`` are then left-joined on
    ``near_id``. Returns the merged frame.
    """
    raw = country_df[use_cols].copy()
    raw["id"] = raw["id"].astype("category")
    merged = concat_df.merge(raw, how="left", on="id")

    near_raw = raw.add_prefix("near_")
    merged = merged.merge(near_raw, how="left", on="near_id")

    del raw, near_raw
    gc.collect()
    return merged

def calc_distance_second_stage(c1_array,c2_array,col):
    """Return three string-similarity measures for each pair as an (n, 3) float32 array.

    Columns are, in order: ``difflib.SequenceMatcher`` ratio, Levenshtein
    distance and Jaro-Winkler similarity, computed on the values after
    ``text_preprocess``. If either value of a pair becomes the string "nan"
    the whole row is NaN. ``col`` is accepted but not used.
    """
    metrics = np.zeros([len(c1_array),3],dtype=np.float32)
    for row, (raw1, raw2) in enumerate(zip(c1_array,c2_array)):
        s1 = text_preprocess(raw1)
        s2 = text_preprocess(raw2)
        if s1 == "nan" or s2 == "nan":
            metrics[row,:] = np.array([np.nan,np.nan,np.nan])
            continue
        gesh = difflib.SequenceMatcher(None, s1, s2).ratio()
        leven = Levenshtein.distance(s1, s2)
        jaro = Levenshtein.jaro_winkler(s1, s2)
        metrics[row,:] = np.array([gesh, leven, jaro])
    return metrics

        
def make_distance_second_stage(train,concat_npy,distance_columns,second_stage_cols2num_dict):
    """Fill the string-similarity and squared-distance columns of the second-stage matrix.

    For every column ``c`` in ``distance_columns`` the similarities between
    ``train[c]`` and ``train["near_" + c]`` are written into the ``c_gesh``,
    ``c_leven`` and ``c_jaro`` columns of ``concat_npy``. The raw text columns
    are then dropped from the returned frame, except for categories, city,
    country and state. Finally ``distance`` is set to the squared
    latitude/longitude difference. Returns ``(train, concat_npy)``.
    """
    idx = second_stage_cols2num_dict
    keep_raw = ["categories","city","country","state"]
    for c in distance_columns:
        near_c = f"near_{c}"
        distance = calc_distance_second_stage(train[c].values,train[near_c].values,c)
        for pos, suffix in enumerate(("gesh","leven","jaro")):
            concat_npy[:, idx[f"{c}_{suffix}"]] = distance[:, pos].astype(np.float32)
        if c not in keep_raw:
            train = train.drop(columns = [c,near_c])
        del distance
        gc.collect()

    # squared positional distance
    lat_diff = concat_npy[:, idx["latitude"]] - concat_npy[:, idx["near_latitude"]]
    lon_diff = concat_npy[:, idx["longitude"]] - concat_npy[:, idx["near_longitude"]]
    concat_npy[:, idx["distance"]] = lat_diff**2 + lon_diff**2
    return train, concat_npy
    

def distance_agg(concat_npy,train,cols2num_dict):
    """Add per-id aggregates of pair features, and ratios to them, to ``concat_npy``.

    For the name/categories similarities and for ``d_near`` / ``distance`` the
    feature is grouped by ``id`` and its mean and extreme are computed
    (minimum for the ``leven``, ``d_near`` and ``distance`` features, maximum
    for ``gesh`` and ``jaro``). The aggregates are looked up through both
    ``id`` and ``near_id`` and stored in ``<f>_mean``, ``<f>_<ext>``,
    ``near_<f>_mean`` and ``near_<f>_<ext>``; the ``*_rate`` columns hold the
    feature divided by each aggregate. ``distance_rank`` is the rank of
    ``distance`` within each ``id``. ``concat_npy`` is modified in place and
    returned.
    """
    def write_group_stats(frame, feature, extreme):
        # Aggregate the feature per id, then store the aggregates looked up
        # via both id and near_id, followed by the value / aggregate ratios.
        grouped = frame.groupby(by="id")[feature]
        mean_by_id = grouped.mean().to_dict()
        extreme_by_id = getattr(grouped, extreme)().to_dict()
        targets = (
            (f"{feature}_mean", "id", mean_by_id),
            (f"{feature}_{extreme}", "id", extreme_by_id),
            (f"near_{feature}_mean", "near_id", mean_by_id),
            (f"near_{feature}_{extreme}", "near_id", extreme_by_id),
        )
        for stat_name, key_col, lookup in targets:
            concat_npy[:, cols2num_dict[stat_name]] = frame[key_col].map(lookup)
        for stat_name, _, _ in targets:
            concat_npy[:, cols2num_dict[f"{stat_name}_rate"]] = (
                concat_npy[:, cols2num_dict[feature]] / concat_npy[:, cols2num_dict[stat_name]]
            )

    for c in ["name","categories"]:
        for d in ["gesh","leven","jaro"]:
            feature = f"{c}_{d}"
            train[feature] = concat_npy[:, cols2num_dict[feature]]
            write_group_stats(train, feature, "min" if d == "leven" else "max")
            train = train.drop(feature,axis=1)

    for c in ["d_near","distance"]:
        train[c] = concat_npy[:, cols2num_dict[c]]
        write_group_stats(train, c, "min")
        if c == "distance":
            concat_npy[:, cols2num_dict["distance_rank"]] = train.groupby(by="id")[c].rank()
    return concat_npy

def make_cat_features(concat_npy,concat_df,cat_dict_list,cols2num_dict):
    """Label-encode the categorical columns of both sides of each pair.

    ``cat_dict_list`` holds one mapping per column, in the order categories,
    city, country, state. The encoded values of ``c`` and ``near_c`` are
    written into the ``c_label`` and ``near_c_label`` columns of
    ``concat_npy`` (values missing from the mapping become NaN) and the raw
    columns are dropped. Returns ``(concat_npy, concat_df)``.
    """
    cat_columns = ["categories","city","country","state"]
    for position, c in enumerate(cat_columns):
        mapping = cat_dict_list[position]
        near_c = f"near_{c}"
        concat_npy[:, cols2num_dict[f"{c}_label"]] = concat_df[c].map(mapping)
        concat_npy[:, cols2num_dict[f"near_{c}_label"]] = concat_df[near_c].map(mapping)
        concat_df = concat_df.drop(columns = [c,near_c])
        gc.collect()
    return concat_npy,concat_df


def concat_name_emb(concat_npy,concat_df,id2num_dict,bert_emb,cols2num_dict):
    """Write the 10-dimensional name embeddings of both sides into ``concat_npy``.

    For every row the embedding ``bert_emb[id2num_dict[id]]`` is stored in the
    columns ``name_emb_svd0`` .. ``name_emb_svd9`` and the embedding of
    ``near_id`` in ``near_name_emb_svd0`` .. ``near_name_emb_svd9``. Each run
    of columns is addressed as a slice, so it must be contiguous.
    ``concat_npy`` is modified in place and returned.
    """
    def gather(id_values):
        # embedding rows for the given ids
        rows = np.zeros([len(concat_npy),10],np.float32)
        for pos, place_id in enumerate(id_values):
            rows[pos,] = bert_emb[id2num_dict[place_id]]
        return rows

    for prefix, id_col in (("", "id"), ("near_", "near_id")):
        block = gather(concat_df[id_col].values)
        first = cols2num_dict[f"{prefix}name_emb_svd0"]
        last = cols2num_dict[f"{prefix}name_emb_svd9"]
        concat_npy[:, first:last + 1] = block
        del block
    return concat_npy


    



In [10]:
# ================================
# functions bert
# ================================
def text_preprocess_bert(text):
    """Convert ``text`` to ``str`` and lower-case it."""
    return str(text).lower()


class BertDataset(Dataset):
    """Dataset that tokenizes one text per item to a fixed length.

    Parameters
    ----------
    text : indexable
        Texts, indexed by integer position.
    tokenizer : callable
        Called with the text and ``max_length`` / ``padding`` / ``truncation``
        keyword arguments; must return ``input_ids``, ``attention_mask`` and
        ``token_type_ids``.
    max_len : int
        Length every item is padded or truncated to.
    preprocess : optional
        When truthy the text goes through ``text_preprocess_bert``; otherwise
        it is only converted with ``str``.
    """

    def __init__(self, text, tokenizer, max_len,preprocess=None):
        self.text = text
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.preprocess = preprocess

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return the three tokenizer outputs for one text as ``torch.long`` tensors."""
        raw = self.text[item]
        text = text_preprocess_bert(raw) if self.preprocess else str(raw)
        encoded = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )
        return {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
    
class bert_model(nn.Module):
    """Encoder wrapper that returns one L2-normalised vector per input text."""

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(BERT_MODEL)

    def forward(self, ids, mask):
        """Masked mean over the token states from position 1 onwards, then L2-normalise.

        Position 0 is skipped and padded positions are zeroed by ``mask``.
        The mean divides by the full (padded) length, which only rescales the
        vector and is therefore cancelled by the normalisation.
        """
        hidden = self.model(ids, attention_mask=mask)[0]
        token_states = hidden[:, 1:, :]
        token_mask = mask[:, 1:, None]
        pooled = (token_states * token_mask).mean(axis=1)
        return F.normalize(pooled)
    
def make_emb(model,train_loader,svd=None):
    """Embed every batch of ``train_loader`` with ``model`` and stack the results.

    Args:
        model: callable taking ``(input_ids, attention_mask)`` tensors on the
            module-level ``device`` and returning a 2-D tensor.
        train_loader: iterable of dict batches holding ``input_ids`` and
            ``attention_mask`` tensors.
        svd: optional fitted reducer; when given, ``svd.transform`` is applied
            to each batch of embeddings before it is stored.

    Returns:
        ``np.ndarray`` with one row per input, in loader order.

    Raises:
        ValueError: from ``np.concatenate`` when the loader yields no batch.
    """
    bert_emb = []
    with torch.no_grad():
        for batch in tqdm(train_loader,total=len(train_loader)):
            # Only ids and mask are consumed by the model; token_type_ids is
            # no longer read, so batches are not required to carry that key.
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            output = model(input_ids, mask)
            output = output.detach().cpu().numpy().astype(np.float32)
            if svd is not None:
                output = svd.transform(output)
            bert_emb.append(output)
    torch.cuda.empty_cache()
    return np.concatenate(bert_emb)

In [11]:
# ===============================
# third stage
# ===============================
# ===============================================================================
# Get manhattan distance
# ===============================================================================
def manhattan(lat1, long1, lat2, long2):
    """Return ``|lat2 - lat1| + |long2 - long1|`` (works element-wise on arrays)."""
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long2 - long1)
    return lat_gap + long_gap

# ===============================================================================
# Get haversine distance
# ===============================================================================
def vectorized_haversine(lats1, lats2, longs1, longs2):
    """Haversine central angle between two sets of points given in degrees.

    Note the argument order: both latitudes first, then both longitudes.
    ``radius`` is fixed to 1, so the result is the angle in radians rather
    than a distance in kilometres.
    """
    # radius = 6371
    radius = 1
    sin_half_dlat = np.sin(np.radians(lats2 - lats1)/2)
    sin_half_dlon = np.sin(np.radians(longs2 - longs1)/2)
    a = sin_half_dlat * sin_half_dlat + np.cos(np.radians(lats1)) \
        * np.cos(np.radians(lats2)) * sin_half_dlon * sin_half_dlon
    central_angle = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return radius * central_angle

def add_lat_lon_distance_features(df):
    """Add coordinate-difference and distance columns for each candidate pair.

    Reads ``latitude`` / ``longitude`` / ``near_latitude`` / ``near_longitude``
    and adds, in this order: ``latdiff``, ``londiff``, ``manhattan``,
    ``euclidean``, ``haversine``, ``x``, ``y``, ``z``, ``near_x``, ``near_y``,
    ``near_z`` and ``dot``. Every float64 column is then cast to float32.
    ``df`` is modified in place and returned.
    """
    lat1, lon1 = df['latitude'], df['longitude']
    lat2, lon2 = df['near_latitude'], df['near_longitude']

    df['latdiff'] = (lat1 - lat2)
    df['londiff'] = (lon1 - lon2)
    df['manhattan'] = manhattan(lat1, lon1, lat2, lon2)
    df['euclidean'] = (df['latdiff'] ** 2 + df['londiff'] ** 2) ** 0.5
    df['haversine'] = vectorized_haversine(lat1, lat2, lon1, lon2)

    # NOTE(review): y uses sin(lat)*cos(lon) and z uses sin(lon), which is not
    # the usual unit-sphere conversion. Kept as is because the downstream
    # models presumably were trained on exactly these values -- confirm before
    # changing.
    for prefix in ("", "near_"):
        lat_rad = np.radians(df[f"{prefix}latitude"])
        lon_rad = np.radians(df[f"{prefix}longitude"])
        df[f"{prefix}x"] = np.cos(lat_rad)*np.cos(lon_rad)
        df[f"{prefix}y"] = np.sin(lat_rad)*np.cos(lon_rad)
        df[f"{prefix}z"] = np.sin(lon_rad)
    df["dot"] = df["x"]*df["near_x"]+df["y"]*df["near_y"]+df["z"]*df["near_z"]

    # Halve the memory of every float64 column.
    float64_cols = list(df.dtypes[df.dtypes == np.float64].index)
    for col in float64_cols:
        df[col] = df[col].astype(np.float32)
    return df

def blocking_and_cat_pred(concat_df,concat_npy2,pred,country_df,thrid_stage_blocking,cat_model):
    """Drop low-scoring pairs, score the rest with CatBoost and re-attach raw text.

    Steps:
      1. Store ``pred`` on ``concat_df`` and keep only ``id``/``near_id``/``pred``
         (both done in place on the caller's frame).
      2. Copy the four coordinate columns out of ``concat_npy2`` using the
         module-level ``second_stage_cols2num_dict``.
      3. Keep rows with ``pred >= thrid_stage_blocking`` in both frame and matrix.
      4. Add ``cat_pred`` = ``cat_model.predict_proba(...)[:, 1]``.
      5. Left-join name/categories/address/city/state for both pair members
         from ``country_df`` and add the lat/lon distance features.

    Returns:
        ``(concat_df, concat_npy2)`` restricted to the surviving pairs.
    """
    concat_df["pred"] = pred
    keep_cols = ["id","near_id","pred"]
    concat_df.drop(columns=[c for c in concat_df.columns if c not in keep_cols],inplace=True)

    for coord in ("latitude","longitude","near_latitude","near_longitude"):
        concat_df[coord] = concat_npy2[:,second_stage_cols2num_dict[coord]]

    # Blocking: one mask applied to both containers keeps them row-aligned.
    passed = concat_df["pred"] >= thrid_stage_blocking
    concat_npy2 = concat_npy2[passed]
    concat_df = concat_df[passed].reset_index(drop=True)

    concat_df["cat_pred"] = cat_model.predict_proba(concat_npy2.astype(np.float32))[:,1]

    raw_cols = ["id","name","categories",'address','city','state']
    near_cols = [f"near_{c}" for c in raw_cols]
    concat_df = concat_df.merge(country_df[raw_cols],how="left",on="id")
    near_df = country_df.rename(columns=dict(zip(raw_cols, near_cols)))
    concat_df = concat_df.merge(near_df[near_cols],how="left",on="near_id")

    concat_df = add_lat_lon_distance_features(concat_df)
    return concat_df,concat_npy2


def move_and_sc_num_features(concat_df,concat_npy2,bert_num_cols1,second_stage_cols2num_dict,sc_dict):
    """Assemble and standardise the numeric inputs of the third-stage networks.

    For each name in ``bert_num_cols1`` the values come from ``concat_npy2``
    when the name is a key of ``second_stage_cols2num_dict``, otherwise from
    the ``concat_df`` column of that name. Infinities become NaN, the column is
    scaled as ``(x - sc_dict[name][0]) / sc_dict[name][1]`` and, at the end,
    all NaN are replaced via ``np.nan_to_num``.

    Returns:
        float32 array of shape ``(len(concat_npy2), len(bert_num_cols1))``.
    """
    scaled = np.zeros([len(concat_npy2),len(bert_num_cols1)]).astype(np.float32)
    for col_idx,col in enumerate(bert_num_cols1):
        if col in second_stage_cols2num_dict.keys():
            source = concat_npy2[:,second_stage_cols2num_dict[col]]
        else:
            source = concat_df[col].values.astype(np.float32)
        scaled[:,col_idx] = source
        column = scaled[:,col_idx]  # view into `scaled`
        # inf handling: +inf / -inf -> NaN
        column[np.isinf(column)] = np.nan
        # scaling
        scaled[:,col_idx] = (column - sc_dict[col][0]) / (sc_dict[col][1])
    # NaN handling
    return np.nan_to_num(scaled)


def token_sort(concat_df,tokenizer,concat_npy3):
    """Sort pairs by joint token count and reorder ``concat_npy3`` to match.

    Ordering by length keeps similarly sized pairs in the same batch. The
    input frame gains ``token_len`` and ``num_index`` columns in place; the
    returned frame keeps ``token_len`` only.
    """
    concat_df["token_len"] = [
        len(tokenizer.encode_plus(text, near_text,
                                  add_special_tokens=True,
                                  return_offsets_mapping=False)["input_ids"])
        for text, near_text in zip(concat_df["text"].values, concat_df["near_text"].values)
    ]
    # Remember the original row position so the matrix can follow the sort.
    concat_df["num_index"] = np.arange(len(concat_df))
    ordered = concat_df.sort_values(by="token_len").reset_index(drop=True)
    concat_npy3 = concat_npy3[ordered["num_index"].values,:]
    ordered.drop(columns = ["num_index"],inplace=True)
    return ordered,concat_npy3


class FourSquareDataset(Dataset):
    """Dataset of (text, near_text) pairs plus one numeric feature row per pair."""

    def __init__(self, text, near_text,num_features, tokenizer, max_len):
        self.text, self.near_text = text, near_text
        self.num_features = num_features
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Tokenize the pair as two segments padded to ``max_len`` and attach its features."""
        encoded = self.tokenizer(
            self.text[item],self.near_text[item],
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True
        )
        sample = {
            key: torch.tensor(encoded[key], dtype=torch.long)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        sample["num_feature"] = torch.tensor(self.num_features[item], dtype=torch.float32)
        return sample
    
class TransformerHead(nn.Module):
    """Transformer encoder followed by a per-position scalar projection.

    The per-position scores are right-padded with zeros to ``max_length`` so
    the output width (``out_features``) does not depend on the input length.
    """

    def __init__(self, in_features, max_length=128, num_layers=1, nhead=8):
        super().__init__()
        layer = nn.TransformerEncoderLayer(d_model=in_features, nhead=nhead)
        self.transformer = nn.TransformerEncoder(encoder_layer=layer, num_layers=num_layers)
        self.row_fc = nn.Linear(in_features, 1)
        self.out_features = max_length

    def forward(self, x):
        """Map ``x`` (..., length, in_features) to scores of width ``out_features``."""
        scores = self.row_fc(self.transformer(x)).squeeze(-1)
        pad_right = self.out_features - scores.shape[-1]
        return F.pad(scores, (0, pad_right), "constant", 0)

class FourSquare_model2(nn.Module):
    """Third-stage pair classifier built on ``THIRD_BERT_MODEL2``.

    The text representation (768-wide first-token state, or the output of a
    ``TransformerHead`` when ``head_type == "transformer"``) and a 90-wide
    numeric feature vector are each projected to 128 dimensions, concatenated
    and reduced to a single logit.
    """

    @staticmethod
    def _dense_block(in_dim, out_dim):
        """Linear -> LayerNorm -> ReLU -> Dropout(0.2)."""
        return nn.Sequential(
            nn.Linear(in_dim,out_dim),
            nn.LayerNorm(out_dim),
            nn.ReLU(),
            nn.Dropout(0.2))

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(THIRD_BERT_MODEL2)
        self.head_type = "linear"
        text_dim = 768
        if self.head_type == "transformer":
            self.transformer_head = TransformerHead(
                    in_features=768,
                    max_length=128,
                    num_layers=1,
                    nhead=8,
                )
            text_dim = self.transformer_head.out_features
        self.ln1 = nn.LayerNorm(text_dim)
        self.linear1 = self._dense_block(text_dim, 128)
        self.linear2 = self._dense_block(90, 128)
        self.linear3 = nn.Sequential(
            nn.Linear(128 + 128,64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64,1),
           )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Return one un-activated logit per pair."""
        hidden = self.model(ids, attention_mask=mask,token_type_ids=token_type_ids)['last_hidden_state']
        if self.head_type == "transformer":
            text_feat = self.transformer_head(hidden)
        else:
            # First-token state as the sequence summary.
            text_feat = hidden[:,0,:]
        text_feat = self.linear1(self.ln1(text_feat))
        num_feat = self.linear2(num_features)
        fused = torch.cat([text_feat,num_feat],axis=-1)
        return self.linear3(fused)
    
    


    
class FourSquare_model(nn.Module):
    """Third-stage pair classifier built on ``THIRD_BERT_MODEL1``.

    The 1024-wide first-token state and a 76-wide numeric feature vector are
    each projected to 128 dimensions, concatenated and reduced to one logit.
    """

    @staticmethod
    def _dense_block(in_dim, out_dim):
        """Linear -> LayerNorm -> ReLU -> Dropout(0.2)."""
        return nn.Sequential(
            nn.Linear(in_dim,out_dim),
            nn.LayerNorm(out_dim),
            nn.ReLU(),
            nn.Dropout(0.2))

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(THIRD_BERT_MODEL1)
        self.ln1 = nn.LayerNorm(1024)
        self.linear1 = self._dense_block(1024, 128)
        self.linear2 = self._dense_block(76, 128)
        self.linear3 = nn.Sequential(
            nn.Linear(128 + 128,64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64,1),
           )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Return one un-activated logit per pair."""
        encoded = self.model(ids, attention_mask=mask,token_type_ids=token_type_ids)
        # First-token state as the sequence summary.
        text_feat = encoded['last_hidden_state'][:,0,:]
        text_feat = self.linear1(self.ln1(text_feat))
        num_feat = self.linear2(num_features)
        fused = torch.cat([text_feat,num_feat],axis=-1)
        return self.linear3(fused)
    
    
def collate(d):
    """Trim a padded batch to the longest real sequence it contains.

    The three token tensors are cut to the largest per-row sum of
    ``attention_mask``; ``num_feature`` is passed through untouched.
    """
    mask_len = int(d["attention_mask"].sum(axis=1).max())
    trimmed = {
        key: d[key][:,:mask_len]
        for key in ("input_ids", "attention_mask", "token_type_ids")
    }
    trimmed["num_feature"] = d["num_feature"]
    return trimmed


def _third_stage_predict(model, test_loader, use_amp):
    """Shared inference loop behind ``make_thrid_pred_amp`` / ``make_thrid_pred``.

    Each batch is trimmed with ``collate``, moved to the module-level
    ``device`` and run through ``model`` under ``torch.no_grad()``. When
    ``use_amp`` is true the forward pass runs inside ``autocast()``. The
    sigmoid of the logits is collected on the CPU.

    Returns:
        ``np.ndarray`` of probabilities stacked along axis 0, in loader order.
    """
    from contextlib import nullcontext

    test_preds = []
    with torch.no_grad():
        for d in tqdm(test_loader,total=len(test_loader)):
            d = collate(d)
            ids = d["input_ids"].to(device)
            mask = d['attention_mask'].to(device)
            token_type_ids = d["token_type_ids"].to(device)
            num_features = d["num_feature"].to(device)
            # Only the forward pass differs between the two public variants.
            with (autocast() if use_amp else nullcontext()):
                outputs = model(ids,mask,token_type_ids,num_features)
            test_preds.append(outputs.sigmoid().detach().cpu().numpy())
    torch.cuda.empty_cache()
    return np.concatenate(test_preds,axis=0)


def make_thrid_pred_amp(model,test_loader):
    """Predict pair probabilities with the forward pass under ``autocast()``."""
    return _third_stage_predict(model, test_loader, use_amp=True)


def make_thrid_pred(model,test_loader):
    """Predict pair probabilities with the forward pass in the model's own precision."""
    return _third_stage_predict(model, test_loader, use_amp=False)

def pp(concat_df,pred,ids=None):
    """Turn scored pairs into a symmetric match list that includes self-matches.

    Args:
        concat_df: frame with ``id``, ``near_id`` and the score column.
        pred: name of the score column; pairs with a score ``> 0.5`` are kept.
        ids: ids that must each appear as a self-match (``id == near_id``).
            When omitted, the notebook-level global ``id_unique`` is used,
            which is what existing two-argument callers rely on.

    Returns:
        DataFrame with columns ``id``, ``near_id``, ``pred``. It holds every
        accepted pair in both directions (duplicates removed) followed by one
        self-pair per id; ``pred`` is NaN on the self-pair rows.
    """
    if ids is None:
        # Backward-compatible fallback to the per-country global.
        ids = id_unique
    sub_ = pd.DataFrame()
    sub_["id"] = concat_df["id"].values
    sub_["near_id"] = concat_df["near_id"].values
    # Positional copy, like the two columns above: assigning the Series itself
    # would align on the index and yield NaN for any non-default index.
    sub_["pred"] = concat_df[pred].values
    sub_ = sub_[sub_["pred"] > 0.5].reset_index(drop=True)
    # PP
    # Add the mirrored pairs (id and near_id swapped).
    mirrored = sub_.copy()
    mirrored.columns = ["near_id","id","pred"]
    sub_ = pd.concat([sub_,mirrored]).reset_index(drop=True)
    sub_ = sub_.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
    # Append the self-matches (id == near_id).
    self_pairs = pd.DataFrame()
    self_pairs["id"] = ids
    self_pairs["near_id"] = ids
    sub_ = pd.concat([sub_,self_pairs]).reset_index(drop=True)
    del self_pairs,mirrored
    gc.collect()
    return sub_

In [12]:
# =================================
# 4th stage
# =================================
class FourSquare_model3(nn.Module):
    """Fourth-stage pair classifier built on ``FOURTH_BERT_MODEL1``.

    The 1024-wide first-token state is projected to 128 dimensions and a
    2-wide numeric feature vector to 32; both are concatenated and reduced to
    a single logit.
    """

    @staticmethod
    def _dense_block(in_dim, out_dim):
        """Linear -> LayerNorm -> ReLU -> Dropout(0.2)."""
        return nn.Sequential(
            nn.Linear(in_dim,out_dim),
            nn.LayerNorm(out_dim),
            nn.ReLU(),
            nn.Dropout(0.2))

    def __init__(self):
        super().__init__()
        self.model = AutoModel.from_pretrained(FOURTH_BERT_MODEL1)
        self.ln1 = nn.LayerNorm(1024)
        self.linear1 = self._dense_block(1024, 128)
        self.linear2 = self._dense_block(2, 32)
        self.linear3 = nn.Sequential(
            nn.Linear(128 + 32,64),
            nn.LayerNorm(64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64,1),
           )

    def forward(self, ids, mask, token_type_ids,num_features):
        """Return one un-activated logit per pair."""
        encoded = self.model(ids, attention_mask=mask,token_type_ids=token_type_ids)
        # First-token state as the sequence summary.
        text_feat = encoded['last_hidden_state'][:,0,:]
        text_feat = self.linear1(self.ln1(text_feat))
        num_feat = self.linear2(num_features)
        fused = torch.cat([text_feat,num_feat],axis=-1)
        return self.linear3(fused)
    
def id_sort(id_near_id):
    """Return the ``-``-separated parts of ``id_near_id`` in sorted order, re-joined with ``-``."""
    parts = id_near_id.split("-")
    parts.sort()
    return "-".join(parts)

def make_pair_4th(sub_3,sub_pair):
    """Build the candidate pairs for the 4th stage from merged match groups.

    Every row of ``sub_3`` (``id`` plus a space-separated ``matches`` string)
    is expanded to one ``(id, near_id)`` row per match. Pairs that already
    exist in ``sub_pair`` are dropped, then pairs that differ only in
    direction are reduced to their first occurrence.

    Side effect: ``sub_pair`` gains an ``id_near_id`` column in place.

    Returns:
        DataFrame with columns ``near_id``, ``id``, ``id_near_id`` and
        ``id_near_id_sort`` (empty when ``sub_3`` has no rows).
    """
    # Accumulate flat lists and build the frame once, instead of creating and
    # concatenating one small DataFrame per input row.
    near_ids = []
    src_ids = []
    for id_, match_ in tqdm(zip(sub_3["id"].values, sub_3["matches"].values), total=len(sub_3)):
        members = match_.split(" ")
        near_ids.extend(members)
        src_ids.extend([id_] * len(members))
    new_pair = pd.DataFrame({"near_id": near_ids, "id": src_ids})

    sub_pair["id_near_id"] = sub_pair["id"].astype(str) + "-" + sub_pair["near_id"].astype(str)
    new_pair["id_near_id"] = new_pair["id"].astype(str) + "-" + new_pair["near_id"].astype(str)
    # Remove the pairs already produced by the stages up to the 3rd.
    new_pair = new_pair[~new_pair["id_near_id"].isin(sub_pair["id_near_id"])].reset_index(drop=True)
    # Remove duplicates: "a-b" and "b-a" share the same sorted key.
    new_pair["id_near_id_sort"] = new_pair["id_near_id"].map(id_sort)
    new_pair = new_pair.drop_duplicates(subset = "id_near_id_sort").reset_index(drop=True)
    return new_pair
    
def merge_raw_data_4th(new_pair,test):
    """Left-join the raw attributes of both pair members onto ``new_pair``.

    Attributes of ``id`` keep their names; attributes of ``near_id`` get a
    ``near_`` prefix.
    """
    use_cols = ["id","name","categories",'latitude', 'longitude','address','city','state']
    own_side = test[use_cols].copy()
    near_side = own_side.rename(columns={c: f"near_{c}" for c in use_cols})
    with_own = new_pair.merge(own_side,how="left",on="id")
    return with_own.merge(near_side,how="left",on="near_id")

def token_sort_4th(new_pair,tokenizer):
    """Add ``token_len`` (joint token count of text + near_text) and sort by it.

    The input frame gains the ``token_len`` column in place; the sorted frame
    with a fresh index is returned.
    """
    new_pair["token_len"] = [
        len(tokenizer.encode_plus(text, near_text,
                                  add_special_tokens=True,
                                  return_offsets_mapping=False)["input_ids"])
        for text, near_text in zip(new_pair["text"].values, new_pair["near_text"].values)
    ]
    return new_pair.sort_values(by="token_len").reset_index(drop=True)

def blocking_4th(new_pair,th):
    """Keep pairs with ``pred > th`` and return them in both directions.

    Returns a frame with columns ``id`` and ``near_id``: the kept pairs first,
    then the same pairs with the two columns swapped.
    """
    kept = new_pair.loc[new_pair["pred"] > th, ["id","near_id"]].reset_index(drop=True)
    mirrored = kept.rename(columns={"id": "near_id", "near_id": "id"})
    return pd.concat([kept,mirrored]).reset_index(drop=True)

In [13]:
# ================================
# Main
# ================================
# In DEBUG mode the pipeline runs on the rows of the fold file whose "set"
# column equals 0; otherwise it runs on the competition test file.
if not DEBUG:
    test = pd.read_csv(TEST_PATH)
else:
    test = pd.read_csv(TRAIN_PATH)
    test = test.loc[test["set"] == 0].reset_index(drop=True)

sub = pd.read_csv(SUB_PATH)

In [14]:
# ================================
# model load
# ================================
# LightGBM text dumps are loaded into cuML's ForestInference; every stage uses
# the same loader options.
fil_load_kwargs = dict(output_class=False, model_type="lightgbm")

first_stage_place_lgb = [ForestInference.load(path, **fil_load_kwargs)
                         for path in first_stage_place_lgb_path]

first_stage_name_lgb = [ForestInference.load(path, **fil_load_kwargs)
                        for path in first_stage_name_lgb_path]

second_stage_lgb = [ForestInference.load(path, **fil_load_kwargs)
                    for path in second_stage_lgb_path]

# Only the first CatBoost path is used.
third_stage_cat = CatBoostClassifier()
third_stage_cat.load_model(third_stage_cat_path[0])

[W] [11:47:02.436184] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [11:47:02.805842] Casting all thresholds and leaf values to float32, as FIL currently doesn't support inferencing models with float64 values. This may lead to predictions with reduced accuracy.
[W] [11:47:02.819376] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [11:47:03.058562] Casting all thresholds and leaf values to float32, as FIL currently doesn't support inferencing models with float64 values. This may lead to predictions with reduced accuracy.
[W] [11:47:03.070476] Treelite currently does not support float64 model parameters. Accuracy may degrade slightly relative to native LightGBM invocation.
[W] [11:47:03.727153] Casting all thresholds and leaf values to float32, as FIL currently doesn't support inferencing models with float64 valu

<catboost.core.CatBoostClassifier at 0x7f1e9160d7d0>

In [15]:
# ================
# load the feature-engineering artifacts
# ================
def _read_pickle(path):
    """Return the object unpickled from ``path``."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)


categories_dict = _read_pickle(fe045_categories_path)
city_dict = _read_pickle(fe045_city_path)
country_dict = _read_pickle(fe045_country_path)
state_dict = _read_pickle(fe045_state_path)

# Order matters: categories, city, country, state.
cat_dict_list = [categories_dict,
                 city_dict,
                 country_dict,
                 state_dict]

# ================
# fe46
# ================
svd = _read_pickle(fe046_svd_path)

In [16]:
# Lower-cased string version of the place name, used as the key for the
# name-embedding lookup built in the following cells.
test["name_bert"] = test["name"].astype(str).str.lower()

In [17]:

tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)

# Each distinct lower-cased name is embedded once; ids point at the row of
# their name in that de-duplicated list.
name = test["name_bert"].unique()
name2num_dict = {unique_name: row for row, unique_name in enumerate(name)}

id2num_dict = {
    place_id: name2num_dict[place_name]
    for place_id, place_name in zip(test["id"].values, test["name_bert"].values)
}

test = test.drop(columns = ["name_bert"])
del name2num_dict
gc.collect()

# Names are already lower-cased, so no preprocess flag is passed.
train_ = BertDataset(name, tokenizer, MAX_LEN)
train_loader = DataLoader(dataset=train_, batch_size=BS, shuffle=False)

In [18]:
model = bert_model().to(device)
model.eval()
# DEBUG reuses a stored embedding matrix; a real run embeds every unique name
# and reduces it with the fitted SVD.
if not DEBUG:
    bert_emb = make_emb(model,train_loader,svd=svd)
else:
    bert_emb = np.load("../input/exp038-ex059-sub-fold2-all-distance-miss-gb-debug/bert_emb.npy")
gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]

442

In [19]:
sub_list, pair_list = [], []
# A DEBUG run may be restricted to the countries in DEBUG_COUNTRY; "ALL" as the
# first entry (or a non-DEBUG run) processes every country, most frequent first.
if DEBUG and DEBUG_COUNTRY[0] != "ALL":
    country_unique = DEBUG_COUNTRY
else:
    country_unique = test["country"].value_counts().index

In [20]:
# The third-stage networks are created lazily inside the country loop, so only
# their tokenizers are loaded here.
third_model = third_model2 = None
third_tokenizer = AutoTokenizer.from_pretrained(THIRD_BERT_MODEL1)
third_tokenizer2 = AutoTokenizer.from_pretrained(THIRD_BERT_MODEL2)

In [21]:
# Per-country pipeline. For every country with at least two rows:
#   1st stage  - two candidate sets ("place" and "name"), each scored by its
#                own LightGBM model and pruned by rank
#   2nd stage  - richer pair features scored by a second LightGBM model
#   3rd stage  - blocking on the 2nd-stage score, CatBoost + two transformer
#                classifiers, blended with weights w1..w4
# Results are appended to `sub_list` (id -> matches) and `pair_list` (pairs).
# NOTE: `id_unique` set here is read as a global by pp().
# NOTE: several inner loops below reuse the name `n`; the country index is
# only used by the print at the top of the branch, so this is harmless.
for n,country in enumerate(country_unique):
    country_df = test[test["country"] == country].reset_index(drop=True)
    id_unique = country_df["id"].unique()
    # feature engineering -> pred
    if len(country_df ) >= 2:
        # ==================================
        # first stage
        # ==================================
        print(f"{n},{country}")
        # place
        first_stage_columns = ['id','name','categories','latitude','longitude']
        concat_df = make_candidate_first_stage(country_df, 
                                               n_neighbors_first_stage,
                                               first_stage_columns)
        
        concat_df = delete_match_id(concat_df)
        
        # Feature matrix for the place model; columns are addressed through
        # first_stage_place_cols2num_dict.
        concat_npy = np.zeros([len(concat_df),
                               len(first_stage_place_features)],
                              dtype=np.float32)
        cols =  ['latitude','longitude','rank','d_near','near_latitude', 'near_longitude']
        concat_npy, concat_df = df2numpy(concat_npy,
                                         concat_df,cols,
                                         first_stage_place_cols2num_dict)
        
        gc.collect()
        
       
        distance_columns = ['name','categories']
       

      
        # Only the first column of the returned distances is stored, as
        # "<column>_jaro".
        for c in distance_columns:
            distance = calc_distance_first_stage(concat_df[c].values,
                                                 concat_df[f"near_{c}"].values)
            for n,f_c in enumerate([f"{c}_jaro"]):
                concat_npy[:,first_stage_place_cols2num_dict[f_c]] = distance[:,n].astype(np.float32)
            del distance
            gc.collect()  
        
      
        pred = make_pred(concat_npy,first_stage_place_lgb )
                
        concat_df["pred"] = pred
        # Debug dump of the raw first-stage place candidates.
        if (DEBUG) & (DEBUG_COUNTRY[0] != "ALL"):
            concat_df.to_csv(f"1st_stage_place_{country}.csv",index=False)
            np.save(f"1st_stage_place_npy_{country}.npy",concat_npy)
        concat_df, concat_npy = remove_low_rank_place(concat_df, 
                                                concat_npy, 
                                                second_stage_rank,
                                                first_stage_place_cols2num_dict)
        del pred,concat_npy
        gc.collect()
        
        # ====================
        # name emb
        # ====================
        # Un-reduced (svd=None) embeddings of this country's names, with the
        # dataset's preprocess flag switched on.
        train_ = BertDataset(country_df["name"], tokenizer, MAX_LEN,True)
        train_loader = DataLoader(
                dataset=train_, batch_size=BS, shuffle=False)
        country_name_emb = make_emb(model,train_loader,svd=None)
        
        first_stage_columns = ['id','name','latitude','longitude']
        concat_name_df = make_candidate_name_first_stage(country_df, 
                                               country_name_emb,
                                               n_neighbors_first_stage,
                                               first_stage_columns)
        del country_name_emb
        gc.collect()
     
        concat_name_df = delete_match_id(concat_name_df)
        
      
        concat_name_npy = np.zeros([len(concat_name_df),
                               len(first_stage_name_features)],
                              dtype=np.float32)
        cols =  ['latitude','longitude','rank','d_near','near_latitude', 'near_longitude']
        concat_name_npy, concat_name_df = df2numpy(concat_name_npy,
                                                   concat_name_df,
                                                   cols,
                                                   first_stage_name_cols2num_dict)
        gc.collect()
        
     
        distance_columns = ['name']
      
    
        for c in distance_columns:
            distance = calc_distance_first_stage(concat_name_df[c].values,
                                                 concat_name_df[f"near_{c}"].values)
            for n,f_c in enumerate([f"{c}_jaro"]):
                concat_name_npy[:,first_stage_name_cols2num_dict[f_c]] = distance[:,n].astype(np.float32)
            del distance
            gc.collect() 


        concat_name_npy = make_place_distance(concat_name_npy,first_stage_name_cols2num_dict)
        
     
        pred = make_pred(concat_name_npy,first_stage_name_lgb )
        
        concat_name_df["pred"] = pred
        # Debug dump of the raw first-stage name candidates.
        if (DEBUG) & (DEBUG_COUNTRY[0] != "ALL"):
            concat_name_df.to_csv(f"1st_stage_name_{country}.csv",index=False)
            np.save(f"1st_stage_name_npy_{country}.npy",concat_name_npy)
        #np.save("1st_stage")
        concat_name_df, concat_name_npy = remove_low_rank_name(concat_name_df, 
                                                concat_name_npy, 
                                                second_stage_rank,
                                                first_stage_name_cols2num_dict)
        del pred,concat_name_npy
        # remove
        # Drop the category columns from the place candidates, then stack both
        # candidate sets and keep the first occurrence of each (id, near_id);
        # name candidates come first in the concat.
        remove_cols = ["categories","near_categories"]
        concat_df.drop(columns = remove_cols,inplace=True)
        concat_df = pd.concat([concat_name_df,concat_df]).reset_index(drop=True)
        del concat_name_df
        concat_df = concat_df.drop_duplicates(subset=["id","near_id"]).reset_index(drop=True)
        
        
        
        # ==================================
        # second stage
        # ==================================
        concat_npy2 = np.zeros([len(concat_df),len(second_stage_features)],dtype=np.float32)
      
        features = ['latitude','longitude','rank','d_near','near_latitude', 'near_longitude']
        concat_npy2,concat_df = move_features(concat_npy2, concat_df, features)
        gc.collect()
        # merge
        concat_df["near_id"] = concat_df["near_id"].astype("category")
        use_cols = ["id","address","city","state","zip","country","url","phone",'categories']
        concat_df = merge_raw_data(concat_df, country_df,use_cols)
        # From here on country_df only keeps the text columns needed by the
        # third stage (in-place drop).
        remain_cols = ["id","name","categories",'address', 'city', 'state']
        country_df.drop(columns = [c for c in country_df.columns if c not in remain_cols],inplace=True)
        gc.collect()
        
        distance_columns = ['name', 'address', 'city', 'state',
           'zip', 'url', 'phone', 'categories']
        concat_df, concat_npy2 = make_distance_second_stage(concat_df,
                                                           concat_npy2,
                                                           distance_columns,
                                                           second_stage_cols2num_dict)
  
        concat_npy2 = distance_agg(concat_npy2,concat_df,second_stage_cols2num_dict)
        # NOTE: cat_cols is assigned but not passed to make_cat_features.
        cat_cols = ["categories","city","country","state"]
        concat_npy2, concat_df = make_cat_features(concat_npy2,concat_df,cat_dict_list,second_stage_cols2num_dict)
        concat_npy2 = concat_name_emb(concat_npy2,concat_df,id2num_dict,bert_emb,second_stage_cols2num_dict)
        
        gc.collect() 
        # predict
        pred = make_pred(concat_npy2,second_stage_lgb)
        if (DEBUG) & (DEBUG_COUNTRY[0] != "ALL"):
            #sub_.to_csv(f"second_{country}_sub.csv",index=False)
            np.save(f"second_{country}.npy",concat_npy2)
            concat_df.to_csv(f"second_{country}.csv",index=False)
            
        # ==================================
        # third stage
        # ==================================
        concat_df,concat_npy2 = blocking_and_cat_pred(concat_df,concat_npy2,pred,country_df,thrid_stage_blocking,third_stage_cat)
        concat_npy3 = move_and_sc_num_features(concat_df,concat_npy2,bert_num_cols1,second_stage_cols2num_dict,sc_dict)
                                 
        del pred,concat_npy2,country_df
        # Text inputs of the transformer classifiers.
        # NOTE(review): there is no separator between categories and address
        # (the other fields are joined with " "). Presumably this matches the
        # text format used at training time -- confirm before changing.
        concat_df["text"] = concat_df["name"].astype(str).str.lower() + " " + concat_df["categories"].astype(str).str.lower()+\
                            concat_df['address'].astype(str).str.lower() + " " + concat_df['city'].astype(str).str.lower() + " " + concat_df['state'].astype(str).str.lower() 
        concat_df["near_text"] = concat_df["near_name"].astype(str).str.lower() + " " + concat_df["near_categories"].astype(str).str.lower()+\
                             concat_df['near_address'].astype(str).str.lower() + " " + concat_df['near_city'].astype(str).str.lower() + " " + concat_df['near_state'].astype(str).str.lower()
        concat_df,concat_npy3 = token_sort(concat_df,third_tokenizer,concat_npy3)  
        
        # Keep only the required features: the first model reads the
        # bert_num_cols2 subset of the bert_num_cols1 columns.
        cols2num_bert_cols1 = {}
        for n,c in enumerate(bert_num_cols1):
            cols2num_bert_cols1[c] = n
        use_cols_index = []
        for i in bert_num_cols2:
            use_cols_index.append(cols2num_bert_cols1[i])
            
            
        test_ = FourSquareDataset(concat_df["text"].values,
                                  concat_df["near_text"].values,
                                  concat_npy3[:,use_cols_index],
                                  third_tokenizer, THIRD_MAX_LEN)
        
            
        # The second model receives every bert_num_cols1 column.
        test2_ = FourSquareDataset(concat_df["text"].values,
                                  concat_df["near_text"].values,
                                  concat_npy3,
                                  third_tokenizer2, THIRD_MAX_LEN2)
        
        test_loader = DataLoader(
            dataset=test_, batch_size=THIRD_BS, shuffle=False)
        test_loader2 = DataLoader(
            dataset=test2_, batch_size=THIRD_BS2, shuffle=False)
       
        # Both networks are built and loaded once, on the first country that
        # reaches this point, and reused afterwards.
        if third_model is None:
            third_model =  FourSquare_model()
            third_model.load_state_dict(torch.load(third_stage_model1_path))
            third_model = third_model.to(device)
            third_model.eval()
        pred1 = make_thrid_pred_amp(third_model,test_loader)
        
        if third_model2 is None:
            third_model2 =  FourSquare_model2()
            third_model2.load_state_dict(torch.load(third_stage_model2_path))
            third_model2 = third_model2.to(device)
            third_model2.eval() 
        pred2 = make_thrid_pred(third_model2,test_loader2)
        
        # Weighted blend of: 2nd-stage score, model 1, model 2, CatBoost.
        concat_df["pred"] = concat_df["pred"]*w1 + pred1.reshape(-1)*w2 + pred2.reshape(-1)*w3 + concat_df["cat_pred"]*w4
        gc.collect()
        #print(pred)
        sub_ = pp(concat_df, "pred")
        # NOTE: this debug dump overwrites the second_{country}.csv written
        # after the second stage.
        if (DEBUG) & (DEBUG_COUNTRY[0] != "ALL"):
            sub_.to_csv(f"second_{country}_sub.csv",index=False)
            concat_df.to_csv(f"second_{country}.csv",index=False)
        # NOTE: test2_ / test_loader2 are not deleted here; they are rebound on
        # the next country that reaches the third stage.
        del concat_df,concat_npy3,test_,test_loader
        gc.collect()
        # make sub
        pair_list.append(sub_[["id","near_id"]])
        # `join` is a helper defined elsewhere in the notebook.
        sub_ = sub_.groupby("id")["near_id"].apply(join)
        sub_ = sub_.reset_index()
        sub_.columns = ["id","matches"]
        
        sub_list.append(sub_)
    else:
        # Countries with fewer than two rows: each id only matches itself.
        sub_ = pd.DataFrame()
        sub_["id"] = id_unique
        sub_["matches"] = id_unique
        sub_list.append(sub_)

0,ID


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [22]:
# Drop the references to the third-stage networks and the name encoder.
del third_model
del third_model2
del model

In [23]:
sub = pd.concat(sub_list).reset_index(drop=True)
# pair_list only receives entries from countries with >= 2 rows; when there is
# none, pd.concat would raise on the empty list, so fall back to an empty frame
# with the expected columns.
if pair_list:
    sub_pair = pd.concat(pair_list).reset_index(drop=True)
else:
    sub_pair = pd.DataFrame(columns=["id","near_id"])

In [24]:
# ===================================================
# PP
# ===================================================
# For ids with more than 3 predicted matches: find every other such id whose
# match list shares at least half of its entries with the key's list (measured
# against either list), and overwrite all of them with the union of their lists.
sub["matches_len"] = sub["matches"].map(lambda x:len(x.split(" ")))
sub_3 = sub[sub["matches_len"] > 3].reset_index(drop=True)
# Work on a copy: the loop writes into this array in place, and it is assigned
# back to sub_3 once at the end.
near_id_value = sub_3["matches"].values.copy()
id_list = sub_3["id"].values

# Per-row token caches so each match string is split once instead of once per
# key. They are updated below whenever a row of near_id_value is rewritten, so
# later keys see the merged lists exactly as they would by re-splitting.
token_lists = [m.split(" ") for m in near_id_value]
token_sets = [set(t) for t in token_lists]
token_counts = np.array([len(t) for t in token_lists], dtype=np.int64)

key_id_list = []      # ids already merged into a group, in merge order
merged_ids = set()    # same ids, for O(1) membership tests
for n,key_id in tqdm(enumerate(id_list),total=len(id_list)):
    if key_id in merged_ids:
        continue
    key_tokens = token_sets[n]
    common_len = np.array([len(key_tokens & other) for other in token_sets],
                          dtype=np.int64)
    id_rate = common_len / token_counts[n]
    near_id_rate = common_len / token_counts
    related = (common_len != 0) & ((id_rate >= 0.5) | (near_id_rate >= 0.5))
    # The key always matches itself, so a real group needs more than one row.
    if related.sum() <= 1:
        continue
    related_ids = id_list[related]
    key_id_list.extend(related_ids)
    merged_ids.update(related_ids)

    rows = np.flatnonzero(sub_3["id"].isin(related_ids).values)
    all_id_concat = []
    for i in rows:
        all_id_concat += token_lists[i]
    all_id_unique = list(set(all_id_concat))
    merged_matches = " ".join(all_id_unique)
    merged_set = set(all_id_unique)
    for i in rows:
        near_id_value[i] = merged_matches
        token_lists[i] = all_id_unique
        token_sets[i] = merged_set
        token_counts[i] = len(all_id_unique)
sub_3["matches"] = near_id_value
sub_under_3 = sub[sub["matches_len"] <= 3].reset_index(drop=True)

0it [00:00, ?it/s]

In [25]:
# ===============================================================
# 4th stage
# ===============================================================
# When the PP cell produced merged groups (sub_3), build extra candidate pairs
# from them, score the pairs with the fourth-stage model, and append the ones
# kept by blocking_4th to sub_pair. Afterwards sub_pair is collapsed to one row
# per id and ids without any pair fall back to matching only themselves.
if len(sub_3) > 0:
    fourth_tokenizer = AutoTokenizer.from_pretrained(FOURTH_BERT_MODEL1)
    new_pair = make_pair_4th(sub_3,sub_pair)
    new_pair = merge_raw_data_4th(new_pair,test)
    # NOTE(review): there is no " " between categories and address in either
    # text field. Left untouched because the string presumably has to match the
    # format the fourth-stage model was trained on -- confirm before changing.
    new_pair["text"] = new_pair["name"].astype(str).str.lower() + " " + new_pair["categories"].astype(str).str.lower()+\
                            new_pair['address'].astype(str).str.lower() + " " + new_pair['city'].astype(str).str.lower() + " " + new_pair['state'].astype(str).str.lower() 
    new_pair["near_text"] = new_pair["near_name"].astype(str).str.lower() + " " + new_pair["near_categories"].astype(str).str.lower()+\
                             new_pair['near_address'].astype(str).str.lower() + " " + new_pair['near_city'].astype(str).str.lower() + " " + new_pair['near_state'].astype(str).str.lower()

    new_pair = token_sort_4th(new_pair,fourth_tokenizer)
    # Standardise the coordinates: subtract sc_dict[c][0], divide by sc_dict[c][1].
    for c in ["latitude","longitude"]:
        new_pair[c] = (new_pair[c] - sc_dict[c][0]) / (sc_dict[c][1])

    num_features = new_pair[["latitude","longitude"]].values
    test_ = FourSquareDataset(new_pair["text"].values,
                              new_pair["near_text"].values,
                              num_features,
                              fourth_tokenizer, FOURTH_MAX_LEN)
    test_loader = DataLoader(
            dataset=test_, batch_size=FOURTH_BS, shuffle=False)
    fourth_model =  FourSquare_model3()
    fourth_model.load_state_dict(torch.load(fourth_stage_model1_path))
    fourth_model = fourth_model.to(device)
    fourth_model.eval()

    pred_pp = make_thrid_pred_amp(fourth_model,test_loader)
    new_pair["pred"] = pred_pp.reshape(-1)
    new_pair = blocking_4th(new_pair,fourth_stage_blocking)
    sub_pair = pd.concat([sub_pair,new_pair]).reset_index(drop=True) 

# Collapse the pair table into one row per id with space-separated matches
# (done once here instead of separately in each branch).
sub_pair = sub_pair.groupby("id")["near_id"].apply(join)
sub_pair = sub_pair.reset_index()
sub_pair.columns = ["id","matches"]

if DEBUG:
    # Score the predictions against the labelled fold file.
    train_raw = pd.read_csv(TRAIN_PATH)
    id2poi = get_id2poi(train_raw[["id","point_of_interest"]])
    poi2ids = get_poi2ids(train_raw[["id","point_of_interest"]])
    score = get_score(sub_pair)
    print(score)

# ids that have no row in sub_pair get themselves as their only match.
# sub_pp is built in both modes because the later cells read it; previously it
# only existed when DEBUG was off.
null_country = test[~test["id"].isin(sub_pair["id"])][["id"]].reset_index(drop=True)
null_country["matches"] = null_country["id"].values
sub_pp = pd.concat([sub_pair,null_country]).reset_index(drop=True)

In [26]:
# merge train pairs
# Rows of the train file that also occur in the test file (same name, latitude
# and longitude) are renamed to the test id; every other train id gets a "_t"
# suffix. Pairs of renamed ids that share a point_of_interest are then collected
# into the `ids` / `near_ids` lists.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# (name, latitude, longitude) -> train id. On duplicate keys the last row wins.
train_dict = {}
train_rows = zip(train["id"].values, train["name"].values,
                 train["latitude"].values, train["longitude"].values)
for train_id, name, latitude, longitude in tqdm(train_rows, total=len(train)):
    train_dict[(name, latitude, longitude)] = train_id
del train_rows

# train id -> test id for every test row whose key exists in train.
rename_dict = {}
test_rows = zip(test["id"].values, test["name"].values,
                test["latitude"].values, test["longitude"].values)
for test_id, name, latitude, longitude in tqdm(test_rows, total=len(test)):
    key = (name, latitude, longitude)
    if key in train_dict:
        rename_dict[train_dict[key]] = test_id
del train_dict, test_rows

train["id"] = train["id"].map(lambda x:rename_dict[x] if x in rename_dict else x+"_t")

ids = []
near_ids = []
for poi, poi_df in tqdm(train[["id", "point_of_interest"]].groupby("point_of_interest")):
    # Keep only ids that are contained in test (no "_t" suffix). Filtering each
    # id once yields the same ordered pairs as testing every (id1, id2) pair.
    matched_ids = [i for i in poi_df["id"].values if not i.endswith("_t")]
    for id1 in matched_ids:
        ids += [id1] * len(matched_ids)
        near_ids += matched_ids

                
def matches2pairs(df_matches):
    """Expand an (id, matches) table into one (id, near_id) row per match.

    `df_matches` needs an "id" column and a "matches" column holding
    whitespace-separated ids. Returns a new DataFrame with columns "id" and
    "near_id", where each id is repeated once for every entry of its matches.
    """
    src_ids = []
    dst_ids = []
    rows = zip(df_matches["id"].values, df_matches["matches"].values)
    for src, match_str in tqdm(rows, total=len(df_matches)):
        targets = match_str.split()
        src_ids.extend([src] * len(targets))
        dst_ids.extend(targets)
    return pd.DataFrame(data={"id": src_ids, "near_id": dst_ids})

def join(df):
    """Return the items of `df`, each converted with str(), joined by one space."""
    # NOTE(review): earlier cells already call `join`, so this looks like a
    # second definition of an existing helper -- confirm and keep a single copy.
    return " ".join(map(str, df))

def pairs2matches(df_pairs):
    """Collapse an (id, near_id) pair table into one row per id.

    Returns a DataFrame sorted by "id" with columns "id" and "matches", where
    "matches" is the space-joined near_id values of that id (built by `join`).
    """
    joined = df_pairs.groupby("id")["near_id"].apply(join)
    table = pd.DataFrame(joined).reset_index()
    table = table.sort_values(by="id").reset_index(drop=True)
    return table.rename(columns={"near_id": "matches"})


# Pair table built from the ids / near_ids lists collected above; the lists are
# dropped afterwards because only the DataFrame is used from here on.
train_all_pair = pd.DataFrame({"id": ids, "near_id": near_ids})
del ids, near_ids

  0%|          | 0/1138812 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/739972 [00:00<?, ?it/s]

In [27]:
# Expand the per-id matches table back into one (id, near_id) row per match.
sub_pair = matches2pairs(sub_pp)

  0%|          | 0/5 [00:00<?, ?it/s]

In [28]:

# Keep only predicted pairs in which neither side is an id present in
# train["id"]; pairs for those ids are taken from train_all_pair instead.
sub_pair = (
    sub_pair
    .loc[lambda d: ~(d["id"].isin(train["id"].values)
                     | d["near_id"].isin(train["id"].values))]
    .reset_index(drop=True)
)

In [29]:
# Append the pairs derived from train and remove exact duplicate rows.
sub_pair = (
    pd.concat([sub_pair, train_all_pair], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [30]:

# Discard any pair in which either id carries the "_t" suffix.
sub_pair = sub_pair.loc[
    lambda d: d["id"].map(lambda x: not x.endswith("_t"))
    & d["near_id"].map(lambda x: not x.endswith("_t"))
]

In [31]:
# Collapse the final pair table into one row per id with space-separated matches.
sub = pairs2matches(sub_pair)

In [32]:
# Write the final matches table to submission.csv without the index column.
sub.to_csv("submission.csv",index=False)