In [None]:
import datetime
import glob
import heapq
import json
import os
import random
import warnings
from multiprocessing import Pool

import folium
import haversine as hs
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn import neighbors
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVR
from tqdm import tqdm

warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Wall-clock timestamp taken when this cell runs (presumably used later to report elapsed time -- verify).
start = datetime.datetime.now()

In [None]:
# Official training set.
df_train = pd.read_csv('../官方資料集/training_data.csv')

# Public + private sets stacked into a single frame with a fresh 0..n-1 index.
public_part = pd.read_csv('../官方資料集/public_dataset.csv')
private_part = pd.read_csv('../官方資料集/private_dataset.csv')
df_valid = pd.concat([public_part, private_part], ignore_index=True)

# Set the target column to a constant 0 on this frame
# (presumably a placeholder because these sets carry no label -- verify).
df_valid['單價'] = 0

### 官方原始欄位轉換，Rename columns and transform categorical cols with label encoder

In [None]:
# Categorical source column -> name of its integer-encoded feature column.
column_transforms_with_le = {
    '建物型態': 'building_type',
    '主要建材': 'main_material',
    '主要用途': 'main_usage',
    '縣市': 'city_1',
    '鄉鎮市區': 'city_2',
    '路名': 'city_3',
    '使用分區': 'use_type'
}

for col, new_col in column_transforms_with_le.items():
    # Fit on the values of both frames so every category seen in either one gets a code.
    all_values = df_train[col].values.tolist() + df_valid[col].values.tolist()
    le = LabelEncoder().fit(all_values)
    for frame in (df_train, df_valid):
        frame[new_col] = le.transform(frame[col].values.tolist())

In [None]:
# Numeric source column -> English feature name (values are copied unchanged).
column_transforms = {
    '移轉層次': 'floor',
    '總樓層數': 'total_floor',
    '土地面積': 'ground_area',
    '屋齡': 'age',
    '建物面積': 'building_area',
    '車位面積': 'car_area',
    '車位個數': 'car_cnt',
    '主建物面積': 'room_area',
    '陽台面積': 'room_area_balcony',
    '附屬建物面積': 'room_area_sub',
    '單價': 'price'
}
for frame in (df_train, df_valid):
    for col, new_col in column_transforms.items():
        frame[new_col] = frame[col].values

### 座標系統轉換，TWD97 coordinate to WGS84

In [None]:
# Source CRS: TWD97 (EPSG:3826); target CRS: WGS84 longitude/latitude (EPSG:4326).
twd97 = pyproj.CRS.from_epsg(3826)  # TWD97
wgs84 = pyproj.CRS.from_epsg(4326)  # WGS84
# Build the transformer once and reuse it for every row: the deprecated
# pyproj.transform() sets the transformation up again on each call.
# always_xy=True pins the (x, y) -> (lon, lat) axis order that the legacy
# Proj(init='epsg:...') objects used.
twd97_to_wgs84 = pyproj.Transformer.from_crs(twd97, wgs84, always_xy=True)

def get_coordinate(row):
    """Convert one record's TWD97 coordinates to WGS84 longitude/latitude.

    Args:
        row: dict-like record with the keys 'ID', '橫坐標' (x) and '縱坐標' (y).

    Returns:
        dict with 'ID', 'twd97_lon' and 'twd97_lat'. Despite the 'twd97_' prefix the
        two values are the converted WGS84 longitude/latitude; the key names are
        kept because the distance helpers in this notebook read them.
    """
    lon, lat = twd97_to_wgs84.transform(row['橫坐標'], row['縱坐標'])
    return {
        'ID': row['ID'],
        'twd97_lon': lon,
        'twd97_lat': lat
    }

with Pool(22) as pool:
    features = list(tqdm(pool.imap(get_coordinate, df_train.to_dict('records')), total=len(df_train)))
# -999999 is the sentinel written wherever the conversion produced NaN.
df_train_features = pd.DataFrame(features).fillna(-999999)
with Pool(22) as pool:
    features = list(tqdm(pool.imap(get_coordinate, df_valid.to_dict('records')), total=len(df_valid)))
df_valid_features = pd.DataFrame(features).fillna(-999999)

# Attach the converted coordinates to each frame by ID.
df_train = df_train.merge(df_train_features, how='left', on='ID')
df_valid = df_valid.merge(df_valid_features, how='left', on='ID')

In [None]:
# Snapshot of the training rows as a list of dicts; the neighbour-search helpers in
# this notebook iterate over this global. Columns added to df_train after this
# point are not part of the snapshot.
dict_datas = df_train.to_dict('records')

In [None]:
def get_distance_without_cache(lat1, lon1, lat2, lon2):
    """Return the haversine distance, in metres, between (lat1, lon1) and (lat2, lon2)."""
    return hs.haversine((lat1, lon1), (lat2, lon2), unit=hs.Unit.METERS)
    
def get_nearest_infos(id_ = 'id', n = 10, lat1 = 0, lon1 = 0):
    """Return the ``n`` training records closest to (lat1, lon1), nearest first.

    Args:
        id_: ID of the query record; the training record with the same ID is
            skipped so a row is never its own neighbour.
        n: number of neighbours to return.
        lat1, lon1: coordinates of the query point.

    Returns:
        A list of at most ``n`` dicts with the keys 'distance' (metres), 'price',
        'floor', 'age' and 'total_floor', ordered by ascending distance.
    """
    candidates = (
        {
            'distance': get_distance_without_cache(lat1, lon1, data['twd97_lat'], data['twd97_lon']),
            'price': data['price'],
            'floor': data['floor'],
            'age': data['age'],
            'total_floor': data['total_floor']
        }
        for data in dict_datas
        if data['ID'] != id_
    )
    # nsmallest is documented as equivalent to sorted(candidates, key=...)[:n],
    # but it does not sort (or materialise) the whole candidate list.
    return heapq.nsmallest(n, candidates, key=lambda info: info['distance'])

def get_feature(row):
    """Summarise the ``nearest_n`` closest training records around ``row``.

    Reads the module-level ``nearest_n``. Returns a dict holding the row's 'ID'
    plus price / age / floor statistics of its neighbours, keyed as
    ``nearest_{nearest_n}_<field>_<stat>``.
    """
    neighbours = pd.DataFrame(
        get_nearest_infos(row['ID'], nearest_n, row['twd97_lat'], row['twd97_lon'])
    )
    prefix = f'nearest_{nearest_n}'
    price = neighbours['price']
    age = neighbours['age']

    feature = {'ID': row['ID']}
    for stat in ('mean', 'median', 'std', 'max', 'min'):
        feature[f'{prefix}_price_{stat}'] = getattr(price, stat)()
    for stat in ('median', 'std', 'max', 'min'):
        feature[f'{prefix}_age_{stat}'] = getattr(age, stat)()
    feature[f'{prefix}_floor_median'] = neighbours['floor'].median()
    feature[f'{prefix}_total_floor_median'] = neighbours['total_floor'].median()
    return feature

### 以距離產生每筆資料物理距離最近之4, 10, 30, 100筆資料之特徵

In [None]:
# Build the nearest-neighbour features for every neighbourhood size in one loop.
# nearest_n is deliberately a module-level global: get_feature reads it inside the
# worker processes, so it has to be assigned before each Pool is created
# (this relies on workers inheriting globals, i.e. the fork start method -- TODO
# confirm on platforms that default to spawn).
for nearest_n in (100, 30, 10, 4):
    with Pool(22) as pool:
        features = list(tqdm(pool.imap(get_feature, df_train.to_dict('records')), total=len(df_train)))
    df_train_features = pd.DataFrame(features)
    with Pool(22) as pool:
        features = list(tqdm(pool.imap(get_feature, df_valid.to_dict('records')), total=len(df_valid)))
    df_valid_features = pd.DataFrame(features)

    # Attach this size's feature columns to both frames by ID.
    df_train = df_train.merge(df_train_features, how='left', on='ID')
    df_valid = df_valid.merge(df_valid_features, how='left', on='ID')

In [None]:
def get_ref_infos(id_ = 'id', distance_threshold = 1000, lat1 = 0, lon1 = 0):
    """Collect the training records within ``distance_threshold`` metres of (lat1, lon1).

    The record whose ID equals ``id_`` is skipped. Each returned dict has the keys
    'distance', 'price', 'floor', 'age' and 'total_floor'; the list keeps the
    order of ``dict_datas`` (it is not sorted by distance).
    """
    nearby = []
    others = (record for record in dict_datas if record['ID'] != id_)
    for record in others:
        dist = get_distance_without_cache(lat1, lon1, record['twd97_lat'], record['twd97_lon'])
        if dist <= distance_threshold:
            nearby.append(dict(
                distance=dist,
                price=record['price'],
                floor=record['floor'],
                age=record['age'],
                total_floor=record['total_floor'],
            ))
    return nearby

def get_ref_feature(row):
    """Summarise the training records within ``distance_threshold`` metres of ``row``.

    Reads the module-level ``distance_threshold``.

    Returns:
        A dict that always contains the row's 'ID'. When at least one training
        record lies inside the radius it also holds price / age / floor statistics
        keyed as ``ref_{distance_threshold}_<field>_<stat>``.
    """
    infos = get_ref_infos(row['ID'], distance_threshold, row['twd97_lat'], row['twd97_lon'])
    if not infos:
        # Keep the ID even without neighbours: an empty dict would put a NaN-ID row
        # into the feature frame, and if no row at all had neighbours the frame
        # would have no 'ID' column and the merge on 'ID' would fail.
        return {'ID': row['ID']}
    tmp_df = pd.DataFrame(infos)
    return {
        'ID': row['ID'],
        f'ref_{distance_threshold}_price_mean': tmp_df['price'].mean(),
        f'ref_{distance_threshold}_price_median': tmp_df['price'].median(),
        f'ref_{distance_threshold}_price_std': tmp_df['price'].std(),
        f'ref_{distance_threshold}_price_max': tmp_df['price'].max(),
        f'ref_{distance_threshold}_price_min': tmp_df['price'].min(),

        f'ref_{distance_threshold}_age_median': tmp_df['age'].median(),
        f'ref_{distance_threshold}_age_std': tmp_df['age'].std(),
        f'ref_{distance_threshold}_age_max': tmp_df['age'].max(),
        f'ref_{distance_threshold}_age_min': tmp_df['age'].min(),

        f'ref_{distance_threshold}_floor_median': tmp_df['floor'].median(),
        f'ref_{distance_threshold}_total_floor_median': tmp_df['total_floor'].median()
    }

### 以距離產生每筆資料物理距離500, 1200, 3600公尺內資料之特徵

In [None]:
# Build the within-radius features for every radius (metres) in one loop.
# distance_threshold is deliberately a module-level global: get_ref_feature reads
# it inside the worker processes, so it has to be assigned before each Pool is
# created (this relies on workers inheriting globals, i.e. the fork start method
# -- TODO confirm on platforms that default to spawn).
for distance_threshold in (3600, 1200, 500):
    with Pool(22) as pool:
        features = list(tqdm(pool.imap(get_ref_feature, df_train.to_dict('records')), total=len(df_train)))
    df_train_features = pd.DataFrame(features)
    with Pool(22) as pool:
        features = list(tqdm(pool.imap(get_ref_feature, df_valid.to_dict('records')), total=len(df_valid)))
    df_valid_features = pd.DataFrame(features)

    # Attach this radius' feature columns to both frames by ID.
    df_train = df_train.merge(df_train_features, how='left', on='ID')
    df_valid = df_valid.merge(df_valid_features, how='left', on='ID')

### 整合外部資料集，包含官方給予的外部資料集，以及我蒐集的外部資料集(警察局、消防局、加油站位置資訊)
### 以距離產生每筆資料物理距離200, 500, 1200, 3600公尺內外部資料集之counting特徵

In [None]:
# File stem of each external CSV -> feature-name prefix. Stems that are not
# listed here are used as the prefix unchanged.
extra_info_name_maps = {
    '便利商店': 'extra_info_c_store',
    'ATM資料': 'extra_info_atm',
    '公車站點資料': 'extra_info_bus_stop',
    '國小基本資料': 'extra_info_lv1_school',
    '國中基本資料': 'extra_info_lv2_school',
    '高中基本資料': 'extra_info_lv3_school',
    '大學基本資料': 'extra_info_lv4_school',
    '捷運站點資料': 'extra_info_mrt_station',
    '火車站點資料': 'extra_info_train_station',
    '腳踏車站點資料': 'extra_info_bike_station',
    '郵局據點資料': 'extra_info_post_office',
    '醫療機構基本資料': 'extra_info_hospital',
    '金融機構基本資料': 'extra_info_bank',
}
# Feature-name prefix -> list of {'lon': ..., 'lat': ...} records.
extra_infos = {}
# sorted(): glob returns files in a filesystem-dependent order; a fixed order keeps
# the generated feature columns in the same order on every machine.
for extra_csv_path in sorted(glob.glob('../官方資料集/external_data/*.csv')):
    print(extra_csv_path)
    df_extra = pd.read_csv(extra_csv_path)
    # basename() copes with the platform's path separator (glob yields
    # backslashes on Windows, which split('/') would not strip).
    info_name = os.path.basename(extra_csv_path).split('.')[0]
    info_name = extra_info_name_maps.get(info_name, info_name)
    # Some files name the longitude column 'lng' instead of 'lon'.
    if 'lon' not in df_extra.columns:
        df_extra['lon'] = df_extra['lng'].values
    extra_use_cols = ['lon', 'lat']
    extra_infos[info_name] = df_extra[extra_use_cols].to_dict('records')
print(list(extra_infos.keys()))

In [None]:
def get_extra_infos(distance_threshold, lat1, lon1):
    """Count, per external dataset, the points within ``distance_threshold`` metres of (lat1, lon1).

    Returns a dict keyed ``{dataset_name}_{distance_threshold}_cnt``, one entry
    for every dataset in the module-level ``extra_infos``.
    """
    counts = {}
    for name, points in extra_infos.items():
        within = [
            point for point in points
            if get_distance_without_cache(lat1, lon1, point['lat'], point['lon']) <= distance_threshold
        ]
        counts[f'{name}_{distance_threshold}_cnt'] = len(within)
    return counts

def get_extra_feature(row, thresholds=(200, 500, 1200, 3600)):
    """Count the external points of interest around ``row`` for several radii.

    Args:
        row: dict-like record with the keys 'ID', 'twd97_lat' and 'twd97_lon'.
        thresholds: radii in metres; one count per dataset is produced for each.

    Returns:
        dict keyed ``{dataset_name}_{threshold}_cnt`` (all datasets for the first
        radius, then all datasets for the next, ...), followed by the row's 'ID'.
    """
    lat1, lon1 = row['twd97_lat'], row['twd97_lon']
    # A distance does not depend on the radius, so measure every point once per
    # row and reuse the values for all radii.
    distances_by_name = {
        name: [get_distance_without_cache(lat1, lon1, point['lat'], point['lon']) for point in points]
        for name, points in extra_infos.items()
    }
    features = {}
    for threshold in thresholds:
        for name, distances in distances_by_name.items():
            features[f'{name}_{threshold}_cnt'] = sum(1 for dist in distances if dist <= threshold)
    features['ID'] = row['ID']
    return features

In [None]:
# Training rows: count nearby points of interest in parallel.
with Pool(22) as pool:
    train_results = pool.imap(get_extra_feature, df_train.to_dict('records'))
    features = list(tqdm(train_results, total=len(df_train)))
df_train_extra_features = pd.DataFrame(features)

# Validation rows: same procedure.
with Pool(22) as pool:
    valid_results = pool.imap(get_extra_feature, df_valid.to_dict('records'))
    features = list(tqdm(valid_results, total=len(df_valid)))
df_valid_extra_features = pd.DataFrame(features)

# Attach the counts to each frame by ID.
df_train = pd.merge(df_train, df_train_extra_features, how='left', on='ID')
df_valid = pd.merge(df_valid, df_valid_extra_features, how='left', on='ID')

### 製作面積相關額外特徵

In [None]:
# Every column whose name contains 'area' at this point. The list is frozen
# before the loop, so the columns created below are not fed back in.
area_cols = [c for c in df_train.columns if 'area' in c]
# All ordered pairs of distinct area columns.
area_pairs = [(first, second) for first in area_cols for second in area_cols if first != second]
for area_col1, area_col2 in area_pairs:
    for frame in (df_train, df_valid):
        frame[f'{area_col1}_add_{area_col2}'] = frame[area_col1] + frame[area_col2]
        frame[f'{area_col1}_minus_{area_col2}'] = frame[area_col1] - frame[area_col2]

### 製作縣市、鄉鎮市區統計特徵

In [None]:
# Price statistics per (city_1, city_2) group, computed on the training frame only.
city12_stats = ['mean', 'std', 'max', 'min']
city12_df = (
    df_train
    .groupby(['city_1', 'city_2'])['price']
    .agg(city12_stats)
    .reset_index()
    .rename(columns={stat: f'city12_price_{stat}' for stat in city12_stats})
)
# Note: fillna runs on the whole merged frame, so every NaN present in any column
# at this point becomes -99999, not only the statistics added here.
df_train = pd.merge(df_train, city12_df, how='left', on=['city_1', 'city_2']).fillna(-99999)
df_valid = pd.merge(df_valid, city12_df, how='left', on=['city_1', 'city_2']).fillna(-99999)

In [None]:
# Number of storeys between the unit's floor and the top of the building.
for frame in (df_train, df_valid):
    frame['floor_from_top'] = frame['total_floor'].sub(frame['floor'])

In [None]:
# Combined city_1 + city_2 code as a string, then label-encoded in place.
for frame in (df_train, df_valid):
    frame['city12'] = frame['city_1'].astype(str) + '_' + frame['city_2'].astype(str)
col = 'city12'
new_col = 'city12'
# One encoder fitted on both frames so equal strings get the same integer.
le = LabelEncoder().fit(df_train[col].values.tolist() + df_valid[col].values.tolist())
for frame in (df_train, df_valid):
    frame[new_col] = le.transform(frame[col].values.tolist())

### 製作實價登錄相關特徵，會用不同的方式mapping原始資料以及實價登錄資料，詳見下方的key欄位生成方式與對應程式

### 製作實價登錄不動產實際成交之特徵

In [None]:
# External government real-price registration (實價登錄) data, read from the external-datasets folder.
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data.csv')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態 對應

In [None]:
# Match key: city + district + road + main usage + building type.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_'
        + frame['主要用途'] + '_'
        + frame['建物型態']
    )

col = 'key'
new_col = 'key'
# One encoder fitted on all three frames so equal key strings map to the same integer everywhere.
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist() + df_external_gov_data[col].values.tolist())
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> sub-frame of the external rows sharing that key
externalkey2subdf = {group_key: group_df for group_key, group_df in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise the external transactions that share ``row``'s match key.

    Looks ``row['key']`` up in the module-level ``externalkey2subdf``.

    Returns:
        A dict that always contains the row's 'ID'. When matching external rows
        exist it also holds ``externalkey_*`` statistics: unit-price stats and
        count, means of the pre-computed same-road / same-district moving-average
        columns, mean areas, and the mean age minus the row's own age.
    """
    features = {'ID': row['ID']}
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        # Keep the ID even without a match: an empty dict would put a NaN-ID row
        # into the feature frame, and if no row matched at all the frame would
        # have no 'ID' column and the merge on 'ID' would fail.
        return features

    price = subdf['單價']
    features.update({
        'externalkey_price_mean': price.mean(),
        'externalkey_price_std': price.std(),
        'externalkey_price_max': price.max(),
        'externalkey_price_min': price.min(),
        'externalkey_price_max_min_ratio': (price.max() - price.min()) / price.mean(),
        'externalkey_price_cnt': len(subdf),
    })

    # Means of the moving-average columns: same road first, then same district;
    # within each, the MA columns (50/100/200) followed by their _STD columns.
    for scope_col, scope in (('同路', 'sameroad'), ('同鄉鎮市區', 'samecity12')):
        for col_suffix, name_suffix in (('', ''), ('_STD', '_std')):
            for window in (50, 100, 200):
                feature_name = f'externalkey_price_{scope}_{window}ma{name_suffix}_mean'
                features[feature_name] = subdf[f'單價_{scope_col}_{window}MA{col_suffix}'].mean()

    # land1..land6 follow the order of this tuple.
    area_cols = ('土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積')
    for position, area_col in enumerate(area_cols, start=1):
        features[f'externalkey_land{position}_mean'] = subdf[area_col].mean()

    features['externalkey_age_diff_mean'] = subdf['屋齡'].mean() - row['屋齡']
    return features

# Training rows: compute the external-key features in parallel, then attach them by ID.
with Pool(22) as pool:
    train_results = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(train_results, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Validation rows: same procedure.
with Pool(22) as pool:
    valid_results = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(valid_results, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+總樓層數 對應

In [None]:
# Match key: city + district + road + total floors + main usage + building type.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_'
        + frame['主要用途'] + '_'
        + frame['建物型態']
    )

col = 'key'
new_col = 'key'
# One encoder fitted on all three frames so equal key strings map to the same integer everywhere.
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist() + df_external_gov_data[col].values.tolist())
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> sub-frame of the external rows sharing that key
externalkey2subdf = {group_key: group_df for group_key, group_df in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise the external transactions that share ``row``'s match key.

    Looks ``row['key']`` up in the module-level ``externalkey2subdf``.

    Returns:
        A dict that always contains the row's 'ID'. When matching external rows
        exist it also holds ``externalkey_samebuilding_*`` statistics: unit-price
        stats and count, means of the pre-computed same-road / same-district
        moving-average columns, mean areas, and the mean age minus the row's own age.
    """
    features = {'ID': row['ID']}
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        # Keep the ID even without a match: an empty dict would put a NaN-ID row
        # into the feature frame, and if no row matched at all the frame would
        # have no 'ID' column and the merge on 'ID' would fail.
        return features

    prefix = 'externalkey_samebuilding'
    price = subdf['單價']
    features.update({
        f'{prefix}_price_mean': price.mean(),
        f'{prefix}_price_std': price.std(),
        f'{prefix}_price_max': price.max(),
        f'{prefix}_price_min': price.min(),
        f'{prefix}_price_max_min_ratio': (price.max() - price.min()) / price.mean(),
        f'{prefix}_price_cnt': len(subdf),
    })

    # Means of the moving-average columns: same road first, then same district;
    # within each, the MA columns (50/100/200) followed by their _STD columns.
    for scope_col, scope in (('同路', 'sameroad'), ('同鄉鎮市區', 'samecity12')):
        for col_suffix, name_suffix in (('', ''), ('_STD', '_std')):
            for window in (50, 100, 200):
                feature_name = f'{prefix}_price_{scope}_{window}ma{name_suffix}_mean'
                features[feature_name] = subdf[f'單價_{scope_col}_{window}MA{col_suffix}'].mean()

    # land1..land6 follow the order of this tuple.
    area_cols = ('土地面積', '建物面積', '車位面積', '主建物面積', '陽台面積', '附屬建物面積')
    for position, area_col in enumerate(area_cols, start=1):
        features[f'{prefix}_land{position}_mean'] = subdf[area_col].mean()

    features[f'{prefix}_age_diff_mean'] = subdf['屋齡'].mean() - row['屋齡']
    return features

# Training rows: compute the external-key features in parallel, then attach them by ID.
with Pool(22) as pool:
    train_results = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(train_results, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Validation rows: same procedure.
with Pool(22) as pool:
    valid_results = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(valid_results, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+總樓層數+移轉層次 對應

In [None]:
# Match key: city + district + road + total floors + main usage + transfer floor + building type.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_'
        + frame['主要用途'] + '_'
        + frame['移轉層次'].astype(str) + '_'
        + frame['建物型態']
    )

col = 'key'
new_col = 'key'
# One encoder fitted on all three frames so equal key strings map to the same integer everywhere.
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist() + df_external_gov_data[col].values.tolist())
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> sub-frame of the external rows sharing that key
externalkey2subdf = {group_key: group_df for group_key, group_df in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise the external transactions that share ``row``'s match key.

    Looks ``row['key']`` up in the module-level ``externalkey2subdf``.

    Returns:
        A dict that always contains the row's 'ID'. When matching external rows
        exist it also holds ``externalkey_samefloor_*`` statistics: unit-price
        stats and count, the mean age minus the row's own age, and means of the
        pre-computed same-road / same-district moving-average columns.
    """
    features = {'ID': row['ID']}
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        # Keep the ID even without a match: an empty dict would put a NaN-ID row
        # into the feature frame, and if no row matched at all the frame would
        # have no 'ID' column and the merge on 'ID' would fail.
        return features

    prefix = 'externalkey_samefloor'
    price = subdf['單價']
    features.update({
        f'{prefix}_price_mean': price.mean(),
        f'{prefix}_price_std': price.std(),
        f'{prefix}_price_max': price.max(),
        f'{prefix}_price_min': price.min(),
        f'{prefix}_price_max_min_ratio': (price.max() - price.min()) / price.mean(),
        f'{prefix}_price_cnt': len(subdf),
        f'{prefix}_age_diff_mean': subdf['屋齡'].mean() - row['屋齡'],
    })

    # Means of the moving-average columns: same road first, then same district;
    # within each, the MA columns (50/100/200) followed by their _STD columns.
    for scope_col, scope in (('同路', 'sameroad'), ('同鄉鎮市區', 'samecity12')):
        for col_suffix, name_suffix in (('', ''), ('_STD', '_std')):
            for window in (50, 100, 200):
                feature_name = f'{prefix}_price_{scope}_{window}ma{name_suffix}_mean'
                features[feature_name] = subdf[f'單價_{scope_col}_{window}MA{col_suffix}'].mean()

    return features

# Training rows: compute the external-key features in parallel, then attach them by ID.
with Pool(22) as pool:
    train_results = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(train_results, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Validation rows: same procedure.
with Pool(22) as pool:
    valid_results = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(valid_results, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+總樓層數+移轉層次+車位個數 對應

In [None]:
# Match key: city + district + road + total floors + main usage + transfer floor
# + building type + number of parking spaces (cast to int before stringifying).
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_'
        + frame['主要用途'] + '_'
        + frame['移轉層次'].astype(str) + '_'
        + frame['建物型態'] + '_'
        + frame['車位個數'].astype(int).astype(str)
    )

col = 'key'
new_col = 'key'
# One encoder fitted on all three frames so equal key strings map to the same integer everywhere.
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist() + df_external_gov_data[col].values.tolist())
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> sub-frame of the external rows sharing that key
externalkey2subdf = {group_key: group_df for group_key, group_df in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise the external transactions that share ``row``'s match key.

    Looks ``row['key']`` up in the module-level ``externalkey2subdf``.

    Returns:
        A dict that always contains the row's 'ID'. When matching external rows
        exist it also holds the mean unit price, the number of non-null unit
        prices, and the mean age minus the row's own age, all prefixed
        ``externalkey_samefloor_samecar_``.
    """
    features = {'ID': row['ID']}
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        # Keep the ID even without a match: an empty dict would put a NaN-ID row
        # into the feature frame, and if no row matched at all the frame would
        # have no 'ID' column and the merge on 'ID' would fail.
        return features

    features['externalkey_samefloor_samecar_price_mean'] = subdf['單價'].mean()
    # count() ignores NaN prices, unlike len(subdf).
    features['externalkey_samefloor_samecar_price_cnt'] = subdf['單價'].count()
    features['externalkey_samefloor_samecar_age_diff_mean'] = subdf['屋齡'].mean() - row['屋齡']
    return features

# Training rows: compute the external-key features in parallel, then attach them by ID.
with Pool(22) as pool:
    train_results = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(train_results, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Validation rows: same procedure.
with Pool(22) as pool:
    valid_results = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(valid_results, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+屋齡(差距絕對值小於等於1年) 對應

In [None]:
# Match key: city + district + main usage + road + building type.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['主要用途'] + '_'
        + frame['路名'] + '_'
        + frame['建物型態']
    )

col = 'key'
new_col = 'key'
# One encoder fitted on all three frames so equal key strings map to the same integer everywhere.
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist() + df_external_gov_data[col].values.tolist())
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> sub-frame of the external rows sharing that key
externalkey2subdf = {group_key: group_df for group_key, group_df in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise external rows that share the row's key and have a similar age.

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``; only those whose '屋齡' differs from
    ``row['屋齡']`` by at most 1 (inclusive) are aggregated.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID' plus the
        ``externalkey_sameage_*`` statistics of the filtered rows. The
        statistics are NaN when no external row passes the age filter.
    """
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        return {}

    # Filter with a boolean mask instead of writing a scratch 'age_diff' column:
    # the cached group frame is reused for every row with this key, so it must
    # not be mutated (the assignment is also a typical source of
    # SettingWithCopyWarning).
    subdf2 = subdf[np.abs(subdf['屋齡'] - row['屋齡']) <= 1]

    # Compute each price statistic once; the ratio below reuses them.
    price = subdf2['單價']
    price_mean = price.mean()
    price_max = price.max()
    price_min = price.min()

    return {
        'ID': row['ID'],
        'externalkey_sameage_price_mean': price_mean,
        'externalkey_sameage_price_std': price.std(),
        'externalkey_sameage_price_max': price_max,
        'externalkey_sameage_price_min': price_min,
        # Price range relative to the mean price.
        'externalkey_sameage_price_max_min_ratio': (price_max - price_min) / price_mean,

        'externalkey_sameage_price_sameroad_50ma_mean': subdf2['單價_同路_50MA'].mean(),
        'externalkey_sameage_price_sameroad_100ma_mean': subdf2['單價_同路_100MA'].mean(),
        'externalkey_sameage_price_sameroad_200ma_mean': subdf2['單價_同路_200MA'].mean(),
        'externalkey_sameage_price_sameroad_50ma_std_mean': subdf2['單價_同路_50MA_STD'].mean(),
        'externalkey_sameage_price_sameroad_100ma_std_mean': subdf2['單價_同路_100MA_STD'].mean(),
        'externalkey_sameage_price_sameroad_200ma_std_mean': subdf2['單價_同路_200MA_STD'].mean(),

        'externalkey_sameage_price_samecity12_50ma_mean': subdf2['單價_同鄉鎮市區_50MA'].mean(),
        'externalkey_sameage_price_samecity12_100ma_mean': subdf2['單價_同鄉鎮市區_100MA'].mean(),
        'externalkey_sameage_price_samecity12_200ma_mean': subdf2['單價_同鄉鎮市區_200MA'].mean(),
        'externalkey_sameage_price_samecity12_50ma_std_mean': subdf2['單價_同鄉鎮市區_50MA_STD'].mean(),
        'externalkey_sameage_price_samecity12_100ma_std_mean': subdf2['單價_同鄉鎮市區_100MA_STD'].mean(),
        'externalkey_sameage_price_samecity12_200ma_std_mean': subdf2['單價_同鄉鎮市區_200MA_STD'].mean(),

        'externalkey_sameage_land1_mean': subdf2['土地面積'].mean(),
        'externalkey_sameage_land2_mean': subdf2['建物面積'].mean(),
        'externalkey_sameage_land3_mean': subdf2['車位面積'].mean(),
        'externalkey_sameage_land4_mean': subdf2['主建物面積'].mean(),
        'externalkey_sameage_land5_mean': subdf2['陽台面積'].mean(),
        'externalkey_sameage_land6_mean': subdf2['附屬建物面積'].mean(),
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+屋齡(差距絕對值小於0.5年) 對應

In [None]:
# Composite lookup key (city, district, main usage, road name, building type),
# built the same way on the external data and on both modelling frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['主要用途'] + '_'
        + frame['路名'] + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise external rows that share the row's key, within a 0.5 age window.

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``; only those whose '屋齡' differs from
    ``row['屋齡']`` by at most 0.5 (inclusive) are aggregated.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID' plus the
        ``externalkey_sameage_05_*`` statistics of the filtered rows. The
        statistics are NaN when no external row passes the age filter.
    """
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        return {}

    # Filter with a boolean mask instead of writing a scratch 'age_diff' column:
    # the cached group frame is reused for every row with this key, so it must
    # not be mutated (the assignment is also a typical source of
    # SettingWithCopyWarning).
    subdf2 = subdf[np.abs(subdf['屋齡'] - row['屋齡']) <= 0.5]

    # Compute each price statistic once; the ratio below reuses them.
    price = subdf2['單價']
    price_mean = price.mean()
    price_max = price.max()
    price_min = price.min()

    return {
        'ID': row['ID'],
        'externalkey_sameage_05_price_mean': price_mean,
        'externalkey_sameage_05_price_std': price.std(),
        'externalkey_sameage_05_price_max': price_max,
        'externalkey_sameage_05_price_min': price_min,
        # Price range relative to the mean price.
        'externalkey_sameage_05_price_max_min_ratio': (price_max - price_min) / price_mean,

        'externalkey_sameage_05_price_sameroad_50ma_mean': subdf2['單價_同路_50MA'].mean(),
        'externalkey_sameage_05_price_sameroad_100ma_mean': subdf2['單價_同路_100MA'].mean(),
        'externalkey_sameage_05_price_sameroad_200ma_mean': subdf2['單價_同路_200MA'].mean(),
        'externalkey_sameage_05_price_sameroad_50ma_std_mean': subdf2['單價_同路_50MA_STD'].mean(),
        'externalkey_sameage_05_price_sameroad_100ma_std_mean': subdf2['單價_同路_100MA_STD'].mean(),
        'externalkey_sameage_05_price_sameroad_200ma_std_mean': subdf2['單價_同路_200MA_STD'].mean(),

        'externalkey_sameage_05_price_samecity12_50ma_mean': subdf2['單價_同鄉鎮市區_50MA'].mean(),
        'externalkey_sameage_05_price_samecity12_100ma_mean': subdf2['單價_同鄉鎮市區_100MA'].mean(),
        'externalkey_sameage_05_price_samecity12_200ma_mean': subdf2['單價_同鄉鎮市區_200MA'].mean(),
        'externalkey_sameage_05_price_samecity12_50ma_std_mean': subdf2['單價_同鄉鎮市區_50MA_STD'].mean(),
        'externalkey_sameage_05_price_samecity12_100ma_std_mean': subdf2['單價_同鄉鎮市區_100MA_STD'].mean(),
        'externalkey_sameage_05_price_samecity12_200ma_std_mean': subdf2['單價_同鄉鎮市區_200MA_STD'].mean(),
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+屋齡(差距絕對值小於0.25年) 對應

In [None]:
# Composite lookup key (city, district, main usage, road name, building type),
# built the same way on the external data and on both modelling frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['主要用途'] + '_'
        + frame['路名'] + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise external rows that share the row's key, within a 0.25 age window.

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``; only those whose '屋齡' differs from
    ``row['屋齡']`` by at most 0.25 (inclusive) are aggregated.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID' plus the
        ``externalkey_sameage_0.25_*`` statistics of the filtered rows. The
        statistics are NaN when no external row passes the age filter.
    """
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        return {}

    # Filter with a boolean mask instead of writing a scratch 'age_diff' column:
    # the cached group frame is reused for every row with this key, so it must
    # not be mutated (the assignment is also a typical source of
    # SettingWithCopyWarning).
    subdf2 = subdf[np.abs(subdf['屋齡'] - row['屋齡']) <= 0.25]

    # Compute each price statistic once; the ratio below reuses them.
    price = subdf2['單價']
    price_mean = price.mean()
    price_max = price.max()
    price_min = price.min()

    return {
        'ID': row['ID'],
        'externalkey_sameage_0.25_price_mean': price_mean,
        'externalkey_sameage_0.25_price_std': price.std(),
        'externalkey_sameage_0.25_price_max': price_max,
        'externalkey_sameage_0.25_price_min': price_min,
        # Price range relative to the mean price.
        'externalkey_sameage_0.25_price_max_min_ratio': (price_max - price_min) / price_mean,

        'externalkey_sameage_0.25_price_sameroad_50ma_mean': subdf2['單價_同路_50MA'].mean(),
        'externalkey_sameage_0.25_price_sameroad_100ma_mean': subdf2['單價_同路_100MA'].mean(),
        'externalkey_sameage_0.25_price_sameroad_200ma_mean': subdf2['單價_同路_200MA'].mean(),
        'externalkey_sameage_0.25_price_sameroad_50ma_std_mean': subdf2['單價_同路_50MA_STD'].mean(),
        'externalkey_sameage_0.25_price_sameroad_100ma_std_mean': subdf2['單價_同路_100MA_STD'].mean(),
        'externalkey_sameage_0.25_price_sameroad_200ma_std_mean': subdf2['單價_同路_200MA_STD'].mean(),

        'externalkey_sameage_0.25_price_samecity12_50ma_mean': subdf2['單價_同鄉鎮市區_50MA'].mean(),
        'externalkey_sameage_0.25_price_samecity12_100ma_mean': subdf2['單價_同鄉鎮市區_100MA'].mean(),
        'externalkey_sameage_0.25_price_samecity12_200ma_mean': subdf2['單價_同鄉鎮市區_200MA'].mean(),
        'externalkey_sameage_0.25_price_samecity12_50ma_std_mean': subdf2['單價_同鄉鎮市區_50MA_STD'].mean(),
        'externalkey_sameage_0.25_price_samecity12_100ma_std_mean': subdf2['單價_同鄉鎮市區_100MA_STD'].mean(),
        'externalkey_sameage_0.25_price_samecity12_200ma_std_mean': subdf2['單價_同鄉鎮市區_200MA_STD'].mean(),
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

#### 以 縣市+鄉鎮市區+路名+主要用途+建物型態+屋齡(差距小於0.1年)+車位個數+總樓層數+移轉層次+**附屬建物面積** 對應

In [None]:
# Composite lookup key: city, district, main usage, road name, building type,
# parking-space count (as int), total floors and transfer floor.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['主要用途'] + '_'
        + frame['路名'] + '_' + frame['建物型態'] + '_'
        + frame['車位個數'].astype(int).astype(str) + '_' + frame['總樓層數'].astype(str) + '_'
        + frame['移轉層次'].astype(str)
    )

# Most frequent auxiliary-building area in each data source (value_counts is
# sorted by descending frequency, so the first index entry is the mode).
# NOTE(review): presumably this is the encoded value of "no auxiliary building" - confirm.
sub_area_0_external = df_external_gov_data['附屬建物面積'].value_counts().index[0]
sub_area_0_train = df_train['附屬建物面積'].value_counts().index[0]

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Price statistics of external rows that match the row almost exactly.

    Starting from the external rows cached in the module-level
    ``externalkey2subdf`` under ``row['key']``, two filters are applied:

    1. ``row['屋齡'] - external['屋齡']`` must lie in ``[0, 0.1]``.
    2. If ``row['附屬建物面積']`` equals ``sub_area_0_train`` (within 1e-4),
       only external rows with '附屬建物面積' <= ``sub_area_0_external`` are kept;
       otherwise only rows with '附屬建物面積' > ``sub_area_0_external``.

    Args:
        row: Mapping providing at least 'ID', 'key', '屋齡' and '附屬建物面積'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID' plus the
        ``externalkey_exactly_same_price_*`` statistics, which are NaN when no
        external row survives both filters.
    """
    subdf = externalkey2subdf.get(row['key'])
    if subdf is None or len(subdf) == 0:
        return {}

    # Use boolean masks instead of writing a scratch 'age_diff' column into the
    # cached group frame (it is shared by every row with this key), and compare
    # against the threshold directly rather than formatting it into a query string.
    age_diff = row['屋齡'] - subdf['屋齡']
    subdf2 = subdf[(age_diff <= 0.1) & (age_diff >= 0)]
    if np.abs(row['附屬建物面積'] - sub_area_0_train) <= 0.0001:
        subdf2 = subdf2[subdf2['附屬建物面積'] <= sub_area_0_external]
    else:
        subdf2 = subdf2[subdf2['附屬建物面積'] > sub_area_0_external]

    # Compute each price statistic once; the ratio below reuses them.
    price = subdf2['單價']
    price_mean = price.mean()
    price_max = price.max()
    price_min = price.min()

    return {
        'ID': row['ID'],
        'externalkey_exactly_same_price_mean': price_mean,
        'externalkey_exactly_same_price_max': price_max,
        'externalkey_exactly_same_price_min': price_min,
        # Price range relative to the mean price.
        'externalkey_exactly_same_price_max_min_ratio': (price_max - price_min) / price_mean,
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

### 基於 縣市+鄉鎮市區+路名+主要用途+建物型態 對應產生2021Q2 - 2022Q4每一季度的均價特徵

In [None]:
# External transaction data that carries a Trade_YearQ (year/quarter) column.
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data_by_year.csv')

# Composite lookup key (city, district, road name, main usage, building type),
# built the same way on the external data and on both modelling frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_' + frame['主要用途'] + '_'
        + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Mean external unit price per quarter for the rows sharing the row's key.

    Reads the external rows cached in the module-level ``externalkey2subdf``
    under ``row['key']`` and averages '單價' separately for each
    ``Trade_YearQ`` code listed below.
    NOTE(review): the codes look like year * 10 + zero-based quarter
    (20211 -> 2021 Q2, 20220 -> 2022 Q1) - confirm against the CSV producer.

    Args:
        row: Mapping providing at least 'ID' and 'key'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID' plus
        one ``externalkey_year_*_price_mean`` entry per quarter, NaN for
        quarters without any external row.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    def quarter_mean(code):
        # Average unit price of the matched rows traded in the given quarter.
        return matches.loc[matches['Trade_YearQ'] == code, '單價'].mean()

    return {
        'ID': row['ID'],
        'externalkey_year_21q2_price_mean': quarter_mean(20211),
        'externalkey_year_21q3_price_mean': quarter_mean(20212),
        'externalkey_year_21q4_price_mean': quarter_mean(20213),

        'externalkey_year_22q1_price_mean': quarter_mean(20220),
        'externalkey_year_22q2_price_mean': quarter_mean(20221),
        'externalkey_year_22q3_price_mean': quarter_mean(20222),
        'externalkey_year_22q4_price_mean': quarter_mean(20223),
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

### 製作實價登錄不動產租賃之對應特徵

In [None]:
# Switch the external source to the lease data file.
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data_lease.csv')

# Composite lookup key (city, district, road name, main usage, building type),
# built the same way on the external data and on both modelling frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名'] + '_'
        + frame['主要用途'] + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Aggregate the external rows sharing the row's key into lease features.

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID', the
        mean/std/max/min of '單價', the (max - min) / mean ratio, the number
        of matched rows, and the mean external '屋齡' minus ``row['屋齡']``.
        The land/building area means (land1..land6) are not emitted here.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    unit_price = matches['單價']
    average = unit_price.mean()
    highest = unit_price.max()
    lowest = unit_price.min()

    return {
        'ID': row['ID'],
        'externalkey_lease_price_mean': average,
        'externalkey_lease_price_std': unit_price.std(),
        'externalkey_lease_price_max': highest,
        'externalkey_lease_price_min': lowest,
        'externalkey_lease_price_max_min_ratio': (highest - lowest) / average,
        'externalkey_lease_price_cnt': len(matches),
        'externalkey_lease_age_diff_mean': matches['屋齡'].mean() - row['屋齡'],
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

In [None]:
# Composite lookup key: city, district, road name, total floors, main usage,
# building type - built the same way on all three frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_' + frame['主要用途'] + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Aggregate the external rows sharing the row's key ("samebuilding" lease features).

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID', the
        mean/std/max/min of '單價', the (max - min) / mean ratio, the number
        of matched rows, and the mean external '屋齡' minus ``row['屋齡']``.
        The land/building area means (land1..land6) are not emitted here.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    unit_price = matches['單價']
    average = unit_price.mean()
    highest = unit_price.max()
    lowest = unit_price.min()

    return {
        'ID': row['ID'],
        'externalkey_lease_samebuilding_price_mean': average,
        'externalkey_lease_samebuilding_price_std': unit_price.std(),
        'externalkey_lease_samebuilding_price_max': highest,
        'externalkey_lease_samebuilding_price_min': lowest,
        'externalkey_lease_samebuilding_price_max_min_ratio': (highest - lowest) / average,
        'externalkey_lease_samebuilding_price_cnt': len(matches),
        'externalkey_lease_samebuilding_age_diff_mean': matches['屋齡'].mean() - row['屋齡'],
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

In [None]:
# Composite lookup key: city, district, road name, total floors, main usage,
# transfer floor, building type - built the same way on all three frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_' + frame['主要用途'] + '_'
        + frame['移轉層次'].astype(str) + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Aggregate the external rows sharing the row's key ("samefloor" lease features).

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID', the
        mean/std/max/min of '單價', the (max - min) / mean ratio, the number
        of matched rows, and the mean external '屋齡' minus ``row['屋齡']``.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    unit_price = matches['單價']
    average = unit_price.mean()
    highest = unit_price.max()
    lowest = unit_price.min()

    return {
        'ID': row['ID'],
        'externalkey_lease_samefloor_price_mean': average,
        'externalkey_lease_samefloor_price_std': unit_price.std(),
        'externalkey_lease_samefloor_price_max': highest,
        'externalkey_lease_samefloor_price_min': lowest,
        'externalkey_lease_samefloor_price_max_min_ratio': (highest - lowest) / average,
        'externalkey_lease_samefloor_price_cnt': len(matches),
        'externalkey_lease_samefloor_age_diff_mean': matches['屋齡'].mean() - row['屋齡'],
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

In [None]:
# Composite lookup key: city, district, road name, total floors, main usage,
# transfer floor, building type and parking-space count (as int) - built the
# same way on all three frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_' + frame['主要用途'] + '_'
        + frame['移轉層次'].astype(str) + '_' + frame['建物型態'] + '_'
        + frame['車位個數'].astype(int).astype(str)
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Aggregate the external rows sharing the row's key ("samefloor_samecar" lease features).

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``.

    Args:
        row: Mapping providing at least 'ID', 'key' and '屋齡'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID', the
        mean of '單價', the count of non-null '單價' values, and the mean
        external '屋齡' minus ``row['屋齡']``.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    unit_price = matches['單價']
    age_gap = matches['屋齡'].mean() - row['屋齡']
    return {
        'ID': row['ID'],
        'externalkey_lease_samefloor_samecar_price_mean': unit_price.mean(),
        'externalkey_lease_samefloor_samecar_price_cnt': unit_price.count(),
        'externalkey_lease_samefloor_samecar_age_diff_mean': age_gap,
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

In [None]:
# Composite lookup key: city, district, road name, main usage, building age
# truncated to an integer, building type - built the same way on all three frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_' + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_' + frame['主要用途'] + '_'
        + frame['屋齡'].astype(int).astype(str) + '_' + frame['建物型態']
    )

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Aggregate the external rows sharing the row's key ("sameage" lease features).

    The external rows are read from the module-level ``externalkey2subdf``
    mapping under ``row['key']``.

    Args:
        row: Mapping providing at least 'ID' and 'key'.

    Returns:
        dict: ``{}`` when the key has no external rows. Otherwise 'ID', the
        mean/std/max/min of '單價', the (max - min) / mean ratio and the
        number of matched rows. The land/building area means (land1..land6)
        are not emitted here.
    """
    matches = externalkey2subdf.get(row['key'])
    if matches is None or len(matches) == 0:
        return {}

    unit_price = matches['單價']
    average = unit_price.mean()
    highest = unit_price.max()
    lowest = unit_price.min()

    return {
        'ID': row['ID'],
        'externalkey_lease_sameage_price_mean': average,
        'externalkey_lease_sameage_price_std': unit_price.std(),
        'externalkey_lease_sameage_price_max': highest,
        'externalkey_lease_sameage_price_min': lowest,
        'externalkey_lease_sameage_price_max_min_ratio': (highest - lowest) / average,
        'externalkey_lease_sameage_price_cnt': len(matches),
    }

# Compute the features for every training row in worker processes (imap yields
# results in input order), then left-join the new columns onto df_train by ID.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_train.to_dict('records')),
            total=len(df_train),
        )
    )
df_train_features = pd.DataFrame(features)
df_train = pd.merge(df_train, df_train_features, how='left', on='ID')

# Same procedure for the validation frame.
with Pool(22) as pool:
    features = list(
        tqdm(
            pool.imap(get_external_same_building_feature, df_valid.to_dict('records')),
            total=len(df_valid),
        )
    )
df_valid_features = pd.DataFrame(features)
df_valid = pd.merge(df_valid, df_valid_features, how='left', on='ID')

### 製作實價登錄預售屋之對應特徵

In [None]:
# Switch the external source to the pre-sale data file.
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data_pre_sale.csv')

# Lookup key at road level only (city, district, road name), built the same
# way on the external data and on both modelling frames.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名']

# Replace the string keys by integer codes. The encoder is fitted on the union of
# all three frames so that equal strings receive equal codes everywhere.
col = new_col = 'key'
le = LabelEncoder()
le.fit(
    df_train[col].values.tolist()
    + df_valid[col].values.tolist()
    + df_external_gov_data[col].values.tolist()
)
for frame in (df_train, df_valid, df_external_gov_data):
    frame[new_col] = le.transform(frame[col].values.tolist())

# external datas: encoded key -> external rows carrying that key
externalkey2subdf = {code: rows for code, rows in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Summarise the external unit prices stored under the row's key.

    Looks ``row['key']`` up in the module-level ``externalkey2subdf`` mapping
    and describes the ``單價`` column of the matching records.

    Args:
        row: mapping with at least ``'ID'`` and ``'key'``.

    Returns:
        dict with the row's ID plus mean, std, max, min, (max - min) / mean and
        record count of the matched prices; an empty dict when the key has no
        entry or the entry holds no records.
    """
    matched = externalkey2subdf.get(row['key'])
    if matched is None or len(matched) == 0:
        return {}

    prices = matched['單價']
    highest = prices.max()
    lowest = prices.min()
    average = prices.mean()
    return {
        'ID': row['ID'],
        'externalkey_pre_sale_price_mean': average,
        'externalkey_pre_sale_price_std': prices.std(),
        'externalkey_pre_sale_price_max': highest,
        'externalkey_pre_sale_price_min': lowest,
        # spread of the prices relative to their mean
        'externalkey_pre_sale_price_max_min_ratio': (highest - lowest) / average,
        'externalkey_pre_sale_price_cnt': len(matched),
    }

# Training rows: compute the pre-sale price features in worker processes and
# join them onto df_train by ID.
with Pool(22) as pool:
    feature_iter = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = df_train.merge(df_train_features, on='ID', how='left')

# Validation rows: identical procedure.
with Pool(22) as pool:
    feature_iter = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = df_valid.merge(df_valid_features, on='ID', how='left')

### 基於實際距離(以訓練與驗證資料 df_train + df_valid 的座標中位數來獲得每個路名的中心點位置)製作實價登錄不動產交易之相關特徵
### 每段路如果中心點距離不超過500公尺，則會視做同一群體，詳見road_near_road_infos這個變數的生成

In [None]:
# Two roads count as neighbours when their centre points are at most this far
# apart (same unit as the value returned by get_distance_without_cache).
threshold = 500

tmp_use_cols = ['縣市', '鄉鎮市區', '路名', 'twd97_lon', 'twd97_lat']
df_all = pd.concat([df_train[tmp_use_cols], df_valid[tmp_use_cols]]).reset_index(drop=True)

# Centre point of every road: (median lon, median lat) over all train + valid rows on it.
road_coordinates = {
    f'{city1}_{city2}_{city3}': (subdf['twd97_lon'].median(), subdf['twd97_lat'].median())
    for (city1, city2, city3), subdf in df_all.groupby(['縣市', '鄉鎮市區', '路名'])
}

# For each road, list every *other* road whose centre lies within the threshold.
# The helper is called with (lat, lon, lat, lon), hence the swapped order.
road_near_road_infos = {}
for key1, (lon1, lat1) in tqdm(road_coordinates.items()):
    road_near_road_infos[key1] = [
        key2
        for key2, (lon2, lat2) in road_coordinates.items()
        if key1 != key2 and get_distance_without_cache(lat1, lon1, lat2, lon2) <= threshold
    ]

# Quick look at how many neighbours a road typically has.
tmp_datas = [
    {'key': key1, 'len': len(nearby)}
    for key1, nearby in tqdm(road_near_road_infos.items())
]
tmp_df = pd.DataFrame(tmp_datas)
tmp_df.describe()

In [None]:
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data.csv')

# Rebuild the road key as a plain '<縣市>_<鄉鎮市區>_<路名>' string on all three
# tables. It is not label-encoded here: road_near_road_infos is keyed by
# strings of this same form.
for frame in (df_external_gov_data, df_train, df_valid):
    frame['key'] = frame['縣市'] + '_' + frame['鄉鎮市區'] + '_' + frame['路名']

# road key -> every external record on that road
externalkey2subdf = {key: subdf for key, subdf in df_external_gov_data.groupby('key')}

def get_external_same_building_feature(row):
    """Describe external unit prices on the roads neighbouring the row's road.

    Gathers the external records (module-level ``externalkey2subdf``) of every
    road listed for ``row['key']`` in the module-level ``road_near_road_infos``,
    keeps the records whose ``建物型態`` and ``主要用途`` equal the row's, and
    summarises their ``單價``.

    Args:
        row: mapping with ``'ID'``, ``'key'``, ``'建物型態'`` and ``'主要用途'``.

    Returns:
        dict with the row's ID and the mean / standard deviation of the matching
        prices. An empty dict when the key is unknown, the road has no
        neighbours, no neighbour has external records, or no record matches.
    """
    # .get(): a key missing from the neighbour map is treated as "no
    # neighbours" instead of raising KeyError inside a worker process.
    nearby_roads = road_near_road_infos.get(row['key'], [])
    subdfs = [externalkey2subdf[road] for road in nearby_roads if road in externalkey2subdf]
    if len(subdfs) == 0:
        return {}

    subdf = pd.concat(subdfs).reset_index(drop=True)
    # Compare the values directly rather than interpolating them into a query
    # string, so a value containing a double quote cannot break the expression.
    same_kind = (subdf['建物型態'] == row['建物型態']) & (subdf['主要用途'] == row['主要用途'])
    prices = subdf.loc[same_kind, '單價']
    if len(prices) == 0:
        return {}

    return {
        'ID': row['ID'],
        'externalkey_nearby_roads_price_mean': prices.mean(),
        'externalkey_nearby_roads_price_std': prices.std(),
    }

# Training rows: compute the nearby-road price features in worker processes
# and join them onto df_train by ID.
with Pool(22) as pool:
    feature_iter = pool.imap(get_external_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = df_train.merge(df_train_features, on='ID', how='left')

# Validation rows: identical procedure.
with Pool(22) as pool:
    feature_iter = pool.imap(get_external_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = df_valid.merge(df_valid_features, on='ID', how='left')

### 原始資料集基於 縣市+鄉鎮市區+路名+總樓層數+主要用途+建物型態 所建之相關單價、屋齡、面積之統計特徵

In [None]:
# Composite key: 縣市 + 鄉鎮市區 + 路名 + 總樓層數 + 主要用途 + 建物型態.
for frame in (df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['路名'] + '_'
        + frame['總樓層數'].astype(str) + '_'
        + frame['主要用途'] + '_'
        + frame['建物型態']
    )

# Integer-encode the key with an encoder fitted on train + valid together.
col = new_col = 'key'
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist())
for frame in (df_train, df_valid):
    frame[new_col] = le.transform(frame[col].values.tolist())

# encoded key -> training rows sharing that key
key2subdf = {key: subdf for key, subdf in df_train.groupby('key')}

def get_same_building_feature(row):
    """Describe the other records that share the row's key.

    Takes the records stored for ``row['key']`` in the module-level
    ``key2subdf`` mapping, drops the one whose ID equals the row's own ID, and
    summarises price, age and the two area-difference columns of the rest.

    Args:
        row: mapping with ``'ID'``, ``'key'`` and ``'age'``.

    Returns:
        dict with the row's ID and the ``key_*`` statistics, or an empty dict
        when the key is unknown or no other record remains.
    """
    group = key2subdf.get(row['key'])
    if group is None:
        return {}
    # Exclude the record carrying the row's own ID.
    peers = group.query(f'ID != "{row["ID"]}"')
    if len(peers) == 0:
        return {}

    price = peers['price']
    age = peers['age']
    building_minus_room = peers['building_area_minus_room_area']
    room_minus_building = peers['room_area_minus_building_area']

    price_mean, price_max, price_min = price.mean(), price.max(), price.min()
    age_mean, age_max, age_min = age.mean(), age.max(), age.min()

    return {
        'ID': row['ID'],

        # unit price of the peers
        'key_price_mean': price_mean,
        'key_price_std': price.std(),
        'key_price_max': price_max,
        'key_price_min': price_min,
        'key_price_max_min_ratio': (price_max - price_min) / price_mean,
        'key_price_cnt': len(peers),

        # age of the peers, and how far this row's age is from their mean
        'key_age_mean': age_mean,
        'key_age_std': age.std(),
        'key_age_max': age_max,
        'key_age_min': age_min,
        'key_age_max_min_ratio': (age_max - age_min) / age_mean,
        'key_age_minus_diff': row['age'] - age_mean,

        # area-difference columns of the peers
        'key_building_area_minus_room_area_mean': building_minus_room.mean(),
        'key_building_area_minus_room_area_std': building_minus_room.std(),
        'key_building_area_minus_room_area_min': building_minus_room.min(),
        'key_building_area_minus_room_area_max': building_minus_room.max(),

        'key_room_area_minus_building_area_mean': room_minus_building.mean(),
        'key_room_area_minus_building_area_std': room_minus_building.std(),
        'key_room_area_minus_building_area_min': room_minus_building.min(),
        'key_room_area_minus_building_area_max': room_minus_building.max(),
    }
# Training rows: compute the same-key statistics in worker processes and join
# them onto df_train by ID.
with Pool(22) as pool:
    feature_iter = pool.imap(get_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = df_train.merge(df_train_features, on='ID', how='left')

# Validation rows: identical procedure.
with Pool(22) as pool:
    feature_iter = pool.imap(get_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = df_valid.merge(df_valid_features, on='ID', how='left')

### 原始資料集基於 縣市+鄉鎮市區+主要用途+建物型態 所建之相關 屋齡 之統計特徵

In [None]:
# District-level key: 縣市 + 鄉鎮市區 + 主要用途 + 建物型態.
for frame in (df_train, df_valid):
    frame['key'] = (
        frame['縣市'] + '_'
        + frame['鄉鎮市區'] + '_'
        + frame['主要用途'] + '_'
        + frame['建物型態']
    )

# Integer-encode the key with an encoder fitted on train + valid together.
col = new_col = 'key'
le = LabelEncoder()
le.fit(df_train[col].values.tolist() + df_valid[col].values.tolist())
for frame in (df_train, df_valid):
    frame[new_col] = le.transform(frame[col].values.tolist())

# encoded key -> training rows sharing that key
key2subdf = {key: subdf for key, subdf in df_train.groupby('key')}

def get_same_building_feature(row):
    """Describe the age of the other records that share the row's key.

    Takes the records stored for ``row['key']`` in the module-level
    ``key2subdf`` mapping, drops the one whose ID equals the row's own ID, and
    summarises the ``age`` column of the rest.

    Args:
        row: mapping with ``'ID'``, ``'key'`` and ``'age'``.

    Returns:
        dict with the row's ID and the ``key_city12_age_*`` statistics, or an
        empty dict when the key is unknown or no other record remains.
    """
    group = key2subdf.get(row['key'])
    if group is None:
        return {}
    # Exclude the record carrying the row's own ID.
    peers = group.query(f'ID != "{row["ID"]}"')
    if len(peers) == 0:
        return {}

    age = peers['age']
    age_mean, age_max, age_min = age.mean(), age.max(), age.min()
    return {
        'ID': row['ID'],
        'key_city12_age_mean': age_mean,
        'key_city12_age_std': age.std(),
        'key_city12_age_max': age_max,
        'key_city12_age_min': age_min,
        'key_city12_age_max_min_ratio': (age_max - age_min) / age_mean,
        # how far this row's age is from the mean age of its peers
        'key_city12_age_minus_diff': row['age'] - age_mean,
    }
# Training rows: compute the district-level age statistics in worker
# processes and join them onto df_train by ID.
with Pool(22) as pool:
    feature_iter = pool.imap(get_same_building_feature, df_train.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_train)))
df_train_features = pd.DataFrame(features)
df_train = df_train.merge(df_train_features, on='ID', how='left')

# Validation rows: identical procedure.
with Pool(22) as pool:
    feature_iter = pool.imap(get_same_building_feature, df_valid.to_dict('records'))
    features = list(tqdm(feature_iter, total=len(df_valid)))
df_valid_features = pd.DataFrame(features)
df_valid = df_valid.merge(df_valid_features, on='ID', how='left')

### 各種方式獲得的均價特徵的再製造(彼此相減)

In [None]:
# Mean-price features that are differenced against each other in the next cell.
# The list is spelled out explicitly (rather than taking every column ending
# in '_price_mean') so that the two nearby-road means below stay excluded.
price_mean_cols = [
    'ref_500_price_mean',
    'ref_1200_price_mean',
    'ref_3600_price_mean',
    'nearest_100_price_mean',
    'nearest_30_price_mean',
    'nearest_4_price_mean',
    'city12_price_mean',
    'key_price_mean',
    'externalkey_price_mean',
    'externalkey_samebuilding_price_mean',
    'externalkey_sameage_price_mean',
    'externalkey_sameage_0.25_price_mean',
    # deliberately left out:
    #'externalkey_nearby_roads_price_mean',
    #'key_nearyby_roads_price_mean'
]
len(price_mean_cols)

In [None]:
# One column per ordered pair (a, b) of distinct mean-price features, holding
# a - b. Both (a, b) and (b, a) are produced.
price_mean_pairs = [
    (minuend, subtrahend)
    for minuend in price_mean_cols
    for subtrahend in price_mean_cols
    if minuend != subtrahend
]
for minuend, subtrahend in price_mean_pairs:
    new_col = f'price_minus_feature_{minuend}_minus_{subtrahend}'
    df_train[new_col] = df_train[minuend] - df_train[subtrahend]
    df_valid[new_col] = df_valid[minuend] - df_valid[subtrahend]

In [None]:
price_mean_cols = [
    'ref_500_price_mean',
    'ref_1200_price_mean',
    'ref_3600_price_mean',
    'nearest_100_price_mean',
    'nearest_30_price_mean',
    'nearest_4_price_mean',
    'city12_price_mean',
    'key_price_mean',
    'externalkey_price_mean',
    'externalkey_samebuilding_price_mean',
    'externalkey_sameage_price_mean',
    'externalkey_sameage_0.25_price_mean',
]

# Difference between 'externalkey_exactly_same_price_mean' and each of the
# mean-price features listed above (one direction only).
price_mean_col1 = 'externalkey_exactly_same_price_mean'
for price_mean_col2 in price_mean_cols:
    if price_mean_col2 != price_mean_col1:
        new_col = f'price_minus_feature_{price_mean_col1}_minus_{price_mean_col2}'
        df_train[new_col] = df_train[price_mean_col1] - df_train[price_mean_col2]
        df_valid[new_col] = df_valid[price_mean_col1] - df_valid[price_mean_col2]

### 額外資料特徵

In [None]:
# District-level statistics are joined on the 縣市 + 鄉鎮市區 pair.
district_keys = ['縣市', '鄉鎮市區']

df_low_income = pd.read_csv('../外部資料集/low_income_info.csv')
df_train = df_train.merge(df_low_income, on=district_keys, how='left')
df_valid = df_valid.merge(df_low_income, on=district_keys, how='left')

df_population = pd.read_csv('../外部資料集/population_info.csv')
df_train = df_train.merge(df_population, on=district_keys, how='left')
df_valid = df_valid.merge(df_population, on=district_keys, how='left')

# Copy the merged columns under ASCII names.
column_transforms = {
    '低收入戶戶數': 'low_income_group_cnt',
    '低收入戶人口數': 'low_income_people_cnt',
    '行政區人口數': 'population_cnt',
    '行政區土地面積': 'population_area',
    '行政區人口密度': 'population_density'
}
for col, new_col in column_transforms.items():
    for frame in (df_train, df_valid):
        frame[new_col] = frame[col].values

# Share of the district population counted as low-income.
for frame in (df_train, df_valid):
    frame['low_income_rate'] = frame['低收入戶人口數'] / frame['行政區人口數']

In [None]:
# The row's own area columns ...
area_cols1 = ['room_area', 'car_area', 'building_area']
# ... and the external mean-area columns they are compared against.
area_cols2 = [
    'externalkey_land2_mean',
    'externalkey_land3_mean',
    'externalkey_land4_mean',
    'externalkey_samebuilding_land2_mean',
    'externalkey_samebuilding_land3_mean',
    'externalkey_samebuilding_land4_mean',
]

# One '<own>_minus_<reference>' column per combination. The names of the new
# columns are collected in new_area_cols.
new_area_cols = []
for own_col in area_cols1:
    for ref_col in area_cols2:
        if own_col != ref_col:
            diff_col = f'{own_col}_minus_{ref_col}'
            df_train[diff_col] = df_train[own_col] - df_train[ref_col]
            df_valid[diff_col] = df_valid[own_col] - df_valid[ref_col]
            new_area_cols.append(diff_col)

In [None]:
def get_floor_cat(floor):
    """Bucket a floor number into categories 1-5.

    1: floor <= 3, 2: floor <= 6, 3: floor <= 12, 4: floor < 20, 5: otherwise.
    Note the last boundary is exclusive: 20 itself falls in category 5.
    """
    for category, upper in enumerate((3, 6, 12), start=1):
        if floor <= upper:
            return category
    return 4 if floor < 20 else 5

def get_age_cat(age):
    """Bucket an age into categories 1-7.

    Categories 1-6 have the inclusive upper bounds 2, 5, 10, 20, 30 and 40;
    anything that passes none of them is category 7.
    """
    upper_bounds = (2, 5, 10, 20, 30, 40)
    for category, upper in enumerate(upper_bounds, start=1):
        if age <= upper:
            return category
    return len(upper_bounds) + 1
    
# (new column, source column, bucketing function) applied to both frames.
for target_col, source_col, categorize in (
    ('floor_cat', 'floor', get_floor_cat),
    ('total_floor_cat', 'total_floor', get_floor_cat),
    ('age_cat', 'age', get_age_cat),
):
    df_train[target_col] = df_train[source_col].apply(categorize)
    df_valid[target_col] = df_valid[source_col].apply(categorize)

### 移除不必要的欄位，這邊用.isascii()來刪除原始的欄位，僅保留alphabet的欄位

In [None]:
# Keep the ASCII-named columns only, and leave out the 'key' column as well.
use_cols = [c for c in df_train.columns if c.isascii() and c != 'key']

In [None]:
# Write the selected feature columns of both frames, without the index.
for frame, out_path in (
    (df_train, '../官方資料集/final_feature_engineering_train.csv'),
    (df_valid, '../官方資料集/final_feature_engineering_valid.csv'),
):
    frame[use_cols].to_csv(out_path, index=False)