In [None]:
import glob
import pandas as pd
import matplotlib.pyplot as plt

import pyproj
from tqdm import tqdm
import folium
import json
import glob
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import numpy as np
import random
import haversine as hs
from multiprocessing import Pool
import lightgbm as lgb
from catboost import CatBoostRegressor
import xgboost as xgb
import datetime
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn import neighbors
from sklearn.linear_model import ElasticNet
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

# Matplotlib: select a font for Chinese text and keep the minus sign displayable.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft JhengHei'],
    'axes.unicode_minus': False,
})
# Pandas: show every column when displaying wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Official files: training_data.csv -> df_train, public_dataset.csv -> df_valid,
# private_dataset.csv -> df_test.
df_train, df_valid, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../官方資料集/training_data.csv',
        '../官方資料集/public_dataset.csv',
        '../官方資料集/private_dataset.csv',
    )
)

In [None]:
# External transaction records; later passed to mapping_external_gov_data_price
# as the source of the per-key price statistics.
df_external_gov_data = pd.read_csv('../外部資料集/實價登錄/external_gov_data.csv')

In [None]:
# Preview of the composite key: the selected columns of each row are converted
# to strings and joined with '_'.
by = ['縣市', '鄉鎮市區', '路名', '主要用途', '建物型態']
df_train['key'] = df_train[by].apply(
    lambda parts: '_'.join(map(str, parts)),
    axis=1,
)
df_train['key']

In [None]:
# external data
# Module-level state shared by the two functions below:
#   externalkey2subdf - encoded group key -> external rows sharing that key;
#                       rebuilt by every mapping_external_gov_data_price call.
#   key_col           - name of the column holding the encoded group key.
#   _new_col_name     - feature prefix of the most recent mapping call.
externalkey2subdf = {}
key_col = 'key'
_new_col_name = ''


def get_external_same_building_feature(row):
    """Summarise the external unit prices that share ``row``'s group key.

    Parameters
    ----------
    row : mapping
        Must provide ``row[key_col]`` (encoded group key) and ``row['ID']``.

    Returns
    -------
    dict
        ``{}`` when the key has no external rows. Otherwise the row ``ID`` plus
        mean / std / max / min of the external ``單價`` column and
        ``(max - min) / mean``, each named with the current ``_new_col_name``
        prefix.
    """
    subdf = externalkey2subdf.get(row[key_col])
    if subdf is None or len(subdf) == 0:
        return {}

    # Compute each statistic once and reuse it for the ratio.
    price = subdf['單價']
    price_mean, price_max, price_min = price.mean(), price.max(), price.min()
    return {
        'ID': row['ID'],
        f'{_new_col_name}_mean': price_mean,
        f'{_new_col_name}_std': price.std(),
        f'{_new_col_name}_max': price_max,
        f'{_new_col_name}_min': price_min,
        f'{_new_col_name}_max_min_ratio': (price_max - price_min) / price_mean,
    }


def _build_group_keys(df, by):
    """Return one '_'-joined string per row of ``df`` built from the ``by`` columns."""
    columns = [df[col].tolist() for col in by]
    return ['_'.join(map(str, values)) for values in zip(*columns)]


def _attach_price_stats(df, stats, feature_cols):
    """Left-join ``stats`` onto ``df`` by ``key_col``, replacing stale feature columns."""
    stale = [col for col in feature_cols if col in df.columns]
    return df.drop(columns=stale).merge(stats, how='left', on=key_col)


def mapping_external_gov_data_price(
    df_train,
    df_valid,
    df_external_gov_data,
    by = ['縣市', '鄉鎮市區', '路名', '主要用途', '建物型態'],
    new_col_name = 'externalkey_sameroad_price'):
    """Attach external unit-price statistics to the train and valid frames.

    Rows of the three frames are grouped by the string formed from the ``by``
    columns. For every group present in ``df_external_gov_data`` the mean, std,
    max, min and ``(max - min) / mean`` of ``單價`` are computed and joined onto
    ``df_train`` / ``df_valid`` as ``{new_col_name}_mean``, ``_std``, ``_max``,
    ``_min`` and ``_max_min_ratio``. Rows whose key has no external match get
    NaN in those columns.

    Parameters
    ----------
    df_train, df_valid : pandas.DataFrame
        Frames to enrich; both must contain the ``by`` columns.
    df_external_gov_data : pandas.DataFrame
        External records; must contain the ``by`` columns and ``單價``.
    by : list of str
        Columns that define a group.
    new_col_name : str
        Prefix of the generated feature columns.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and valid frames with the feature columns appended.

    Side effects
    ------------
    * All three input frames get (or have overwritten) a ``key_col`` column
      holding an integer code of the group key; codes follow the sorted order
      of the distinct key strings across the three frames.
    * The module globals ``_new_col_name`` and ``externalkey2subdf`` are
      refreshed so ``get_external_same_building_feature`` keeps working.

    Notes
    -----
    The statistics come from one grouped aggregation joined on the key, so a
    frame without any matching key yields NaN features instead of failing, and
    feature columns left over from an earlier call with the same
    ``new_col_name`` are replaced rather than duplicated with ``_x``/``_y``
    suffixes.
    """
    global _new_col_name, externalkey2subdf
    _new_col_name = new_col_name
    by = list(by)

    frames = (df_train, df_valid, df_external_gov_data)
    raw_keys = [_build_group_keys(frame, by) for frame in frames]

    # Encode the key strings as integers shared by all three frames.
    all_keys = pd.Series(
        [key for keys in raw_keys for key in keys], dtype=object
    )
    codes, _ = pd.factorize(all_keys, sort=True)
    offset = 0
    for frame, keys in zip(frames, raw_keys):
        frame[key_col] = codes[offset:offset + len(keys)]
        offset += len(keys)

    # Kept for callers of the row-level helper above.
    externalkey2subdf = {
        key: subdf for key, subdf in df_external_gov_data.groupby(key_col)
    }

    stats = df_external_gov_data.groupby(key_col)['單價'].agg(['mean', 'std', 'max', 'min'])
    stats['max_min_ratio'] = (stats['max'] - stats['min']) / stats['mean']
    stats = stats.add_prefix(f'{new_col_name}_')
    feature_cols = list(stats.columns)
    stats = stats.reset_index()

    df_train = _attach_price_stats(df_train, stats, feature_cols)
    df_valid = _attach_price_stats(df_valid, stats, feature_cols)
    return df_train, df_valid

In [None]:
# Attach external price statistics at four increasingly specific key levels.
# Each entry maps the feature prefix to the columns that define the group.
EXTERNAL_KEY_LEVELS = {
    'external_key1_price': ['縣市', '鄉鎮市區', '路名'],
    'external_key2_price': ['縣市', '鄉鎮市區', '路名', '主要用途', '建物型態'],
    'external_key3_price': ['縣市', '鄉鎮市區', '路名', '主要用途', '建物型態', '總樓層數'],
    'external_key4_price': ['縣市', '鄉鎮市區', '路名', '主要用途', '建物型態', '總樓層數', '移轉層次'],
}

mapping_summary = []
for feature_name, key_columns in EXTERNAL_KEY_LEVELS.items():
    df_train, df_valid = mapping_external_gov_data_price(
        df_train,
        df_valid,
        df_external_gov_data,
        by=key_columns,
        new_col_name=feature_name,
    )
    mean_col = f'{feature_name}_mean'

    # Share of training rows that found at least one external record.
    na_cnt = df_train[mean_col].isna().sum()
    mapping_rate = 1 - na_cnt / len(df_train)
    print(f'mapping_rate = {round(mapping_rate*100, 3)}%')

    mapping_summary.append({
        'feature': feature_name,
        'by': ', '.join(key_columns),
        'mapping_rate_%': round(mapping_rate * 100, 3),
        # Pearson correlation between the target and the external mean price.
        'corr_with_單價': df_train['單價'].corr(df_train[mean_col]),
    })

# One row per key level: coverage and correlation with the target.
pd.DataFrame(mapping_summary)

In [None]:
# Most frequent 附屬建物面積 values in the training frame (top 5 rows shown).
df_train['附屬建物面積'].value_counts().reset_index().head()

In [None]:
# Same check on the validation frame.
df_valid['附屬建物面積'].value_counts().reset_index().head()

In [None]:
# Same check on the test frame.
df_test['附屬建物面積'].value_counts().reset_index().head()

In [None]:
# Same check on the external data, for comparison with the official frames.
df_external_gov_data['附屬建物面積'].value_counts().reset_index().head()

In [None]:
# Peek at the first row of each school file. df_extra is overwritten by every
# cell below, so only the last file read stays in memory.
df_extra = pd.read_csv('../官方資料集/external_data/國小基本資料.csv')
df_extra.head(1)

In [None]:
df_extra = pd.read_csv('../官方資料集/external_data/國中基本資料.csv')
df_extra.head(1)

In [None]:
df_extra = pd.read_csv('../官方資料集/external_data/大學基本資料.csv')
df_extra.head(1)

In [None]:
pd.set_option('display.max_columns', None)
# df_dataset is read from the same file that was loaded into df_valid earlier.
df_dataset = pd.read_csv('../官方資料集/public_dataset.csv')
# Display one randomly chosen row.
df_dataset.sample()

In [None]:
# NOTE(review): this reload replaces the in-memory df_train, so the key column
# and external price features added by earlier cells are no longer present
# from here on — confirm this is intended.
df_train = pd.read_csv('../官方資料集/training_data.csv')

In [None]:
df_train['屋齡'].describe()

In [None]:
df_train.sample()

In [None]:
# Number of distinct (縣市, 鄉鎮市區) combinations in the training data.
len(df_train.groupby(['縣市', '鄉鎮市區']))

In [None]:
df_train['主要用途'].unique()

In [None]:
df_train['主要用途'].value_counts()

In [None]:
# 單價 distribution for two specific 主要用途 values, shown one cell each.
df_train.query('主要用途 == "住家用"')['單價'].describe()

In [None]:
df_train.query('主要用途 == "集合住宅"')['單價'].describe()

In [None]:
# Count the last character of every road name. The vectorised .str accessor
# yields NaN for missing or empty names (ignored by value_counts) instead of
# raising on them.
df_dataset['路名'].str[-1].value_counts()

In [None]:
# 縣市 values present in the public dataset: distinct values, counts, and the
# number of distinct values.
df_dataset['縣市'].unique()

In [None]:
df_dataset['縣市'].value_counts()

In [None]:
df_dataset['縣市'].nunique()

In [None]:
# Reload the training data and print describe() for every column in turn,
# separated by blank lines.
df_train = pd.read_csv('../官方資料集/training_data.csv')
for col in df_train.columns:
    print(df_train[col].describe())
    print()

In [None]:
# Per-column frequency checks on the training data.
df_train['車位面積'].value_counts()

In [None]:
df_train.columns

In [None]:
df_train['縣市'].value_counts()

In [None]:
df_train['縣市'].unique()

In [None]:
df_train['主要建材'].value_counts()

In [None]:
df_train['建物型態'].value_counts()

In [None]:
# Ten most frequent values only.
df_train['備註'].value_counts().head(10)

In [None]:
df_train['屋齡'].value_counts().head(10)
In [None]:
import pyproj
# Projection objects for the source (EPSG:3826) and target (EPSG:4326) systems;
# the next cell passes them to pyproj.transform in this order.
# NOTE(review): the init='epsg:...' form is believed to be the legacy pyproj
# syntax; it is left unchanged here because the (lon, lat) unpacking in
# get_coordinate depends on the axis order it produces — confirm before
# modernising.
twd97 = pyproj.Proj(init='epsg:3826')  # TWD97
wgs84 = pyproj.Proj(init='epsg:4326')  # WGS84

# Fresh copy of the training data; columns added by earlier cells are dropped.
df_train = pd.read_csv('../官方資料集/training_data.csv')

In [None]:
# Reusable TWD97 (EPSG:3826) -> WGS84 (EPSG:4326) transformer.
# always_xy=True fixes the argument/result order to (x, y) -> (lon, lat).
_twd97_to_wgs84 = pyproj.Transformer.from_crs('EPSG:3826', 'EPSG:4326', always_xy=True)


def get_coordinate(row):
    """Convert one record's TWD97 coordinates to WGS84 longitude/latitude.

    Parameters
    ----------
    row : mapping
        Must provide ``ID``, ``橫坐標`` (x) and ``縱坐標`` (y).

    Returns
    -------
    dict
        ``{'ID': ..., 'Lon': ..., 'Lat': ...}``.
    """
    lon, lat = _twd97_to_wgs84.transform(row['橫坐標'], row['縱坐標'])
    return {
        'ID': row['ID'],
        'Lon': lon,
        'Lat': lat
    }


# Transform all rows in one vectorised call instead of one call per row spread
# over a process pool.
lon, lat = _twd97_to_wgs84.transform(
    df_train['橫坐標'].to_numpy(),
    df_train['縱坐標'].to_numpy(),
)
df_train_features = pd.DataFrame({
    'ID': df_train['ID'].to_numpy(),
    'Lon': lon,
    'Lat': lat,
}).fillna(-999999)

# Rows of df_train_features are in df_train order, so the columns are attached
# by position; re-running the cell overwrites Lon/Lat instead of creating
# suffixed duplicates.
df_train = df_train.assign(
    Lon=df_train_features['Lon'].to_numpy(),
    Lat=df_train_features['Lat'].to_numpy(),
)

In [None]:
import folium
import json

# Preview map centred on the first training row, with one info marker per
# plotted row; each popup receives the row's values as a dict.
map_center = (df_train['Lat'].values[0], df_train['Lon'].values[0])
m = folium.Map(location=map_center, zoom_start=17)
for i, row in tqdm(df_train.iterrows()):
    marker = folium.Marker(
        location=[row['Lat'], row['Lon']],
        popup=row.to_dict(),
        icon=folium.Icon(icon="info"),
    )
    marker.add_to(m)
    # Stop once the index label exceeds 100; the row carrying that label has
    # already been added at this point.
    if i > 100:
        break
m