In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import lightgbm as lgb
import time

from sklearn.model_selection import KFold

In [2]:
def num_null(df):
    """Print the number of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Prints a header line followed by one ``<column> : <count>`` line for each
    column that contains at least one null. Returns ``None``.
    """
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    # Walk (label, count) pairs rather than indexing ``missing[i]``: an integer
    # key on a label-indexed Series is a deprecated positional fallback in
    # pandas and is interpreted as a label in newer releases.
    for column, count in missing.items():
        if count:
            print(column, ':', count)

            
def metric(truth, pred):
    """Print the hit rate: the share of predictions within 10% of the truth.

    Both arguments are converted to NumPy arrays. The relative error is
    ``|pred - truth| / truth`` and the printed value is the fraction of
    entries whose relative error is at most 0.1. Nothing is returned.
    """
    actual = np.array(truth)
    estimate = np.array(pred)
    relative_error = abs(estimate - actual) / actual
    hits = int(np.sum(relative_error <= 0.1))
    print(hits / len(relative_error))
    
    
def metric_df(df):
    """Return the hit rate for a frame holding ``truth`` and ``pred`` columns.

    The hit rate is the fraction of rows whose relative error
    ``|pred - truth| / truth`` is at most 0.1.
    """
    actual, estimate = (np.array(df[name]) for name in ('truth', 'pred'))
    within_tolerance = abs(estimate - actual) / actual <= 0.1
    return int(np.sum(within_tolerance)) / len(within_tolerance)

In [3]:
# Load the training and test tables from the local ./dataset directory.
train = pd.read_csv('./dataset/train.csv')
test = pd.read_csv('./dataset/test.csv')

In [4]:
# Report per-column missing-value counts for the training table.
num_null(train)

Show #missing in the columns:
parking_area : 56897
parking_price : 46065
txn_floor : 15902
village_income_median : 1142


In [5]:
# Replace the target with log1p(total_price / building_area), i.e. the log of
# the price per unit of building area.
# NOTE: this overwrites train['total_price'] in place, so running this cell a
# second time applies the transform twice; reload `train` before re-running.
train['total_price'] = np.log1p(train["total_price"] / train["building_area"])

In [6]:
# Stack the training features (target removed) on top of the test rows and
# renumber the rows 0..n-1 so both parts share one clean index.
X = pd.concat([train.drop(columns=['total_price']), test], ignore_index=True)

#### Imputation

missing value imputation strategy:   
parking_area：補零  
parking_price：中位數  
txn_floor：中位數  
village_income_median：先以同一 village 的平均值補值，仍缺者再以同一 town 的中位數補值（原先以經緯度取 kNN 的做法已停用）

In [7]:
# Report per-column missing-value counts for the combined train + test features.
num_null(X)

Show #missing in the columns:
parking_area : 66397
parking_price : 53775
txn_floor : 18541
village_income_median : 1326


In [8]:
# Turn off pandas' chained-assignment warning for the whole session.
# NOTE(review): this hides warnings from later cells that assign into frames
# derived from slices; those assignments should be checked rather than silenced.
pd.options.mode.chained_assignment = None
# from sklearn.neighbors import NearestNeighbors
# def knn_impute(df, imputed_column, by_column_list):

#     imputed_index = df.index[df[imputed_column].isnull()]
# #     print(imputed_index)
#     knner = df.loc[:, by_column_list]
#     knner.drop(imputed_index, inplace=True)
#     neigh = NearestNeighbors()
#     neigh.fit(knner)
    
# #     print(df.loc[:, by_column_list].iloc[imputed_index, :].shape)
#     neighbor_index = neigh.kneighbors(df.loc[:, by_column_list].iloc[imputed_index, :],return_distance=False)  #get the index
# #     print(neighbor_index.shape)
#     for count, index in enumerate(imputed_index):
#         df[imputed_column][index] = df.iloc[neighbor_index[count], :][imputed_column].mean()

In [9]:
# Fill the simple missing values on a new frame so X itself stays untouched:
#   parking_area  -> 0
#   parking_price -> column median
#   txn_floor     -> column median
# A single frame-level fillna replaces `X_imp[col].fillna(..., inplace=True)`:
# an in-place call on a column selection is chained mutation, which does not
# update the parent frame when pandas Copy-on-Write is active.
X_imp = X.fillna({
    'parking_area': 0,
    'parking_price': X['parking_price'].median(),
    'txn_floor': X['txn_floor'].median(),
})
# Alternative tried for txn_floor (not used): fill with total_floor / 2.

In [10]:
# Impute village_income_median in two passes, each coarser than the last:
#   1. the mean of the known values within the same village;
#   2. the median of the (now partly filled) values within the same town.
# groupby().transform() broadcasts each group statistic back onto its rows, so
# the fill is vectorised and assigns a whole column at once. The previous
# row-by-row `X_imp[col][i] = ...` loop was chained assignment, which is slow
# and does not write through to X_imp when pandas Copy-on-Write is active.
# (An earlier kNN-on-lat/lon variant is disabled.)
village_mean = X_imp.groupby('village')['village_income_median'].transform('mean')
X_imp['village_income_median'] = X_imp['village_income_median'].fillna(village_mean)

# Rows whose village has no known value at all are still missing here.
num_null(X_imp)

town_median = X_imp.groupby('town')['village_income_median'].transform('median')
X_imp['village_income_median'] = X_imp['village_income_median'].fillna(town_median)

num_null(X_imp)

Show #missing in the columns:
village_income_median : 938
Show #missing in the columns:


In [11]:
# X_imp['dt_diff'] = X_imp['txn_dt'] - X_imp['building_complete_dt']

In [12]:
# X_imp.drop(list(X_imp.filter(regex=('_MIN'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_index_'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('^N'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_10$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_50$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_100$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_250$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_500$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_1000$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_5000$'))), axis=1, inplace=True)
# print(X_imp.shape)

In [13]:
# concat_train = pd.concat([X_train, Y_train], axis=1)
# Independent copy of the training frame. Within this cell it is referenced
# only by the commented-out target-encoding experiments.
concat_train = train.copy()

# cat_mean_df = concat_train.groupby('building_material').agg({'total_price':'mean'})
# X_imp['building_material'] = list(cat_mean_df.loc[list(X_imp['building_material']), :]['total_price'])

# cat_mean_df = concat_train.groupby('city').agg({'total_price':'mean'})
# X_imp['city'] = list(cat_mean_df.loc[list(X_imp['city']), :]['total_price'])

# cat_mean_df = concat_train.groupby('town').agg({'total_price':'mean'})
# X_imp['town'] = list(cat_mean_df.loc[list(X_imp['town']), :]['total_price'])

# cat_mean_df = concat_train.groupby('village').agg({'total_price':'mean'})
# X_imp['village'] = list(cat_mean_df.loc[list(X_imp['village']), :]['total_price'])

# cat_mean_df = concat_train.groupby('building_type').agg({'total_price':'mean'})
# X_imp['building_type'] = list(cat_mean_df.loc[list(X_imp['building_type']), :]['total_price'])

# cat_mean_df = concat_train.groupby('building_use').agg({'total_price':'mean'})
# X_imp['building_use'] = list(cat_mean_df.loc[list(X_imp['building_use']), :]['total_price'])

# cat_mean_df = concat_train.groupby('parking_way').agg({'total_price':'mean'})
# X_imp['parking_way'] = list(cat_mean_df.loc[list(X_imp['parking_way']), :]['total_price'])





# One-hot encode the categorical columns in a single call instead of five
# copy-pasted stanzas. For each listed column, get_dummies:
#   * names the indicators `<column>_<level>` (default prefix and separator),
#   * drops the first level (`drop_first=True`) as the reference category,
#   * removes the original column and appends the indicators at the end,
#     in the order the columns are listed here.
# `town` and `village` are intentionally left un-encoded.
one_hot_columns = ['building_material', 'city', 'building_type', 'building_use', 'parking_way']
X_imp = pd.get_dummies(X_imp, columns=one_hot_columns, drop_first=True)


In [14]:
# X_imp.drop(['parking_price', 'parking_area', 'parking_way'], axis=1, inplace = True)

In [15]:
# Split the combined feature frame back into its train and test parts.
# The boundary comes from len(train) rather than hard-coded 60000 / 10000 row
# counts, so the cell stays correct if the dataset size changes.
n_train = len(train)

# .drop() returns a new frame, so the column assignment below works on an
# independent object and not on a slice of X_imp.
post_train = X_imp.iloc[:n_train].drop(columns='building_id')
# Rows of X_imp were stacked in train order, so the target is attached by position.
post_train['total_price'] = train['total_price'].to_numpy()

test_rows = X_imp.iloc[n_train:]
# Keep the ids aside for the submission file; they are not a model feature.
test_building_id = test_rows['building_id']
X_test = test_rows.drop(columns='building_id')

In [16]:
# The one-hot step removed these raw categorical columns; put the original
# values back on post_train so the validation results can be broken down by them.
recover_colnames = ['building_material', 'building_use', 'building_type', 'parking_way', 'city']
# Columns are re-attached in this explicit order so that post_train's column
# layout is the same as before.
for restored in ('building_material', 'city', 'building_type', 'building_use', 'parking_way'):
    post_train[restored] = train[restored]

In [17]:
# X_train, X_valid, Y_train, Y_valid = train_test_split(
#                         post_train.drop('total_price', axis=1), post_train['total_price'], test_size=0.2, random_state=42)
# 5-fold cross-validation of the LightGBM model on the log unit-price target.
# The shuffle is seeded so fold membership, and therefore the printed scores
# and the saved validation frame, are reproducible across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Column template for the saved frame: raw train columns, then pred/truth.
columns = list(train.columns) + ['pred', 'truth']

# Split post_train once into model features, target, and the raw categorical
# columns that are kept only for reporting.
feature_frame = post_train.drop(columns=['total_price'] + recover_colnames)
target = post_train['total_price']
recovered = post_train[recover_colnames]

fold_results = []
for train_index, test_index in kf.split(post_train):
    # kf.split yields positional indices, so select with iloc (loc would only
    # work while the index happens to be 0..n-1).
    X_train, X_valid = feature_frame.iloc[train_index], feature_frame.iloc[test_index]
    Y_train, Y_valid = target.iloc[train_index], target.iloc[test_index]
    recover_columns_df = recovered.iloc[test_index]

    ts = time.time()

    model = lgb.LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
    model.fit(X_train, Y_train)

    # Seconds spent fitting this fold.
    print(time.time() - ts)

    # Undo the log1p transform and floor, for both predictions and truth,
    # before scoring on the unit-price scale.
    Y_valid_predict = np.floor(np.expm1(model.predict(X_valid)))
    Y_valid = np.floor(np.expm1(Y_valid))
    metric(Y_valid, Y_valid_predict)

    # Per-fold result: raw categoricals, features, target, prediction and truth.
    valid_subresult = pd.concat([recover_columns_df, X_valid, Y_valid], axis=1)
    valid_subresult["pred"] = Y_valid_predict
    valid_subresult["truth"] = Y_valid
    fold_results.append(valid_subresult)

# Concatenate once instead of growing a frame inside the loop, then lay the
# columns out as template columns first (absent ones become all-NaN) followed
# by the remaining feature columns.
valid_result = pd.concat(fold_results)
extra_columns = [name for name in valid_result.columns if name not in columns]
valid_result = valid_result.reindex(columns=columns + extra_columns)

valid_result.to_pickle('./output/lgbm_area_kfold_valid_result.pkl')
# The final fit expects post_train without the raw categorical columns.
post_train = post_train.drop(columns=recover_colnames)

1 building_material
57.6415536403656
0.5528333333333333


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




1 building_material
60.93144750595093
0.5556666666666666
1 building_material
58.31770849227905
0.5499166666666667
1 building_material
66.04477214813232
0.5473333333333333
1 building_material
62.071139097213745
0.54625


Unnamed: 0,txn_dt,total_floor,building_complete_dt,parking_area,parking_price,txn_floor,land_area,building_area,town,lat,...,building_use_3,building_use_4,building_use_5,building_use_6,building_use_7,building_use_8,building_use_10,parking_way_1,parking_way_2,total_price
0,18674,4,6271,0.000000,43791.947141,3.0,18.144460,3.418175,334,-39.14,...,0,0,0,0,0,0,0,0,1,12.151933
1,18800,5,7885,0.000000,43791.947141,5.0,11.387227,4.041309,180,-37.66,...,0,0,0,0,0,0,0,0,1,13.619345
2,19289,4,6028,0.000000,43791.947141,1.0,21.426802,5.584279,180,-37.67,...,0,0,0,0,0,0,0,0,1,14.354282
3,20385,24,18325,0.000000,81138.889762,13.0,11.387227,13.563031,343,-39.13,...,0,0,0,0,0,0,0,0,0,13.862462
4,20657,2,6880,0.000000,43791.947141,4.0,61.306524,4.688108,102,-39.24,...,0,0,0,0,0,0,0,0,1,11.999613
5,18394,5,6576,0.000000,43791.947141,4.0,14.803422,3.623131,6,-39.14,...,0,0,0,0,0,0,0,0,1,11.602981
6,20474,15,16559,0.000000,61488.568950,8.0,11.387227,7.726227,84,-37.69,...,0,0,0,0,0,0,0,0,0,13.912167
7,18185,8,11934,0.000000,6721.823057,4.0,11.387227,7.974018,61,-38.05,...,0,0,0,0,0,0,0,1,0,12.257183
8,18794,5,7640,0.000000,43791.947141,1.0,13.106100,4.041309,49,-39.19,...,0,0,0,0,0,0,0,0,1,12.405883
9,19927,10,9863,0.000000,61488.568950,2.0,11.387227,7.480303,180,-37.67,...,0,0,0,0,0,0,0,0,0,14.272642


#### Train on all training data and make predictions for the test data

In [21]:
# Refit the same LightGBM configuration on every training row and report the
# elapsed wall-clock seconds.
ts = time.time()

model = lgb.LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)

final_features = post_train.drop(columns='total_price')
final_target = post_train['total_price']
model.fit(final_features, final_target)

print(time.time() - ts)

63.96231198310852


In [22]:
# The model predicts log1p(price per unit area): invert the transform, floor
# the unit price, then scale by building_area to get the total price.
unit_price_pred = np.floor(np.expm1(model.predict(X_test)))
Y_test_predict = unit_price_pred * X_test['building_area']

In [23]:
# Assemble the submission table (one row per test building) and write it to CSV.
# Both inputs are Series derived from X_test, so they share its row index.
submission = pd.DataFrame({
    "building_id": test_building_id, 
    "total_price": Y_test_predict
})
# index=False keeps the row index out of the file.
submission.to_csv('./output/submission_lgbm_area_kfold_trainByAlltrain.csv', index=False)

In [24]:
# Display the submission frame as a visual sanity check.
submission

Unnamed: 0,building_id,total_price
60000,X5gsdTWGS3W7JJQB,1.138752e+07
60001,BTshNOJyKHnT2YIT,3.783904e+06
60002,dhdymr0lV8N5kZOT,1.040318e+07
60003,VEwyGGMcD56w5BOc,5.944155e+06
60004,wmUeMoJZfsqaSX9b,1.012244e+06
60005,EtBjGAHmHCe9t7TZ,3.064548e+06
60006,hPNH34vmaZtvBtqc,1.207845e+07
60007,wXjeI38bYDMJJwZC,6.115362e+06
60008,fxZSGX6aPAFKU8W4,1.625644e+06
60009,ewr0Fx6ign87OwaV,4.651442e+06
