In [1]:
import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as lgb

In [2]:
def num_null(df):
    """Print the number of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The report goes to stdout: a header line followed by one
        ``<column> : <count>`` line per column with at least one null.
    """
    missing = df.isnull().sum()
    print('Show #missing in the columns:')
    # Walk (label, count) pairs instead of indexing ``missing[i]``: an integer
    # key on a label-indexed Series only works through a deprecated positional
    # fallback, and would pick the wrong entry if the labels were integers.
    for column, count in missing.items():
        if count:
            print(column, ':', count)


def metric(truth, pred):
    """Print the hit rate: the share of predictions within 10% of the truth.

    A prediction is a hit when ``|pred - truth| / |truth| <= 0.1``.

    Parameters
    ----------
    truth : array-like of numbers
        Ground-truth values.
    pred : array-like of numbers
        Predicted values, same shape as ``truth``.

    Returns
    -------
    None
        The hit rate (a float in [0, 1]) is printed to stdout.

    Raises
    ------
    ZeroDivisionError
        If the inputs are empty.
    """
    truth = np.asarray(truth, dtype=float)
    pred = np.asarray(pred, dtype=float)
    # Divide by |truth|, not truth: with a signed denominator every row with a
    # negative truth gets a negative "error" and is counted as a hit.
    # truth == 0 yields inf/nan, which compare False below and so count as
    # misses; errstate just keeps NumPy from warning about the division.
    with np.errstate(divide='ignore', invalid='ignore'):
        rel_err = np.abs(pred - truth) / np.abs(truth)
    within = rel_err <= 0.1
    print(np.count_nonzero(within) / within.size)

In [3]:
# Read the raw training and test tables from the local dataset folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/train.csv', './dataset/test.csv')
)

In [4]:
# train['town'] = train['city'] * 1000 + train['town']
# test['town'] = test['city'] * 1000 + test['town']

In [5]:
# Replace the target with log1p(total_price / building_area), i.e. the model
# is trained on a log-scaled price per unit of building area.
# NOTE(review): this overwrites train['total_price'] in place, so running the
# cell a second time would transform the already-transformed values.
price_per_area = train["total_price"] / train["building_area"]
train['total_price'] = np.log1p(price_per_area)
# Disabled alternative: log1p of the raw total price.
# train['total_price'] = np.log1p(train["total_price"])

In [6]:
# Stack the training features (target removed) on top of the test rows and
# renumber the rows 0..n-1 so both parts share one continuous index.
train_features = train.drop(columns='total_price')
X = pd.concat([train_features, test], ignore_index=True)

In [7]:
# Report per-column null counts for the combined train + test feature frame.
num_null(X)

Show #missing in the columns:
parking_area : 66397
parking_price : 53775
txn_floor : 18541
village_income_median : 1326


In [8]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
# NOTE(review): this hides the warning rather than fixing the pattern that
# triggers it; prefer explicit `.loc[...] = value` assignments.
pd.options.mode.chained_assignment = None

In [9]:
# Fill the columns that get a constant / global replacement value:
#   parking_area  -> 0
#   parking_price -> median of the observed parking prices (train + test rows)
#   txn_floor     -> -1, a sentinel outside the observed values
#                    (presumably floors are positive - TODO confirm)
# DataFrame.fillna with a dict returns a new frame, so X stays untouched and
# no `df[col].fillna(..., inplace=True)` is needed; that chained in-place form
# operates on a temporary object and does not update the parent frame when
# pandas Copy-on-Write is active.
X_imp = X.fillna({
    'parking_area': 0,
    'parking_price': X['parking_price'].median(),
    'txn_floor': -1,
})

In [10]:
# First pass for village_income_median: fill a missing value with the mean of
# the observed values that share the same 'village' code.
# NOTE(review): rows are grouped by 'village' alone; if village codes are only
# unique within a town/city this mixes unrelated villages - confirm.
vimm = X_imp.groupby('village').agg({'village_income_median':'mean'})

vim_isnan = X_imp['village_income_median'].isna()
# Vectorized replacement for the per-row loop: look up each missing row's
# village in the per-village table and write the results back through .loc
# (no chained `df[col][i] = ...` assignment). Villages with no observed value
# have a NaN mean, so those rows stay missing for the next pass.
X_imp.loc[vim_isnan, 'village_income_median'] = (
    X_imp.loc[vim_isnan, 'village'].map(vimm['village_income_median'])
)

In [11]:
# Re-check the null counts after the village-level fill.
num_null(X_imp)

Show #missing in the columns:
village_income_median : 938


In [12]:
# Second pass for village_income_median: rows still missing are filled with
# the median of the values that share the same 'town' code.
# NOTE(review): this pass uses the median while the village pass uses the
# mean - confirm the difference is intended.
tvimm = X_imp.groupby('town').agg({'village_income_median':'median'})

vim_isnan = X_imp['village_income_median'].isna()
# Vectorized lookup + .loc write-back instead of a Python loop with chained
# `df[col][i] = ...` assignments.
X_imp.loc[vim_isnan, 'village_income_median'] = (
    X_imp.loc[vim_isnan, 'town'].map(tvimm['village_income_median'])
)

In [13]:
# Re-check the null counts after the town-level fill; the recorded output
# lists no columns.
num_null(X_imp)

Show #missing in the columns:


In [14]:
# X_imp['dt_diff'] = X_imp['txn_dt'] - X_imp['building_complete_dt']

In [15]:
# X_imp.drop(list(X_imp.filter(regex=('_MIN'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_index_'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('^N'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_10$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_50$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_100$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_250$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_500$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_1000$'))), axis=1, inplace=True)
# print(X_imp.shape)
# X_imp.drop(list(X_imp.filter(regex=('_5000$'))), axis=1, inplace=True)
# print(X_imp.shape)

In [16]:
# concat_train = pd.concat([X_train, Y_train], axis=1)
concat_train = train.copy()

# Nominal columns to one-hot encode. 'town' and 'village' are not listed:
# their dummy encoding, and a per-category mean-of-target encoding built from
# concat_train, were both commented out in the earlier version of this cell.
ONE_HOT_COLUMNS = [
    'building_material',
    'city',
    'building_type',
    'building_use',
    'parking_way',
]

# A single get_dummies call replaces five copy-pasted
# get_dummies / rename / concat / drop stanzas:
#   columns=...      each listed column is removed and replaced by indicator
#                    columns named '<column>_<level>', appended after the
#                    untouched columns in list order;
#   drop_first=True  the first level of every column is dropped and acts as
#                    the reference category (the manual `dummies.columns[0]`
#                    drop did the same).
X_imp = pd.get_dummies(X_imp, columns=ONE_HOT_COLUMNS, drop_first=True)


In [17]:
# X_imp.drop(['parking_price', 'parking_area', 'parking_way'], axis=1, inplace = True)

In [18]:
# Split the combined feature frame back into its training and test parts.
# The boundary comes from the source frames rather than hard-coded
# 60000 / 10000 row counts, so the split stays right if the CSVs change size.
n_train = len(train)
assert len(X_imp) == n_train + len(test), "X_imp must hold train rows followed by test rows"

# .drop(columns=...) returns new frames, so nothing is modified in place on a
# slice of X_imp.
post_train = X_imp.iloc[:n_train].drop(columns='building_id')
# Training rows are the first n_train rows of X_imp in their original order,
# so the (already transformed) target is attached by position.
post_train['total_price'] = train['total_price'].to_numpy()

X_test = X_imp.iloc[n_train:]
# Keep the ids for the submission file before removing them from the features.
test_building_id = X_test['building_id']
X_test = X_test.drop(columns='building_id')

In [19]:
# rm_outlier_cols = ['total_price']
# for col in rm_outlier_cols:
#     col99 = np.percentile(post_train[col], 99)
# #     col01 = np.percentile(post_train[col], 1)
#     post_train = post_train[post_train[col] <= col99]
# #     post_train = post_train[post_train[col] >= col01]

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible.
features = post_train.drop(columns='total_price')
target = post_train['total_price']
X_train, X_valid, Y_train, Y_valid = train_test_split(
    features, target, test_size=0.2, random_state=42)


In [21]:
# Fit the regression model on the training split and time the fit.
model = lgb.LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500, nthread=20)

ts = time.time()
model.fit(X_train, Y_train)
print(time.time() - ts)

# The model is trained on the log1p-transformed target, so both predictions
# and held-out targets are mapped back with expm1 (and floored) before scoring.
Y_valid_predict = np.floor(np.expm1(model.predict(X_valid)))
# Use a separate name instead of overwriting Y_valid: reassigning Y_valid with
# its own expm1 would transform it again every time this cell is re-run.
Y_valid_price = np.floor(np.expm1(Y_valid))
metric(Y_valid_price, Y_valid_predict)

15.677560567855835
0.5583333333333333


In [22]:
# Predict the test rows with the current model: undo the log1p transform,
# floor, then scale by building_area to get back to a total price.
floored_unit_price = np.floor(np.expm1(model.predict(X_test)))
Y_test_predict = floored_unit_price * X_test['building_area']

#### Train on all of the training data and make predictions for the test data

In [23]:
# Refit the same configuration on every training row (no hold-out) and time it.
ts = time.time()

all_features = post_train.drop(columns='total_price')
all_targets = post_train['total_price']

model = lgb.LGBMRegressor(learning_rate=0.1, num_leaves=1291, max_bin=500)
model.fit(all_features, all_targets)

print(time.time() - ts)

16.540805339813232


In [24]:
# Final test predictions: model output -> expm1 -> floor gives the value per
# unit of building area, which is multiplied by building_area for the total.
log_scale_pred = model.predict(X_test)
Y_test_predict = np.floor(np.expm1(log_scale_pred)) * X_test['building_area']

In [25]:
SUBMISSION_PATH = './output/submission_lgbm_area_trainByAlltrain.csv'

# Pair each test building id with its predicted total price.
submission = pd.DataFrame({
    "building_id": test_building_id,
    "total_price": Y_test_predict
})

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)
submission.to_csv(SUBMISSION_PATH, index=False)

In [26]:
# Display the submission frame as a final visual check.
submission

Unnamed: 0,building_id,total_price
60000,X5gsdTWGS3W7JJQB,9.825002e+06
60001,BTshNOJyKHnT2YIT,3.856616e+06
60002,dhdymr0lV8N5kZOT,9.501463e+06
60003,VEwyGGMcD56w5BOc,6.006644e+06
60004,wmUeMoJZfsqaSX9b,1.098023e+06
60005,EtBjGAHmHCe9t7TZ,3.126398e+06
60006,hPNH34vmaZtvBtqc,1.251853e+07
60007,wXjeI38bYDMJJwZC,5.832763e+06
60008,fxZSGX6aPAFKU8W4,1.677061e+06
60009,ewr0Fx6ign87OwaV,4.288866e+06
