# `如何使用機器學習提高房仲業潛在成交率_系列4_商業應用與綜整分析`
## 作者：陳政廷、王裕萍、謝豐檍（臺灣行銷研究特邀作者）、鍾皓軒（臺灣行銷研究有限公司創辦人）

## 原始資料請見[本連結](https://www.kaggle.com/c/two-sigma-connect-rental-listing-inquiries/data)，下載 `train.json` 後與本 ipynb 檔案放於同一個工作目錄中，再執行下方程式即可

In [60]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from sklearn.metrics import accuracy_score
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN

## 系列1_資料前處理

In [61]:
# Series 1: data preprocessing
# Read the raw listings inside a `with` block so the file handle is closed as
# soon as pandas has parsed it, instead of leaving an open handle behind.
with open(r"train.json") as train_file:
    df_train = pd.read_json(train_file)
# Length of each row's `photos` entry.
df_train["num_photos"] = df_train["photos"].apply(len)
# Word count of each description, obtained by splitting on single spaces.
df_train["num_description_words"] = df_train["description"].apply(lambda x: len(x.split(" ")))
def caculate(n_words):
    """Return, for each string in ``n_words``, how many pieces a split on " " yields.

    Produces the same counts as the inline lambda that fills
    ``num_description_words``; an empty string counts as one piece.
    """
    return [len(text.split(" ")) for text in n_words]
# Parse the `created` column to datetimes once, store it back, and expose its
# year / month / day as separate columns.
created_ts = pd.to_datetime(df_train["created"])
df_train["created"] = created_ts
for part in ("year", "month", "day"):
    df_train[f"created_{part}"] = getattr(created_ts.dt, part)

## 系列2_特徵處理：篩選頻率大於100之字詞

In [62]:
from collections import Counter

# Tally every string that appears in the per-row `features` lists.
# Counter is a dict subclass that keeps keys in first-seen order, so the
# resulting table is the same as with a hand-maintained dict.
bigdic = Counter(word for feature_list in df_train['features'] for word in feature_list)
data = {'word': list(bigdic), 'number': list(bigdic.values())}
# Most frequent strings first.
bigdf = pd.DataFrame.from_dict(data).sort_values(by='number', ascending=False)
# Keep only the strings that occur more than 100 times.
keyword = bigdf[bigdf['number'] > 100]['word'].to_list()
print(keyword)
#
def get_dummy(keyword, alist):
    """Write a 0/1 indicator column named ``keyword`` into the global ``df_train``.

    Args:
        keyword: String to look for; also used as the name of the new column.
        alist: Iterable with one collection of strings per row, in the same
            order as the rows of ``df_train``.

    Returns:
        None. The column is assigned on ``df_train`` as a side effect.

    The flags are computed from ``alist`` itself. Earlier the argument was
    ignored and ``df_train['features']`` was read directly, so passing any
    other column silently had no effect.
    """
    df_train[keyword] = [1 if keyword in row else 0 for row in alist]
# Add one 0/1 indicator column to df_train for every frequent feature string.
for feature_name in keyword:
    get_dummy(feature_name, df_train['features'])

# Model inputs: six numeric listing attributes followed by the indicator
# columns, in the order they are fed to the models.
_numeric_columns = [
    "bathrooms", "bedrooms", "latitude", "longitude", "num_photos", "price",
]
_indicator_columns = [
    'Elevator', 'Cats Allowed', 'Hardwood Floors', 'Dogs Allowed', 'Doorman',
    'Dishwasher', 'No Fee', 'Laundry in Building', 'Fitness Center', 'Pre-War',
    'Laundry in Unit', 'Roof Deck', 'Outdoor Space', 'Dining Room',
    'High Speed Internet', 'Balcony', 'Swimming Pool', 'Laundry In Building',
    'New Construction', 'Terrace', 'Exclusive', 'Loft', 'Garden/Patio',
    'Wheelchair Access', 'Common Outdoor Space', 'HARDWOOD', 'Fireplace',
    'SIMPLEX', 'prewar', 'LOWRISE', 'Garage', 'Laundry Room', 'Reduced Fee',
    'Laundry In Unit', 'Furnished', 'Multi-Level', 'Private Outdoor Space',
    'Prewar', 'PublicOutdoor', 'Parking Space', 'Roof-deck', 'dishwasher',
    'High Ceilings', 'elevator', 'Renovated', 'Pool', 'LAUNDRY',
    'Green Building', 'HIGH CEILINGS', 'LIVE IN SUPER', 'High Ceiling',
    'Washer in Unit', 'Dryer in Unit', 'Storage', 'Stainless Steel Appliances',
    'On-site laundry', 'Concierge', 'Newly renovated', 'On-site Laundry',
    'Live In Super', 'Hardwood', 'Light', 'On-site Garage', 'Washer/Dryer',
    'Granite Kitchen', 'Gym/Fitness', 'Pets on approval', 'Marble Bath',
    'Walk in Closet(s)',
]
choose_columns = _numeric_columns + _indicator_columns

['Elevator', 'Cats Allowed', 'Hardwood Floors', 'Dogs Allowed', 'Doorman', 'Dishwasher', 'No Fee', 'Laundry in Building', 'Fitness Center', 'Pre-War', 'Laundry in Unit', 'Roof Deck', 'Outdoor Space', 'Dining Room', 'High Speed Internet', 'Balcony', 'Swimming Pool', 'Laundry In Building', 'New Construction', 'Terrace', 'Exclusive', 'Loft', 'Garden/Patio', 'Wheelchair Access', 'Common Outdoor Space', 'HARDWOOD', 'Fireplace', 'SIMPLEX', 'prewar', 'LOWRISE', 'Garage', 'Laundry Room', 'Reduced Fee', 'Laundry In Unit', 'Furnished', 'Multi-Level', 'Private Outdoor Space', 'Prewar', 'PublicOutdoor', 'Parking Space', 'Roof-deck', 'dishwasher', 'High Ceilings', 'elevator', 'Renovated', 'Pool', 'LAUNDRY', 'Green Building', 'HIGH CEILINGS', 'LIVE IN SUPER', 'High Ceiling', 'Washer in Unit', 'Dryer in Unit', 'Storage', 'Stainless Steel Appliances', 'On-site laundry', 'Concierge', 'Newly renovated', 'On-site Laundry', 'Live In Super', 'Hardwood', 'Light', 'On-site Garage', 'Washer/Dryer', 'Granite K

## LGBM：特徵篩選、標準化、模型訓練

In [63]:
# LightGBM-based feature selection
X = df_train[choose_columns]
Y = df_train["interest_level"]

# Keep the fitted estimator under its own name. Binding it to `lgb` would
# replace the `lightgbm` module alias with a classifier instance, so running
# this cell a second time (or any later `lgb.LGBMClassifier(...)` call) would
# raise AttributeError.
# NOTE(review): this model is fit on every row of df_train, i.e. before the
# train/test split below, so the held-out rows also influence which features
# are kept.
lgb_selector_model = lgb.LGBMClassifier().fit(X, Y)
for feature in zip(choose_columns, lgb_selector_model.feature_importances_):
    print(feature)

# Keep the columns whose importance reaches the threshold of 80.
selector = SelectFromModel(lgb_selector_model, prefit=True, threshold=80)
selected_columns = np.array(choose_columns)[selector.get_support()]
print("Features selected by SelectFromModel: "
      f"{selected_columns}")
X = X[selected_columns]

# 80/20 train/test split with a fixed seed.
X_train_lgb, X_test_lgb, Y_train_lgb, Y_test_lgb = train_test_split(X, Y, test_size=0.2, random_state=0)

('bathrooms', 230)
('bedrooms', 806)
('latitude', 1655)
('longitude', 1456)
('num_photos', 602)
('price', 1824)
('Elevator', 95)
('Cats Allowed', 69)
('Hardwood Floors', 103)
('Dogs Allowed', 63)
('Doorman', 47)
('Dishwasher', 79)
('No Fee', 205)
('Laundry in Building', 90)
('Fitness Center', 55)
('Pre-War', 122)
('Laundry in Unit', 84)
('Roof Deck', 30)
('Outdoor Space', 65)
('Dining Room', 48)
('High Speed Internet', 33)
('Balcony', 63)
('Swimming Pool', 46)
('Laundry In Building', 125)
('New Construction', 34)
('Terrace', 52)
('Exclusive', 73)
('Loft', 36)
('Garden/Patio', 22)
('Wheelchair Access', 44)
('Common Outdoor Space', 27)
('HARDWOOD', 69)
('Fireplace', 16)
('SIMPLEX', 10)
('prewar', 6)
('LOWRISE', 3)
('Garage', 11)
('Laundry Room', 4)
('Reduced Fee', 78)
('Laundry In Unit', 60)
('Furnished', 114)
('Multi-Level', 15)
('Private Outdoor Space', 55)
('Prewar', 2)
('PublicOutdoor', 3)
('Parking Space', 28)
('Roof-deck', 5)
('dishwasher', 11)
('High Ceilings', 16)
('elevator', 19

In [64]:
# Selected columns, grouped into the six numeric attributes and the 0/1
# indicator columns.
regression_col_lgb = ["bathrooms", "bedrooms", "latitude", "longitude", "num_photos", "price"]
classification_col_lgb = ['Elevator', 'Hardwood Floors', 'No Fee', 'Pre-War', 'Laundry In Building', 'Furnished']

# Standardize the numeric columns: the scaler is fit on the training split and
# then applied unchanged to the test split.
sc_lgb = StandardScaler()
X_train_lgb_r = pd.DataFrame(
    sc_lgb.fit_transform(X_train_lgb[regression_col_lgb])
).reset_index(drop=True)
X_test_lgb_r = pd.DataFrame(
    sc_lgb.transform(X_test_lgb[regression_col_lgb])
).reset_index(drop=True)

# Indicator columns are taken as-is; indexes are reset so they line up
# positionally with the scaled frames when concatenated side by side.
X_train_lgb_c = X_train_lgb[classification_col_lgb].reset_index(drop=True)
X_test_lgb_c = X_test_lgb[classification_col_lgb].reset_index(drop=True)
X_train_lgb_without_ros = pd.concat([X_train_lgb_r, X_train_lgb_c], axis=1)
X_test_lgb_without_ros = pd.concat([X_test_lgb_r, X_test_lgb_c], axis=1)

# Bind `lgb` to the lightgbm module before constructing the classifier.
import lightgbm as lgb
clf_lgb = lgb.LGBMClassifier(random_state=0)
# NOTE(review): the model is fit and scored on X_train_lgb / X_test_lgb; the
# standardized *_without_ros frames built above are not used in this cell.
clf_lgb.fit(X_train_lgb, Y_train_lgb)
train_log_loss = log_loss(Y_train_lgb, clf_lgb.predict_proba(X_train_lgb))
test_log_loss = log_loss(Y_test_lgb, clf_lgb.predict_proba(X_test_lgb))
print('Log loss on trainging_lgb dataset: ', train_log_loss)
print('Log loss on testing_lgb dataset: ', test_log_loss)

Log loss on trainging_lgb dataset:  0.5303269259011405
Log loss on testing_lgb dataset:  0.5919384052931106


In [65]:
# Predicted interest level for every row of the test split
predict_interest = clf_lgb.predict(X_test_lgb)
# Build the turnover value for each test row from its actual interest level
# (presumably the assumed closing rate): "low" -> 0.11, "medium" -> 0.33,
# any other label -> 0.5.
turnover_by_level = {"low": 0.11, "medium": 0.33}
lgb_turnover = [turnover_by_level.get(level, 0.5) for level in Y_test_lgb]

In [66]:
# Assemble predictions, actual labels, turnover and price for the test split
lgb_df = pd.DataFrame(
    {
        "Predict_interest": predict_interest,
        "Actual_interest": Y_test_lgb,
        "Turnover": lgb_turnover,
        "Price": X_test_lgb.price,
    }
).reset_index(drop=True)
import random
# Draw 200 rows, without replacement and with a fixed seed, from the rows the
# model predicted as "high".
predicted_high = lgb_df[lgb_df["Predict_interest"] == "high"]
lgb_df_select = predicted_high.sample(
    n=200, replace=False, random_state=514
).reset_index(drop=True)
lgb_df_select

Unnamed: 0,Predict_interest,Actual_interest,Turnover,Price
0,high,medium,0.33,1600
1,high,high,0.50,1525
2,high,medium,0.33,1675
3,high,high,0.50,1375
4,high,high,0.50,1650
...,...,...,...,...
195,high,medium,0.33,1800
196,high,high,0.50,1400
197,high,medium,0.33,1650
198,high,high,0.50,1800


In [67]:
# Constant per-row inputs: a sales cost of 5 and a share of 0.15.
lgb_df_select["sales_cost"] = 5
lgb_df_select["share"] = 0.15
# revenue = Turnover * Price * share - sales_cost, computed row by row.
weighted_price = lgb_df_select["Turnover"] * lgb_df_select["Price"]
lgb_df_select["revenue"] = (
    weighted_price * lgb_df_select["share"] - lgb_df_select["sales_cost"]
)
lgb_df_select

Unnamed: 0,Predict_interest,Actual_interest,Turnover,Price,sales_cost,share,revenue
0,high,medium,0.33,1600,5,0.15,74.2000
1,high,high,0.50,1525,5,0.15,109.3750
2,high,medium,0.33,1675,5,0.15,77.9125
3,high,high,0.50,1375,5,0.15,98.1250
4,high,high,0.50,1650,5,0.15,118.7500
...,...,...,...,...,...,...,...
195,high,medium,0.33,1800,5,0.15,84.1000
196,high,high,0.50,1400,5,0.15,100.0000
197,high,medium,0.33,1650,5,0.15,76.6750
198,high,high,0.50,1800,5,0.15,130.0000


In [74]:
# Write the sampled LGBM selection to a CSV file in the working directory.
# The "utf-8-sig" codec writes a UTF-8 byte-order mark at the start of the file.
lgb_df_select.to_csv("使用機器學習方法_LGBM.csv",encoding="utf-8-sig")

In [68]:
# Sum every numeric column per actual interest level.
# The first positional parameter of GroupBy.sum is `numeric_only`, not a column
# name, so the earlier `.sum("revenue")` only acted as a truthy flag. It is
# spelled out here; the result (all numeric columns summed) is unchanged.
lgb_df_select.groupby("Actual_interest", as_index=False).sum(numeric_only=True)

Unnamed: 0,Actual_interest,Turnover,Price,sales_cost,share,revenue
0,high,51.0,205211,510,15.3,14880.825
1,low,3.63,75820,165,4.95,1086.03
2,medium,21.45,146876,325,9.75,6945.362


## 隨機挑選之模型

In [69]:
# Baseline without a model: take all candidate columns and make an 80/20 split
# with random_state=0.
X = df_train[choose_columns]
Y = df_train["interest_level"]
X_train_random, X_test_random, Y_train_random, Y_test_random = train_test_split(
    X, Y, test_size=0.2, random_state=0
)
# Build the turnover value for each test row from its actual interest level:
# "low" -> 0.11, "medium" -> 0.33, any other label -> 0.5.
baseline_turnover_by_level = {"low": 0.11, "medium": 0.33}
random_turnover = [baseline_turnover_by_level.get(level, 0.5) for level in Y_test_random]
# Build the frame of actual labels, turnover and price
random_df = pd.DataFrame(
    {
        "Actual_interest": Y_test_random,
        "Turnover": random_turnover,
        "Price": X_test_random.price,
    }
).reset_index(drop=True)
# Draw 200 test rows at random, without replacement and with a fixed seed.
random_df_select = random_df.sample(
    n=200, replace=False, random_state=514
).reset_index(drop=True)
random_df_select

Unnamed: 0,Actual_interest,Turnover,Price
0,low,0.11,2495
1,medium,0.33,3250
2,low,0.11,15000
3,low,0.11,6738
4,medium,0.33,2500
...,...,...,...
195,medium,0.33,5600
196,low,0.11,3871
197,medium,0.33,4200
198,medium,0.33,2631


In [70]:
# Constant per-row inputs: a sales cost of 5 and a share of 0.15.
random_df_select["sales_cost"] = 5
random_df_select["share"] = 0.15
# revenue = Turnover * Price * share - sales_cost, computed row by row.
baseline_weighted_price = random_df_select["Turnover"] * random_df_select["Price"]
random_df_select["revenue"] = (
    baseline_weighted_price * random_df_select["share"] - random_df_select["sales_cost"]
)
random_df_select

Unnamed: 0,Actual_interest,Turnover,Price,sales_cost,share,revenue
0,low,0.11,2495,5,0.15,36.1675
1,medium,0.33,3250,5,0.15,155.8750
2,low,0.11,15000,5,0.15,242.5000
3,low,0.11,6738,5,0.15,106.1770
4,medium,0.33,2500,5,0.15,118.7500
...,...,...,...,...,...,...
195,medium,0.33,5600,5,0.15,272.2000
196,low,0.11,3871,5,0.15,58.8715
197,medium,0.33,4200,5,0.15,202.9000
198,medium,0.33,2631,5,0.15,125.2345


In [75]:
# Write the randomly sampled baseline selection to a CSV file in the working directory.
# The "utf-8-sig" codec writes a UTF-8 byte-order mark at the start of the file.
random_df_select.to_csv("未使用機器學習方法_Random.csv",encoding="utf-8-sig")

In [71]:
# Sum every numeric column per actual interest level.
# The first positional parameter of GroupBy.sum is `numeric_only`, not a column
# name, so the earlier `.sum("revenue")` only acted as a truthy flag. It is
# spelled out here; the result (all numeric columns summed) is unchanged.
random_df_select.groupby("Actual_interest", as_index=False).sum(numeric_only=True)

Unnamed: 0,Actual_interest,Turnover,Price,sales_cost,share,revenue
0,high,7.5,43060,75,2.25,3154.5
1,low,15.4,591679,700,21.0,9062.7035
2,medium,14.85,152341,225,6.75,7315.8795
