# 賃料推定エンジンの構築

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
import math
import xgboost
import matplotlib.pyplot as plt
%matplotlib inline

## データ整形

In [2]:
data = pd.read_csv('train_aparts.csv')

In [110]:
#data.head(100)

In [7]:
pd.get_dummies(data.loc[:, 'prefecture']).head()

Unnamed: 0,千葉県,埼玉県,東京都,神奈川県
0,0.0,1.0,0.0,0.0
1,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0


In [12]:
pd.get_dummies(data.loc[:, 'type']).head()

Unnamed: 0,12SLDK,1DK,1K,1LDK,1LK,1SDK,1SK,1SLDK,1SLK,2DK,2K,2LDK,2LK,2SDK,2SLDK,3DK,3LDK,3SLDK,4LDK,ワンルーム
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
data_to_dummy = pd.concat([data, pd.get_dummies(data.loc[:, 'type']), pd.get_dummies(data.loc[:, 'prefecture'])], axis = 1).drop(['type', 'hougaku', 'kouzo', 'prefecture'], axis = 1).dropna()

In [4]:
len(data_to_dummy)

36491

In [5]:
Y = data_to_dummy.iloc[:, 0]
Y = Y.astype(np.float64)
X = data_to_dummy.iloc[:, 1:]

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

### ランダムフォレスト

In [50]:
randomforest = RandomForestRegressor()

In [51]:
randomforest.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [54]:
#len(data_to_dummy[data_to_dummy['dummy_ivc'] == 1])

67

In [65]:
RFpredict = randomforest.predict(X_test)
y_test = [float(y) for y in list(Y_test)]

In [11]:
def cal_rate(predict, y_test):
    y_test = y_test = [float(y) for y in list(y_test)]
    total = len(predict)
    count_five = 0.0
    count_ten = 0.0
    for i in range(total):
        rate = math.fabs(predict[i] - y_test[i]) / y_test[i]
        if rate > 0.05:
            count_five += 1
        if rate > 0.1:
            count_ten += 1
    return 1 - (count_five / total), 1 - (count_ten / total)

In [71]:
cal_rate(RFpredict, y_test)

(0.6825605612188973, 0.8778910446125178)

In [94]:
params = {'n_estimators': [50, 60, 70, 80],
          'max_features': [9, 10, 11, 12, 13, 15, 20]
          #'max_depth': [3, 5, 7, 9]
          }

In [95]:
rf_gs = GridSearchCV(RandomForestRegressor(), params)

In [96]:
rf_gs.fit(X_test, Y_test)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [9, 10, 11, 12, 13, 15, 20], 'n_estimators': [50, 60, 70, 80]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [97]:
rf_gs_predict = rf_gs.predict(X_test)

In [98]:
rf_gs.best_params_

{'max_features': 13, 'n_estimators': 50}

In [99]:
cal_rate(rf_gs_predict, y_test)

(0.8877562205414885, 0.9832292009207497)

### XGboost

In [7]:
xgboost = xgboost.XGBRegressor()

In [8]:
xgboost.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [9]:
xgb_predict = xgboost.predict(X_test)

In [13]:
cal_rate(xgb_predict, Y_test)

(0.4030472432313932, 0.6928641894113778)

In [120]:
# Grid Search
params={'max_depth': [3, 5, 7, 10],
        'subsample': [0.5, 0.75, 1],
        'colsample_bytree': [0.5, 0.75, 1],
        'n_estimators': [100, 300, 500]
}

gs = GridSearchCV(xgboost,
                  params,
                  n_jobs= -1
                  )
gs.fit(X_train, Y_train)
predict = gs.predict(X_test)

In [121]:
cal_rate(predict, y_test)

(0.11816288501589389, 0.22952975994738578)

In [None]:
plt.plot(xgb_predict, Y_test)