In [7]:
"""
Build a basic xgboost model using basic features
@author: MarkLiu
@time  : 17-5-25 9:03 PM
"""
import os
import sys

# Append the parent directory (resolved to an absolute path) to sys.path;
# presumably this is what lets the project-local imports further down
# (readData, features) resolve -- confirm against the repository layout.
module_path = os.path.abspath(os.path.join('..'))
sys.path.append(module_path)

import numpy as np
import pandas as pd
import xgboost as xgb
# remove warnings
import warnings

warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
try:
    from sklearn.cross_validation import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; the same
    # helper is provided by sklearn.model_selection.
    from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso

from readData import readData, data_preprocess

# my own module
from features import data_utils

In [2]:
# Load the train/test feature sets and the training target through the
# project's readData helper. isLog1p=True presumably makes it return the target
# already log1p-transformed (the CV helper below scores it that way by
# default) -- confirm in readData.
X_train, X_test, y_train = readData(isLog1p=True)
# Preprocessing is deliberately left disabled at load time; later cells call
# data_preprocess themselves.
#X_train, X_test = data_preprocess(X_train, X_test)

conbined_data: (36650, 702)


In [3]:
def CV(train, y, model, test_size=.2, isLog1p=True, random_state=123):
    """Fit ``model`` on a random hold-out split and print train/validation RMSE.

    The reported score is the RMSE between log1p-scale targets and
    predictions, for both the fitting part and the held-out part of the split.

    Parameters
    ----------
    train : features, in any form accepted by ``train_test_split`` and
        ``model.fit``.
    y : targets aligned row-for-row with ``train``.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
        in place on the training part of the split.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    isLog1p : when True, ``y`` (and therefore the predictions) are taken to be
        on the log1p scale already and are scored as-is. When False, both
        targets and predictions are passed through ``np.log1p`` before scoring.
    random_state : seed forwarded to ``train_test_split``. The default of 123
        reproduces the split this helper has always used.

    Returns
    -------
    None. The two scores are printed as
    ``train_rmse = <value> , val_rmse = <value>``.
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        train, y, test_size=test_size, random_state=random_state)

    model.fit(X_fit, y_fit)

    def _rmse(y_true, X):
        # Single code path for both subsets, so the metric cannot drift
        # between the train and validation numbers.
        y_pred = model.predict(X)
        if not isLog1p:
            y_true, y_pred = np.log1p(y_true), np.log1p(y_pred)
        return np.sqrt(mean_squared_error(y_true, y_pred))

    train_rmse = _rmse(y_fit, X_fit)
    val_rmse = _rmse(y_val, X_val)

    # A single pre-formatted argument prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print('train_rmse = %s , val_rmse = %s' % (train_rmse, val_rmse))

In [4]:
# Current XGBoost configuration. An earlier setting that was tried and then
# replaced: learning_rate=0.05, max_depth=5, subsample=0.7,
# colsample_bytree=0.7, min_child_weight=1, seed=5 (same objective/silent).
# 'booster' and 'tuneLength' were also considered but are left unset.
xgb_params = dict(
    learning_rate=0.05,
    max_depth=4,
    subsample=0.95,
    reg_alpha=0,
    min_child_weight=4,
    colsample_bytree=0.95,
    gamma=.4,
    objective='reg:linear',
    silent=1,
    seed=5,
)

dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

# 5-fold cross-validation with early stopping; progress is logged every 50
# rounds and the train/test RMSE curves are plotted afterwards.
cv_output = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=1000,
    nfold=5,
    early_stopping_rounds=40,
    verbose_eval=50,
    show_stdv=False,
)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()

# The number of rows in the CV result is reused as the number of trees for the
# sklearn-style regressor built from the same parameters.
print(len(cv_output))
xgb_params['n_estimators'] = len(cv_output)

model = XGBRegressor(**xgb_params)

[0]	train-rmse:14.4027	test-rmse:14.4028
[50]	train-rmse:1.15845	test-rmse:1.1601
[100]	train-rmse:0.324909	test-rmse:0.33664
[150]	train-rmse:0.303985	test-rmse:0.321419
[200]	train-rmse:0.297296	test-rmse:0.319335
[250]	train-rmse:0.291982	test-rmse:0.318278
[300]	train-rmse:0.287582	test-rmse:0.317583
[350]	train-rmse:0.283497	test-rmse:0.317135
[400]	train-rmse:0.279629	test-rmse:0.316825
[450]	train-rmse:0.275974	test-rmse:0.316666
[500]	train-rmse:0.272538	test-rmse:0.316542
[550]	train-rmse:0.269331	test-rmse:0.316507
[600]	train-rmse:0.266131	test-rmse:0.316368
[650]	train-rmse:0.263164	test-rmse:0.316343
620


In [None]:
# Same XGBoost configuration as the cross-validation cell, but with the number
# of trees pinned to 620 (the length the CV run printed) instead of being read
# from len(cv_output), so the CV does not have to be re-run.
# 'booster' and 'tuneLength' remain unset.
xgb_params = dict(
    learning_rate=0.05,
    max_depth=4,
    subsample=0.95,
    reg_alpha=0,
    min_child_weight=4,
    colsample_bytree=0.95,
    gamma=.4,
    objective='reg:linear',
    silent=1,
    seed=5,
    n_estimators=620,
)

model = XGBRegressor(**xgb_params)


In [5]:
# Random-forest baseline used in the model comparison.
rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 12,
    'min_samples_leaf': 2,
    # Seeded like the ExtraTrees and GradientBoosting baselines in this
    # notebook; without it the forest is rebuilt differently on every run and
    # the logged scores cannot be reproduced.
    'random_state': 10,
}
model1 = RandomForestRegressor(**rf_params)

In [6]:
# Extra-trees baseline used in the model comparison (seeded for repeatability).
et_params = dict(
    n_jobs=16,
    n_estimators=100,
    max_features=0.5,
    max_depth=12,
    min_samples_leaf=2,
    random_state=10,
)
model2 = ExtraTreesRegressor(**et_params)

In [7]:
# Gradient-boosting baseline used in the model comparison (seeded for
# repeatability).
model3 = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=6,
    max_features='sqrt',
    min_samples_split=200,
    min_samples_leaf=70,
    subsample=0.85,
    random_state=10,
)

In [9]:
# Preprocess the features, then score every candidate model with the same
# hold-out helper so the printed RMSE values are directly comparable.
# NOTE(review): X_train / X_test are overwritten with the preprocessed
# versions, so re-running this cell (or a later cell that calls
# data_preprocess again) preprocesses already-preprocessed data -- confirm
# that data_preprocess is idempotent.
X_train, X_test = data_preprocess(X_train, X_test)
for name, m in (('xgb', model), ('rf', model1), ('et', model2), ('gbm', model3)):
    print(name)
    CV(X_train, y_train, m)

xgb
train_rmse = 0.266837631073 , val_rmse = 0.31900151058
rf
train_rmse = 0.241249700631 , val_rmse = 0.324802592639
et
train_rmse = 0.247191933346 , val_rmse = 0.324373969057
gbm
train_rmse = 0.290022115756 , val_rmse = 0.319881574992


In [12]:
# Ridge regression candidate; rebinds `model`, replacing the XGBoost regressor.
# NOTE(review): data_preprocess is also called in the model-comparison cell, so
# when both cells run in one session the features are preprocessed twice --
# confirm that this is intended / idempotent.
X_train, X_test = data_preprocess(X_train, X_test)
rd_params = dict(alpha=.5)
model = Ridge(**rd_params)

In [14]:
# Lasso regression candidate; rebinds `model` once more, so the next CV call
# scores this estimator.
ls_params = dict(alpha=0.005)
model = Lasso(**ls_params)

In [15]:
# Score whichever estimator `model` currently refers to; the comments that
# follow are a hand-kept log of earlier results.
CV(train=X_train, y=y_train, model=model)
#not log1p, original train_rmse = 0.252746403304 , val_rmse = 0.320226748185
#log1p, original train_rmse = 0.257648786803 , val_rmse = 0.320517933831, 

#log1p, my deleted train_rmse = 0.257648786803 , val_rmse = 0.320517933831, 
#log1p, my deleted added train_rmse = 0.257648786803 , val_rmse = 0.320517933831

#log1p seed=5, learning_rate=.05, subsample=.7, max_depth=5, #min_child_weight=3, n_estimators=281, 
#colsample_bytree=0.7 train_rmse = 0.266050150088 , val_rmse = 0.319972286248

#jun23 data log1p xgb seed=5, learning_rate=.05, subsample=.7, max_depth=5, #min_child_weight=3, n_estimators=309, 
#colsample_bytree=0.7 train_rmse = 0.261787007971 , val_rmse = 0.31980116091


#jun24 data log1p xgb seed=5, learning_rate=.05, subsample=.7, max_depth=5, #min_child_weight=3, n_estimators=332, 
#colsample_bytree=0.7, train_rmse = 0.258649992867 , val_rmse = 0.320011541733

#jun24 data log1p xgb_params = {'learning_rate': 0.05,'max_depth': 4,'subsample': 0.95,'reg_alpha': 0,
#'min_child_weight':4,'colsample_bytree': 0.95,'gamma':.4,'objective': 'reg:linear',
#'seed': 5, 'n_estimators':620 train_rmse = 0.265971694683 , val_rmse = 0.319114085918

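#jun24 data log1p GradientBoostingRegressor(learning_rate=0.02, n_estimators=500,min_samples_leaf=70, 
#min_samples_split=200, max_features='sqrt',max_depth=6,subsample=0.85,random_state=10)
#train_rmse = 0.265971694683 , val_rmse = 0.319114085918
#NOTE(review): the scores above are identical to the xgb entry logged just before this one -- likely a
#copy-paste slip; the gbm run recorded in this notebook's output printed
#train_rmse = 0.290022115756 , val_rmse = 0.319881574992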

#jun24 data log1p RF rf_params = {'n_jobs': 16,'n_estimators': 100,'max_features': 0.2,'max_depth': 12,
#'min_samples_leaf': 2}train_rmse = 0.241249700631 , val_rmse = 0.324802592639

#jun24 data log1p ET et_params = {'n_jobs': 16,'n_estimators': 100,'max_features': 0.5,'max_depth': 12,
#'min_samples_leaf': 2,'random_state': 10} train_rmse = 0.247191933346 , val_rmse = 0.324373969057

#jun25 data log1p xgb_params = {'learning_rate': 0.05,'max_depth': 4,'subsample': 0.95,'reg_alpha': 0,
#'min_child_weight':4,'colsample_bytree': 0.95,'gamma':.4,'objective': 'reg:linear',
#'seed': 5, 'n_estimators':620 train_rmse = 0.265971694683 , val_rmse = 0.319114085918

#jun25 data log1p, rd alpha=10, train_rmse = 0.321207016244 , val_rmse = 0.33232610611
#jun25 data log1p, rd alpha=1, train_rmse = 0.320498636946 , val_rmse = 0.332092993125

#jun25 data log1p, ls alpha=.005, train_rmse = 0.334846893112 , val_rmse = 0.339948393494

train_rmse = 0.334846893112 , val_rmse = 0.339948393494


# Stand-alone experiment: load the data through data_utils, build a random
# hold-out split sized like the submission set, train a native-API XGBoost
# booster with early stopping and print train / validation RMSE.
# NOTE: this cell rebinds names that earlier cells also use (X_train, X_test,
# dtrain, dtest, xgb_params), so it changes what those cells see if they are
# re-run afterwards.
isLog1p=True
if (True):
    train, test, macro = data_utils.load_data()

    # Rescale the target before modelling: price * 0.969 + 10.
    # NOTE(review): the rationale for these two constants is not visible
    # here -- confirm and document.
    mult = .969

    train['price_doc'] = train["price_doc"] * mult + 10
    if (isLog1p):
        # Model the target on the log1p scale.
        train['price_doc'] = np.log1p(train['price_doc'])
    # Despite the name, this holds log1p targets only when isLog1p is True.
    ylog_train_all = train['price_doc']
    id_train = train['id']
    train.drop(['id', 'price_doc'], axis=1, inplace=True)
    # Submission ids are read from the raw test CSV rather than from the
    # loaded test frame.
    #submit_ids = test['id']
    submit_ids = pd.read_csv('../../input/test.csv')['id']
    test.drop(['id'], axis=1, inplace=True)

    # Merge the training and test sets (train restricted to, and ordered by,
    # the test columns) so both are encoded together below.
    conbined_data = pd.concat([train[test.columns.values], test])
    # Disabled: join a selection of macro columns on 'timestamp'.
    # macro_cols = ["balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
    #               "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
    #               "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build", "timestamp"]
    # conbined_data = pd.merge_ordered(conbined_data, macro[macro_cols], on='timestamp', how='left')

    conbined_data.drop(['timestamp'], axis=1, inplace=True)
    print "conbined_data:", conbined_data.shape

    # Deal with categorical values: every object-dtype column is label-encoded
    # with an encoder fitted on the combined train+test values.
    for c in conbined_data.columns:
        if conbined_data[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(conbined_data[c].values))
            conbined_data[c] = lbl.transform(list(conbined_data[c].values))

    # Split the encoded frame back by row position. `train` is rebound first,
    # but it keeps the same number of rows, so the second slice still starts
    # at the original train/test boundary.
    train = conbined_data.iloc[:train.shape[0], :]
    test = conbined_data.iloc[train.shape[0]:, :]

    # Hold-out fraction = (#test rows / #train rows), i.e. the validation set
    # gets roughly as many rows as the submission set.
    test_size = (1.0 * test.shape[0]) / train.shape[0]
    print "submit test size:", test_size

    # Convert to numpy values
    X_all = train.values

    # Row count for a positional head/tail split. In the code visible here it
    # is only referenced by the commented-out slicing below; the active split
    # is the random train_test_split further down.
    num_train = int(train.shape[0] / (1+test_size))

    X_train_all = X_all
    # Note: X_test stays a DataFrame, while the training matrices are ndarrays.
    X_test = test

    # X_train = X_all[:num_train]
    # X_val = X_all[num_train:]
    # ylog_train = ylog_train_all[:num_train]
    # ylog_val = ylog_train_all[num_train:]

    # Random (seeded) hold-out split.
    X_train, X_val, ylog_train, ylog_val=train_test_split(X_all, ylog_train_all, test_size=test_size, 
                                                random_state=123)

    print "validate size:", 1.0*X_val.shape[0] / X_train.shape[0]

    df_columns = train.columns

    # NOTE: under Python 2 (which the print statements in this cell require)
    # these parenthesised two-argument prints display a tuple.
    print('X_train_all shape is', X_train_all.shape)
    print('X_train shape is', X_train.shape)
    print('y_train shape is', ylog_train.shape)
    print('X_val shape is', X_val.shape)
    print('y_val shape is', ylog_val.shape)
    print('X_test shape is', X_test.shape)
    
    # Native XGBoost matrices, all labelled with the training column names.
    dtrain_all = xgb.DMatrix(X_train_all, ylog_train_all, feature_names=df_columns)
    dtrain = xgb.DMatrix(X_train, ylog_train, feature_names=df_columns)
    dval = xgb.DMatrix(X_val, ylog_val, feature_names=df_columns)
    dtest = xgb.DMatrix(X_test, feature_names=df_columns)

    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1,
        'seed':5
    }

    num_round = 1000
    xgb_params['nthread'] = 24
    evallist = [(dval, 'eval')]

    # Train for up to num_round rounds, stopping once the 'eval' RMSE has not
    # improved for 40 rounds; the metric is logged every 10 rounds.
    bst = xgb.train(xgb_params, dtrain, num_round, evallist, early_stopping_rounds=40, verbose_eval=10)
    
    # Despite their names, train_rmse / val_rmse hold the *mean squared error*
    # here; the square root is only taken in the final print.
    # NOTE(review): presumably bst.predict scores with every trained round
    # rather than the best early-stopping iteration -- confirm for the
    # installed xgboost version.
    if (isLog1p):
        train_rmse = mean_squared_error(ylog_train, bst.predict(dtrain))
        val_rmse = mean_squared_error(ylog_val, bst.predict(dval))
    else:
        # Targets were not log-transformed above, so move both targets and
        # predictions to the log1p scale before scoring.
        train_rmse = mean_squared_error(np.log1p(ylog_train), np.log1p(bst.predict(dtrain)))
        val_rmse = mean_squared_error(np.log1p(ylog_val), np.log1p(bst.predict(dval)))
    print 'train_rmse =', np.sqrt(train_rmse), ', val_rmse =', np.sqrt(val_rmse)