In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import copy
warnings.filterwarnings('ignore')
from matplotlib import style
style.use("ggplot")
%matplotlib inline

from scipy import sparse

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder 

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import chi2, SelectPercentile,f_classif
import lightgbm as lgb

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from itertools import product
 
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new


In [4]:
def readData():
    train = pd.read_csv("../Data/train.csv")
    test = pd.read_csv("../Data/test.csv")
    print(train.shape,test.shape)
    columns = ["time","department","rent_num","floor","total_floor","square","direction","living_state","num_bedroom",
          "num_living_room","num_bath_room","rent_type","district","position","metro_line","station","distance","decoration","month_rent"]
    #------------------------------
    train.columns = columns
    test.columns = ["id"] + columns[:-1]
    #-------------------------------
    train = train.drop("time",axis=1).reset_index()
    train = train.rename(columns = {"index":"id"})
    test["month_rent"] = -1
    data = pd.concat([train,test[train.columns]],axis=0)
    data = data.reset_index(drop=True)
    return data

In [5]:
def basicClean(df=None,is_del=True):
    data = copy.deepcopy(df)
    print("handle the null feature")
    if is_del:
        data = data.drop(["living_state","rent_type","decoration"],axis=1)
    else:
        for i in ["living_state","rent_type","decoration"]:
            data[i] = pd.factorize(data[i])[0]
    #-----异常值清洗square
    #data.loc[data["square"]>=0.1,"square"] = data.loc[data["square"]<0.1,"square"].mean()

    #--------空值处理-------------
    rent_median = data["rent_num"].median()
    print(rent_median)
    data["rent_num"] = data["rent_num"].fillna(data["rent_num"].median())
    data["distance"] = data["distance"].fillna(0)
    data["metro_line"] = data["metro_line"].fillna("none")
    data["station"] = data["station"].fillna("none")
    data["district"]  =data["district"].fillna("none")
    data["position"]  =data["position"].fillna("none")
    #-----log变换-----------
    log_cols = ["rent_num","square","distance","total_floor"]
    print(data.dtypes)
    for i in log_cols:
        data['log_'+i] = data[i].map(lambda x:np.log(x+0.0000001))
    #顺序变化
    enc_cols =["department","direction","position","station","metro_line","district"]
    for i in enc_cols:
        data[i] = pd.factorize(data[i])[0]
    return data  

In [6]:
def getDirection(df):
    '''
    将朝向拆开
    '''
    data = copy.deepcopy(df)
    for i,j in zip(['东南', '东', '西北', '西南', '北', '南', '西', '东北'],
     ["east_south","east","west_north","west_south","north","south","west","east_north"]):
        print(i)
        data["is_"+j] = data["direction"].map(lambda x:1 if i in x else 0)
    columns = list(filter(lambda x:"is_" in x,data.columns))
    return data[columns]

In [7]:
def cleanRoomNum(df):
    '''
    清洗客房、卧室、卫的数量
    '''
    #卫数
    data = copy.deepcopy(df)
    data["num_bath_room"] = data["num_bath_room"].map(lambda x:3 if x not in [1,2] else x) #3代表除了卫生间有1、2个
    data["num_bedroom"] = data["num_bedroom"].map(lambda x: 5 if x not in [1,2,3,4] else x) #5代表5个卧室以上
    data["num_living_room"] = data["num_living_room"].map(lambda x: 3 if x not in [0,1,2] else x)
    
    #组合，即xx 房 xx厅 xx卫
    data["bath_bed_living"] = data["num_bath_room"].astype("str") + "_" +  data["num_bedroom"].astype("str") + "_" + data["num_living_room"].astype("str")
    data["bath_bed_living"] = pd.factorize(data["bath_bed_living"])[0]
    columns = ["num_bath_room","num_bedroom","num_living_room","bath_bed_living"]
    return data[columns]

In [8]:
def countFeature(df=None):
    data = copy.deepcopy(df)
    columns = ["month_rent","department","district","position","direction"]
    data = data[columns]
    for i in columns[1:]:
        tmp = data.groupby(i,as_index=False)["month_rent"].count().rename(columns = {"month_rent":"{0}_cnt".format(i)})
        data = data.merge(tmp,how="left",on=i)
    cnt_cols = list(filter(lambda x:"cnt" in x,data.columns))
    return data[cnt_cols]

In [9]:
from sklearn.cross_validation import StratifiedKFold
def cvMeanEncoding(df,fea):
    data = copy.deepcopy(df)
    data = data.reset_index().rename(columns = {"index":"order"})
    new_train = None #用于存储
    train_data = data[data["month_rent"]!=-1]
    test_data = data[data["month_rent"]==-1]
    y = train_data["month_rent"]
    test_data[fea+"_mean_target"] = 0
    #cv-folds 的策略进行编码
    skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
    for i, (train_index, val_index) in enumerate(skf):
        print(i)
        train,val = train_data.iloc[train_index],train_data.iloc[val_index]
        maper = train.groupby(fea)["month_rent"].median()
        means = val[fea].map(maper)
        val[fea+"_mean_target"] = means
        new_train = pd.concat([new_train,val],axis=0,ignore_index=True)
        test_data[fea+"_mean_target"]= test_data[fea+"_mean_target"] + test_data[fea].map(maper)
    test_data[fea+"_mean_target"] = test_data[fea+"_mean_target"]/5
    data = pd.concat([train_data,test_data],axis=0,ignore_index=True)
    #全局平均值
    g_means = data[fea+"_mean_target"].median()
    data[fea+"_mean_target"]  = data[fea+"_mean_target"].fillna(g_means)
    data = data.sort_values(by="order")
    data = data.drop("order",axis=1)
    return data
    

In [10]:
def goupbyMeanEncoding(df,fea1,fea2):
    data = copy.deepcopy(df)
    data = data.reset_index().rename(columns = {"index":"order"})
    new_train = None #用于存储
    train_data = data[data["month_rent"]!=-1]
    test_data = data[data["month_rent"]==-1]
    y = train_data["month_rent"]
    colname = fea1+"_"+fea2+"_mean_target"
    test_data[colname] = 0
    print(colname)
    #cv-folds 的策略进行编码
    skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
    for i, (train_index, val_index) in enumerate(skf):
        print(i)
        train,val = train_data.iloc[train_index],train_data.iloc[val_index]
        maper = train.groupby([fea1,fea2],as_index=False)["month_rent"].median()
        maper.columns = [fea1,fea2,colname]
        val = pd.merge(val,maper,on=[fea1,fea2])
        new_train = pd.concat([new_train,val],axis=0,ignore_index=True)
        #对于测试集来说
        tmp = test_data.drop(colname,axis=1).merge(maper,on=[fea1,fea2],how="left")
        test_data[colname]= test_data[colname].values + tmp[colname].values
    test_data[colname] = test_data[colname]/5
    data = pd.concat([train_data,test_data],axis=0,ignore_index=True)
    #全局平均值
    g_means = data[colname].median()
    data[colname]  = data[colname].fillna(g_means)
    data = data.sort_values(by="order")
    data = data.drop("order",axis=1)
    return data
    

In [11]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v3.0.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
# #         #反解密
# #         data["total_floor"] = data["total_floor"].map(getTotalFloor)
# #         data["rent_num"] = data["rent_num"].map(getRentNum) 
#         #平均值编码1
#         for i in ["department","position","direction","district","station"]:
#             print(i)
#             data = cvMeanEncoding(data,i)
            
#         for i,j in (["department","district"],["station","position"]):
#             data = goupbyMeanEncoding(data,i,j)
        
#         #平均值编码2
#         train = data[data["month_rent"]!=-1]
#         test = data[data["month_rent"]==-1]
#         meanEncoder = MeanEncoder(categorical_features=["department","position","direction","district","station"],target_type = "regression",n_splits=10)
#         new_train = meanEncoder.fit_transform(train.drop("month_rent",axis=1),train["month_rent"])
#         new_test = meanEncoder.transform(test.drop("month_rent",axis=1))
#         #
#         new_train["month_rent"] = train["month_rent"].values
#         new_test["month_rent"] =-1
#         data = pd.concat([new_train,new_test[new_train.columns]],axis=0,ignore_index=True)
#         print(data.columns)
#         drop_cols = ["rent_num","square","distance","total_floor"]
#         data = data.drop(drop_cols,axis=1)
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [22]:
if __name__ == '__main__':
    mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(252818, 32)
保存在 ../Feature/_featureEnineering_v3.0.pickle


'F:\\机器学习\\机器学习大赛\\月租金大赛\\Script'

In [23]:
data = pd.read_pickle("../Feature/_featureEnineering_v3.0.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 31) (196539,) (56279, 31)


In [24]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.06336	valid's rmse: 6.21511
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.85183	valid's rmse: 6.00149
[3]	train's rmse: 5.65045	valid's rmse: 5.8012
[4]	train's rmse: 5.4612	valid's rmse: 5.61227
[5]	train's rmse: 5.28265	valid's rmse: 5.43409
[6]	train's rmse: 5.11669	valid's rmse: 5.26773
[7]	train's rmse: 4.96129	valid's rmse: 5.11163
[8]	train's rmse: 4.81512	valid's rmse: 4.96613
[9]	train's rmse: 4.67675	valid's rmse: 4.82702
[10]	train's rmse: 4.54693	valid's rmse: 4.69603
[11]	train's rmse: 4.42429	valid's rmse: 4.57456
[12]	train's rmse: 4.30963	valid's rmse: 4.45903
[13]	train's rmse: 4.20229	valid's rmse: 4.35374
[14]	train's rmse: 4.10228	valid's rmse: 4.25429
[15]	train's rmse: 4.00695	valid's rmse: 4.16107
[16]	train's rmse: 3.9183	valid's rmse: 4.07335
[17]	train's rmse: 3.83328	valid's rmse: 3.98698
[18]	train's rmse: 3.75438	valid's rmse: 3.90888
[19]	train's rmse: 3.67905	valid's rmse: 3.83269
[20]	train'

In [25]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.515_v3.0.csv",index=False,encoding="utf-8",sep=",")

##### 添加布尔特征

In [12]:
def bool_feature():
    data = readData()
    columns = ["decoration","station","living_state","rent_type"]
    data = data[columns]
    for i in columns:
        data["is_has_"+i] = data[i].map(lambda x:0 if str(x)=='nan' else 1)
        data = data.drop(i,axis=1)
    return data

In [29]:
bool_fea = bool_feature()

(196539, 19) (56279, 19)


In [32]:
data = pd.read_pickle("../Feature/_featureEnineering_v3.0.pickle")
data = pd.concat([data,bool_fea],axis=1)
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 35) (196539,) (56279, 35)


In [33]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.05103	valid's rmse: 6.20146
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.83787	valid's rmse: 5.99019
[3]	train's rmse: 5.6383	valid's rmse: 5.79313
[4]	train's rmse: 5.4503	valid's rmse: 5.60525
[5]	train's rmse: 5.27396	valid's rmse: 5.42964
[6]	train's rmse: 5.11144	valid's rmse: 5.26663
[7]	train's rmse: 4.95668	valid's rmse: 5.11249
[8]	train's rmse: 4.81214	valid's rmse: 4.96892
[9]	train's rmse: 4.67736	valid's rmse: 4.83517
[10]	train's rmse: 4.54582	valid's rmse: 4.70573
[11]	train's rmse: 4.42316	valid's rmse: 4.58317
[12]	train's rmse: 4.30966	valid's rmse: 4.47043
[13]	train's rmse: 4.19956	valid's rmse: 4.36072
[14]	train's rmse: 4.09964	valid's rmse: 4.26047
[15]	train's rmse: 4.00442	valid's rmse: 4.16734
[16]	train's rmse: 3.91482	valid's rmse: 4.07732
[17]	train's rmse: 3.83189	valid's rmse: 3.99467
[18]	train's rmse: 3.75374	valid's rmse: 3.91734
[19]	train's rmse: 3.68117	valid's rmse: 3.84411
[20]	train

In [34]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.4013_v3.1.csv",index=False,encoding="utf-8",sep=",")

##### 平均值编码

In [13]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v4.0_meanencoding.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        bool_fea = bool_feature()
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
        data = pd.concat([data,bool_fea],axis=1)
# #         #反解密
# #         data["total_floor"] = data["total_floor"].map(getTotalFloor)
# #         data["rent_num"] = data["rent_num"].map(getRentNum) 
#         #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
        
#         #平均值编码2
#         train = data[data["month_rent"]!=-1]
#         test = data[data["month_rent"]==-1]
#         meanEncoder = MeanEncoder(categorical_features=["department","position","direction","district","station"],target_type = "regression",n_splits=10)
#         new_train = meanEncoder.fit_transform(train.drop("month_rent",axis=1),train["month_rent"])
#         new_test = meanEncoder.transform(test.drop("month_rent",axis=1))
#         #
#         new_train["month_rent"] = train["month_rent"].values
#         new_test["month_rent"] =-1
#         data = pd.concat([new_train,new_test[new_train.columns]],axis=0,ignore_index=True)
#         print(data.columns)
#         drop_cols = ["rent_num","square","distance","total_floor"]
#         data = data.drop(drop_cols,axis=1)
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [37]:
mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(196539, 19) (56279, 19)
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
(252818, 43)
保存在 ../Feature/_featureEnineering_v4.0_meanencoding.pickle


In [14]:
data = pd.read_pickle("../Feature/_featureEnineering_v4.0_meanencoding.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

NameError: name 'bool_fea' is not defined

In [39]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.05257	valid's rmse: 6.20612
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.84471	valid's rmse: 5.99586
[3]	train's rmse: 5.64454	valid's rmse: 5.7969
[4]	train's rmse: 5.45678	valid's rmse: 5.61045
[5]	train's rmse: 5.27934	valid's rmse: 5.4334
[6]	train's rmse: 5.11428	valid's rmse: 5.26806
[7]	train's rmse: 4.95899	valid's rmse: 5.1126
[8]	train's rmse: 4.81346	valid's rmse: 4.96424
[9]	train's rmse: 4.67541	valid's rmse: 4.82722
[10]	train's rmse: 4.54829	valid's rmse: 4.69948
[11]	train's rmse: 4.42614	valid's rmse: 4.57986
[12]	train's rmse: 4.31221	valid's rmse: 4.46682
[13]	train's rmse: 4.20341	valid's rmse: 4.35951
[14]	train's rmse: 4.10344	valid's rmse: 4.25863
[15]	train's rmse: 4.00735	valid's rmse: 4.1643
[16]	train's rmse: 3.91972	valid's rmse: 4.0769
[17]	train's rmse: 3.83722	valid's rmse: 3.99527
[18]	train's rmse: 3.75871	valid's rmse: 3.91745
[19]	train's rmse: 3.68593	valid's rmse: 3.8452
[20]	train's r

In [40]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.4005_v4.0.csv",index=False,encoding="utf-8",sep=",")

#### 根据逻辑口径

In [15]:
data.columns

Index(['bath_bed_living', 'department', 'department_cnt',
       'department_district_mean_target', 'department_mean_target',
       'direction', 'direction_cnt', 'direction_mean_target', 'distance',
       'district', 'district_cnt', 'district_mean_target', 'floor', 'is_east',
       'is_east_north', 'is_east_south', 'is_has_decoration',
       'is_has_living_state', 'is_has_rent_type', 'is_has_station', 'is_north',
       'is_south', 'is_west', 'is_west_north', 'is_west_south', 'log_distance',
       'log_rent_num', 'log_square', 'log_total_floor', 'metro_line',
       'month_rent', 'num_bath_room', 'num_bedroom', 'num_living_room',
       'position', 'position_cnt', 'position_mean_target', 'rent_num',
       'square', 'station', 'station_mean_target',
       'station_position_mean_target', 'total_floor'],
      dtype='object')

In [14]:
def logicFea():
    data = readData()
    data["bath_living"] = data["num_bath_room"] + data["num_living_room"]
    data["bed_bath"] = data["num_bedroom"] + data["num_bath_room"]
    data["total_room"] = data["num_bath_room"] + data["num_living_room"] + data["num_bedroom"]
    
    #平均占地面积
    data["average_square"] = data["square"] / data["total_room"]
    return data[["bath_living","bed_bath","total_room","average_square"]]

In [38]:
data = pd.read_pickle("../Feature/_featureEnineering_v4.0_meanencoding.pickle")
logic_fea = logicFea()
data = pd.concat([data,logic_fea],axis=1)
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 19) (56279, 19)
(196539, 46) (196539,) (56279, 46)


In [39]:
data.columns

Index(['bath_bed_living', 'department', 'department_cnt',
       'department_district_mean_target', 'department_mean_target',
       'direction', 'direction_cnt', 'direction_mean_target', 'distance',
       'district', 'district_cnt', 'district_mean_target', 'floor', 'is_east',
       'is_east_north', 'is_east_south', 'is_has_decoration',
       'is_has_living_state', 'is_has_rent_type', 'is_has_station', 'is_north',
       'is_south', 'is_west', 'is_west_north', 'is_west_south', 'log_distance',
       'log_rent_num', 'log_square', 'log_total_floor', 'metro_line',
       'month_rent', 'num_bath_room', 'num_bedroom', 'num_living_room',
       'position', 'position_cnt', 'position_mean_target', 'rent_num',
       'square', 'station', 'station_mean_target',
       'station_position_mean_target', 'total_floor', 'bath_living',
       'bed_bath', 'total_room', 'average_square'],
      dtype='object')

In [40]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.05192	valid's rmse: 6.20507
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.8434	valid's rmse: 5.99527
[3]	train's rmse: 5.64338	valid's rmse: 5.79648
[4]	train's rmse: 5.45556	valid's rmse: 5.61031
[5]	train's rmse: 5.27779	valid's rmse: 5.43201
[6]	train's rmse: 5.11112	valid's rmse: 5.26552
[7]	train's rmse: 4.95464	valid's rmse: 5.10962
[8]	train's rmse: 4.80913	valid's rmse: 4.96205
[9]	train's rmse: 4.67154	valid's rmse: 4.82648
[10]	train's rmse: 4.54433	valid's rmse: 4.69944
[11]	train's rmse: 4.42055	valid's rmse: 4.57735
[12]	train's rmse: 4.30697	valid's rmse: 4.46461
[13]	train's rmse: 4.19644	valid's rmse: 4.35426
[14]	train's rmse: 4.09564	valid's rmse: 4.2535
[15]	train's rmse: 3.99847	valid's rmse: 4.159
[16]	train's rmse: 3.91018	valid's rmse: 4.07078
[17]	train's rmse: 3.82672	valid's rmse: 3.98761
[18]	train's rmse: 3.74778	valid's rmse: 3.90776
[19]	train's rmse: 3.6732	valid's rmse: 3.83571
[20]	train's 

In [41]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.3618_v4.1.csv",index=False,encoding="utf-8",sep=",")

#### 重写计数 （目前最好分数--20181121 0:35）

In [88]:
def countFeature(df=None):
    data = copy.deepcopy(df)
    columns = ["month_rent","department","district","position","direction","floor","station"]
    data = data[columns]
    for i in columns[1:]:
        tmp = data.groupby(i,as_index=False)["month_rent"].count().rename(columns = {"month_rent":"{0}_cnt".format(i)})
        data = data.merge(tmp,how="left",on=i)
    cnt_cols = list(filter(lambda x:"cnt" in x,data.columns))
    return data[cnt_cols]

In [89]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v5.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        bool_fea = bool_feature()
        logic_fea = logicFea()
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
        data = pd.concat([data,bool_fea],axis=1)
        data = pd.concat([data,logic_fea],axis=1)
# #         #反解密
# #         data["total_floor"] = data["total_floor"].map(getTotalFloor)
# #         data["rent_num"] = data["rent_num"].map(getRentNum) 
#         #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
            
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [90]:
mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(196539, 19) (56279, 19)
(196539, 19) (56279, 19)
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
(252818, 49)
保存在 ../Feature/_featureEnineering_v5.pickle


In [91]:
data = pd.read_pickle("../Feature/_featureEnineering_v5.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 48) (196539,) (56279, 48)


In [92]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.05213	valid's rmse: 6.20632
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.83899	valid's rmse: 5.9941
[3]	train's rmse: 5.63942	valid's rmse: 5.79583
[4]	train's rmse: 5.45161	valid's rmse: 5.60905
[5]	train's rmse: 5.27449	valid's rmse: 5.43292
[6]	train's rmse: 5.10872	valid's rmse: 5.26491
[7]	train's rmse: 4.95295	valid's rmse: 5.1085
[8]	train's rmse: 4.80761	valid's rmse: 4.96215
[9]	train's rmse: 4.67029	valid's rmse: 4.82461
[10]	train's rmse: 4.54563	valid's rmse: 4.70068
[11]	train's rmse: 4.42226	valid's rmse: 4.57831
[12]	train's rmse: 4.30649	valid's rmse: 4.4633
[13]	train's rmse: 4.19762	valid's rmse: 4.35725
[14]	train's rmse: 4.09606	valid's rmse: 4.2571
[15]	train's rmse: 3.99861	valid's rmse: 4.1605
[16]	train's rmse: 3.90931	valid's rmse: 4.07086
[17]	train's rmse: 3.82555	valid's rmse: 3.98888
[18]	train's rmse: 3.74644	valid's rmse: 3.9108
[19]	train's rmse: 3.66884	valid's rmse: 3.83358
[20]	train's r

In [93]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.3560_v5.1.csv",index=False,encoding="utf-8",sep=",")

#### 重写逻辑特征（线下rmse下降，线上rmse上升）

In [95]:
def logicFea():
    data = readData()
    data["bath_living"] = data["num_bath_room"] + data["num_living_room"]
    data["bed_bath"] = data["num_bedroom"] + data["num_bath_room"]
    data["total_room"] = data["num_bath_room"] + data["num_living_room"] + data["num_bedroom"]
    
    #平均占地面积
    data["average_square"] = data["square"] / data["total_room"]
    #楼层比例
    data["floor/total_floor"] = data["floor"] / data["total_floor"]
    return data[["bath_living","bed_bath","total_room","average_square","floor/total_floor"]]

In [96]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v6.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        bool_fea = bool_feature()
        logic_fea = logicFea()
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
        data = pd.concat([data,bool_fea],axis=1)
        data = pd.concat([data,logic_fea],axis=1)
# #         #反解密
# #         data["total_floor"] = data["total_floor"].map(getTotalFloor)
# #         data["rent_num"] = data["rent_num"].map(getRentNum) 
#         #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
            
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [97]:
mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(196539, 19) (56279, 19)
(196539, 19) (56279, 19)
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
(252818, 50)
保存在 ../Feature/_featureEnineering_v6.pickle


In [98]:
data = pd.read_pickle("../Feature/_featureEnineering_v6.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 49) (196539,) (56279, 49)


In [99]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.052	valid's rmse: 6.20505
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.83957	valid's rmse: 5.99376
[3]	train's rmse: 5.64145	valid's rmse: 5.79832
[4]	train's rmse: 5.45455	valid's rmse: 5.60972
[5]	train's rmse: 5.27909	valid's rmse: 5.43234
[6]	train's rmse: 5.11232	valid's rmse: 5.26648
[7]	train's rmse: 4.95589	valid's rmse: 5.1105
[8]	train's rmse: 4.80979	valid's rmse: 4.96236
[9]	train's rmse: 4.66855	valid's rmse: 4.82255
[10]	train's rmse: 4.53567	valid's rmse: 4.68778
[11]	train's rmse: 4.41317	valid's rmse: 4.56427
[12]	train's rmse: 4.29929	valid's rmse: 4.45083
[13]	train's rmse: 4.18895	valid's rmse: 4.34128
[14]	train's rmse: 4.09025	valid's rmse: 4.2431
[15]	train's rmse: 3.99383	valid's rmse: 4.15025
[16]	train's rmse: 3.90141	valid's rmse: 4.05681
[17]	train's rmse: 3.81496	valid's rmse: 3.97041
[18]	train's rmse: 3.73732	valid's rmse: 3.89273
[19]	train's rmse: 3.66093	valid's rmse: 3.81745
[20]	train's

In [100]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.3397_v6.1.csv",index=False,encoding="utf-8",sep=",")

##### 再次重写(线上降分）

In [101]:
def logicFea():
    data = readData()
    data["bath_living"] = data["num_bath_room"] + data["num_living_room"]
    data["bed_bath"] = data["num_bedroom"] + data["num_bath_room"]
    data["total_room"] = data["num_bath_room"] + data["num_living_room"] + data["num_bedroom"]
    
    #平均占地面积
    data["average_square"] = data["square"] / data["total_room"]
    #楼层比例
    data["floor/total_floor"] = data["floor"] / data["total_floor"]
    data['to_middle_foloor'] = abs(data['floor'] - data['total_floor']/ 0.018182 * 0.5)
    return data[["bath_living","bed_bath","total_room","average_square","floor/total_floor","to_middle_foloor"]]

In [102]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v6.1.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        bool_fea = bool_feature()
        logic_fea = logicFea()
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
        data = pd.concat([data,bool_fea],axis=1)
        data = pd.concat([data,logic_fea],axis=1)
# #         #反解密
# #         data["total_floor"] = data["total_floor"].map(getTotalFloor)
# #         data["rent_num"] = data["rent_num"].map(getRentNum) 
#         #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
            
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [103]:
mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(196539, 19) (56279, 19)
(196539, 19) (56279, 19)
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
(252818, 51)
保存在 ../Feature/_featureEnineering_v6.1.pickle


In [104]:
data = pd.read_pickle("../Feature/_featureEnineering_v6.1.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 50) (196539,) (56279, 50)


In [105]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.0518	valid's rmse: 6.20558
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.83932	valid's rmse: 5.99342
[3]	train's rmse: 5.63983	valid's rmse: 5.79423
[4]	train's rmse: 5.45239	valid's rmse: 5.60806
[5]	train's rmse: 5.27504	valid's rmse: 5.42959
[6]	train's rmse: 5.10877	valid's rmse: 5.26288
[7]	train's rmse: 4.95238	valid's rmse: 5.10453
[8]	train's rmse: 4.80725	valid's rmse: 4.95758
[9]	train's rmse: 4.6683	valid's rmse: 4.8193
[10]	train's rmse: 4.536	valid's rmse: 4.68691
[11]	train's rmse: 4.41557	valid's rmse: 4.56644
[12]	train's rmse: 4.30122	valid's rmse: 4.45316
[13]	train's rmse: 4.18984	valid's rmse: 4.3421
[14]	train's rmse: 4.09008	valid's rmse: 4.24207
[15]	train's rmse: 3.99259	valid's rmse: 4.14503
[16]	train's rmse: 3.90509	valid's rmse: 4.0578
[17]	train's rmse: 3.81905	valid's rmse: 3.9731
[18]	train's rmse: 3.74084	valid's rmse: 3.89495
[19]	train's rmse: 3.66345	valid's rmse: 3.81889
[20]	train's rms

In [106]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.3378_v6.1.1.csv",index=False,encoding="utf-8",sep=",")

##### 添20181121的对district 进行排序赋值

In [39]:
def rankFeature(fea):
    data = readData()
    data = basicClean(data)
    data = data[[fea,"month_rent"]]
    train = data[data["month_rent"]!=-1]
    tmp = train.groupby(fea,as_index=False)[["month_rent"]].median()
    tmp["rank"] = a["month_rent"].rank()
    maper = pd.Series(tmp["rank"],index=tmp[fea])
    #train
    train[fea+"_rank"] = train[fea].map(maper)
    test[fea+"_rank"] = test[fea].map(maper)
    data = pd.concat([train,test],axis=0,ignore_index=True)
    return data[[fea+"_rank"]]

In [48]:
rank_fea = rankFeature("district")
data = pd.read_pickle("../Feature/_featureEnineering_v5.pickle")
print(data.shape)
print(rank_fea.shape)
data = pd.concat([data,rank_fea],axis=1)

(196539, 19) (56279, 19)
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(252818, 49)
(252818, 1)


In [43]:
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(449357, 49) (449357,) (56279, 49)


In [44]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

KeyboardInterrupt: 

#### xgboost

In [16]:
import xgboost as xgb

In [26]:
data = pd.read_pickle("../Feature/_featureEnineering_v5.pickle").sample(frac=0.01).reset_index(drop=True)
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(1954, 48) (1954,) (574, 48)


In [37]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    model = xgb.XGBRegressor(learning_rate=0.05, n_estimators=1000,booster='gbtree',max_depth=10,
                             seed=2018,num_thread=-1,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,reg_lambda=0.20)
    #
    #------------------------------------#
    xgb_model = model.fit(X[train_index], y[train_index],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(xgb_model.best_score)
    loss += xgb_model.best_score
    test_pred= xgb_model.predict(X_test, ntree_limit=xgb_model.best_iteration)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[0]	validation_0-rmse:9.4427	validation_1-rmse:9.70342
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 100 rounds.
[1]	validation_0-rmse:9.0157	validation_1-rmse:9.28747
[2]	validation_0-rmse:8.6219	validation_1-rmse:8.9479
[3]	validation_0-rmse:8.23211	validation_1-rmse:8.62327
[4]	validation_0-rmse:7.85508	validation_1-rmse:8.27279
[5]	validation_0-rmse:7.49494	validation_1-rmse:7.95865
[6]	validation_0-rmse:7.1498	validation_1-rmse:7.67953
[7]	validation_0-rmse:6.83646	validation_1-rmse:7.42823
[8]	validation_0-rmse:6.53843	validation_1-rmse:7.19318
[9]	validation_0-rmse:6.24117	validation_1-rmse:6.97523
[10]	validation_0-rmse:5.96928	validation_1-rmse:6.76574
[11]	validation_0-rmse:5.71147	validation_1-rmse:6.58553
[12]	validation_0-rmse:5.47291	validation_1-rmse:6.40388
[13]	validation_0-rmse:5.23106	validation_1-rmse:6.22976
[14]	validation_0-rmse:5.007	validation_1-rmse:6.0

In [36]:
?xgb_model.predict

array([[ 7.2738376,  4.052149 ,  7.772101 , ...,  8.147953 , 12.519183 ,
         5.780589 ],
       [ 8.990437 ,  3.323268 , 10.979627 , ...,  8.72552  , 12.715401 ,
         5.4123545],
       [ 8.852818 ,  3.419693 ,  9.8699665, ...,  8.195219 , 13.667052 ,
         6.303879 ],
       [ 7.8261533,  3.7398946,  9.817156 , ...,  9.915251 , 13.398383 ,
         5.722551 ],
       [ 6.517917 ,  4.150977 ,  7.199833 , ...,  7.481711 ,  9.976807 ,
         5.231155 ]], dtype=float32)