In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import copy
warnings.filterwarnings('ignore')
from matplotlib import style
style.use("ggplot")
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder 

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import chi2, SelectPercentile,f_classif
import lightgbm as lgb



In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from itertools import product
 
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new


In [3]:
def readData():
    train = pd.read_csv("../Data/train.csv")
    test = pd.read_csv("../Data/test.csv")
    print(train.shape,test.shape)
    columns = ["time","department","rent_num","floor","total_floor","square","direction","living_state","num_bedroom",
          "num_living_room","num_bath_room","rent_type","district","position","metro_line","station","distance","decoration","month_rent"]
    #------------------------------
    train.columns = columns
    test.columns = ["id"] + columns[:-1]
    #-------------------------------
    train = train.drop("time",axis=1).reset_index()
    train = train.rename(columns = {"index":"id"})
    test["month_rent"] = -1
    data = pd.concat([train,test[train.columns]],axis=0)
    data = data.reset_index(drop=True)
    return data

In [4]:
def basicClean(df=None,is_del=True):
    data = copy.deepcopy(df)
    print("handle the null feature")
    if is_del:
        data = data.drop(["living_state","rent_type","decoration"],axis=1)
    else:
        pass
    #-----异常值清洗square
    #data.loc[data["square"]>=0.1,"square"] = data.loc[data["square"]<0.1,"square"].mean()

    #--------空值处理-------------
    rent_median = data["rent_num"].median()
    print(rent_median)
    data["rent_num"] = data["rent_num"].fillna(data["rent_num"].median())
    data["distance"] = data["distance"].fillna(0)
    data["metro_line"] = data["metro_line"].fillna("none")
    data["station"] = data["station"].fillna("none")
    data["district"]  =data["district"].fillna("none")
    data["position"]  =data["position"].fillna("none")
    #-----log变换-----------
    log_cols = ["rent_num","square","distance","total_floor"]
    print(data.dtypes)
    for i in log_cols:
        data['log_'+i] = data[i].map(lambda x:np.log(x+0.0000001))
    #顺序变化
    enc_cols =["department","direction","position","station","metro_line","district"]
    for i in enc_cols:
        data[i] = pd.factorize(data[i])[0]
    return data  

In [5]:
def getDirection(df):
    '''
    将朝向拆开
    '''
    data = copy.deepcopy(df)
    for i,j in zip(['东南', '东', '西北', '西南', '北', '南', '西', '东北'],
     ["east_south","east","west_north","west_south","north","south","west","east_north"]):
        print(i)
        data["is_"+j] = data["direction"].map(lambda x:1 if i in x else 0)
    columns = list(filter(lambda x:"is_" in x,data.columns))
    return data[columns]

In [6]:
def cleanRoomNum(df):
    '''
    清洗客房、卧室、卫的数量
    '''
    #卫数
    data["num_bath_room"] = data["num_bath_room"].map(lambda x:3 if x not in [1,2] else x) #3代表除了卫生间有1、2个
    data["num_bedroom"] = data["num_bedroom"].map(lambda x: 5 if x not in [1,2,3,4] else x) #5代表5个卧室以上
    data["num_living_room"] = data["num_living_room"].map(lambda x: 3 if x not in [0,1,2] else x)
    
    #组合，即xx 房 xx厅 xx卫
    data["bath_bed_living"] = data["num_bath_room"].astype("str") + "_" +  data["num_bedroom"].astype("str") + "_" + data["num_living_room"].astype("str")
    data["bath_bed_living"] = pd.factorize(data["bath_bed_living"])[0]
    columns = ["num_bath_room","num_bedroom","num_living_room","bath_bed_living"]
    return data[columns]

In [7]:
def countFeature(df=None):
    data = copy.deepcopy(df)
    columns = ["month_rent","department","district","position","direction"]
    data = data[columns]
    for i in columns[1:]:
        tmp = data.groupby(i,as_index=False)["month_rent"].count().rename(columns = {"month_rent":"{0}_cnt".format(i)})
        data = data.merge(tmp,how="left",on=i)
    cnt_cols = list(filter(lambda x:"cnt" in x,data.columns))
    return data[cnt_cols]

##### 增加平均值编码 cvloop

In [8]:
from sklearn.cross_validation import StratifiedKFold
def cvMeanEncoding(df,fea):
    data = copy.deepcopy(df)
    data = data.reset_index().rename(columns = {"index":"order"})
    new_train = None #用于存储
    train_data = data[data["month_rent"]!=-1]
    test_data = data[data["month_rent"]==-1]
    y = train_data["month_rent"]
    test_data[fea+"_mean_target"] = 0
    #cv-folds 的策略进行编码
    skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
    for i, (train_index, val_index) in enumerate(skf):
        print(i)
        train,val = train_data.iloc[train_index],train_data.iloc[val_index]
        maper = train.groupby(fea)["month_rent"].median()
        means = val[fea].map(maper)
        val[fea+"_mean_target"] = means
        new_train = pd.concat([new_train,val],axis=0,ignore_index=True)
        test_data[fea+"_mean_target"]= test_data[fea+"_mean_target"] + test_data[fea].map(maper)
    test_data[fea+"_mean_target"] = test_data[fea+"_mean_target"]/5
    data = pd.concat([train_data,test_data],axis=0,ignore_index=True)
    #全局平均值
    g_means = data[fea+"_mean_target"].median()
    data[fea+"_mean_target"]  = data[fea+"_mean_target"].fillna(g_means)
    data = data.sort_values(by="order")
    data = data.drop("order",axis=1)
    return data
    

In [9]:
def goupbyMeanEncoding(df,fea1,fea2):
    data = copy.deepcopy(df)
    data = data.reset_index().rename(columns = {"index":"order"})
    new_train = None #用于存储
    train_data = data[data["month_rent"]!=-1]
    test_data = data[data["month_rent"]==-1]
    y = train_data["month_rent"]
    colname = fea1+"_"+fea2+"_mean_target"
    test_data[colname] = 0
    print(colname)
    #cv-folds 的策略进行编码
    skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
    for i, (train_index, val_index) in enumerate(skf):
        print(i)
        train,val = train_data.iloc[train_index],train_data.iloc[val_index]
        maper = train.groupby([fea1,fea2],as_index=False)["month_rent"].median()
        maper.columns = [fea1,fea2,colname]
        val = pd.merge(val,maper,on=[fea1,fea2])
        new_train = pd.concat([new_train,val],axis=0,ignore_index=True)
        #对于测试集来说
        tmp = test_data.drop(colname,axis=1).merge(maper,on=[fea1,fea2],how="left")
        test_data[colname]= test_data[colname].values + tmp[colname].values
    test_data[colname] = test_data[colname]/5
    data = pd.concat([train_data,test_data],axis=0,ignore_index=True)
    #全局平均值
    g_means = data[colname].median()
    data[colname]  = data[colname].fillna(g_means)
    data = data.sort_values(by="order")
    data = data.drop("order",axis=1)
    return data
    

In [10]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v1.1.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
#         #反解密
#         data["total_floor"] = data["total_floor"].map(getTotalFloor)
#         data["rent_num"] = data["rent_num"].map(getRentNum) 
        #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
        
        #平均值编码2
        train = data[data["month_rent"]!=-1]
        test = data[data["month_rent"]==-1]
        meanEncoder = MeanEncoder(categorical_features=["department","position","direction","district","station"],target_type = "regression",n_splits=10)
        new_train = meanEncoder.fit_transform(train.drop("month_rent",axis=1),train["month_rent"])
        new_test = meanEncoder.transform(test.drop("month_rent",axis=1))
        #
        new_train["month_rent"] = train["month_rent"].values
        new_test["month_rent"] =-1
        data = pd.concat([new_train,new_test[new_train.columns]],axis=0,ignore_index=True)
        print(data.columns)
        drop_cols = ["rent_num","square","distance","total_floor"]
        data = data.drop(drop_cols,axis=1)
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [77]:
if __name__ == '__main__':
    mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
Index(['bath_bed_living', 'department', 'department_cnt',
       'department_district_mean_target', 'department_mean_target',
       'direction', 'direction_cnt', 'direction_mean_target', 'distance',
       'district', 'district_cnt', 'district_mean_target', 'floor', 'is_east',
       'is_east_n

In [22]:
data = pd.read_pickle("../Feature/_featureEnineering_v1.1.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 39) (196539,) (56279, 39)


In [23]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
#logloss: [1.5509205465739135, 1.5498981448405582, 1.5717136753226184, 1.487721626321449, 1.4460505427984207] 1.521260907171392
    model = lgb.LGBMRegressor(objective='regression',num_leaves=126,
                              learning_rate=0.05, n_estimators=3000,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.03029	valid's rmse: 6.18166
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.79601	valid's rmse: 5.94393
[3]	train's rmse: 5.57478	valid's rmse: 5.71963
[4]	train's rmse: 5.37585	valid's rmse: 5.52094
[5]	train's rmse: 5.17785	valid's rmse: 5.32107
[6]	train's rmse: 5.00357	valid's rmse: 5.14875
[7]	train's rmse: 4.82664	valid's rmse: 4.96914
[8]	train's rmse: 4.67297	valid's rmse: 4.817
[9]	train's rmse: 4.51731	valid's rmse: 4.66035
[10]	train's rmse: 4.37131	valid's rmse: 4.51284
[11]	train's rmse: 4.23321	valid's rmse: 4.37455
[12]	train's rmse: 4.10462	valid's rmse: 4.24689
[13]	train's rmse: 3.9914	valid's rmse: 4.13715
[14]	train's rmse: 3.87827	valid's rmse: 4.02303
[15]	train's rmse: 3.77169	valid's rmse: 3.91681
[16]	train's rmse: 3.67043	valid's rmse: 3.81462
[17]	train's rmse: 3.57522	valid's rmse: 3.7199
[18]	train's rmse: 3.48656	valid's rmse: 3.63089
[19]	train's rmse: 3.41014	valid's rmse: 3.55932
[20]	train's

In [24]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_v2.2.csv",index=False,encoding="utf-8",sep=",")

In [26]:
submission= pd.DataFrame(predict[0]*0.15+predict[0]*0.15+predict[0]*0.1+predict[0]*0.3+predict[0]*0.3)
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_v2.3.csv",index=False,encoding="utf-8",sep=",")

#### stacking

In [2]:
data = pd.read_pickle("../Feature/_featureEnineering_v1.1.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 39) (196539,) (56279, 39)


#### 第一层

In [3]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
new_train = None
new_test = copy.deepcopy(test)
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
new_test["predict"] = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
        
    #训练集
    train_predict = lgb_model.predict(X[test_index], num_iteration=lgb_model.best_iteration_)
    train_tmp = train.iloc[test_index]
    train_tmp["predict"] = train_predict
    new_train = pd.concat([new_train,train_tmp],axis=0,ignore_index=False)
    
    #测试集
    new_test["predict"]  = new_test["predict"] + test_pred
new_test["predict"] = new_test["predict"]/5
print('logloss:', baseloss, loss/5)

new_test.to_pickle("../Feature/_featureEnineering_v1.1_stacking1_test.pickle")
new_train.to_pickle("../Feature/_featureEnineering_v1.1_stacking1_train.pickle")

Fold 0
[1]	train's rmse: 6.03039	valid's rmse: 6.1818
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.79616	valid's rmse: 5.94416
[3]	train's rmse: 5.57499	valid's rmse: 5.71976
[4]	train's rmse: 5.37617	valid's rmse: 5.52091
[5]	train's rmse: 5.17823	valid's rmse: 5.32107
[6]	train's rmse: 5.00416	valid's rmse: 5.14898
[7]	train's rmse: 4.82749	valid's rmse: 4.96927
[8]	train's rmse: 4.6739	valid's rmse: 4.8172
[9]	train's rmse: 4.51836	valid's rmse: 4.66085
[10]	train's rmse: 4.3724	valid's rmse: 4.51307
[11]	train's rmse: 4.23475	valid's rmse: 4.37522
[12]	train's rmse: 4.10676	valid's rmse: 4.24792
[13]	train's rmse: 3.99485	valid's rmse: 4.13908
[14]	train's rmse: 3.88144	valid's rmse: 4.02499
[15]	train's rmse: 3.77502	valid's rmse: 3.9189
[16]	train's rmse: 3.67393	valid's rmse: 3.81632
[17]	train's rmse: 3.579	valid's rmse: 3.72182
[18]	train's rmse: 3.49064	valid's rmse: 3.63345
[19]	train's rmse: 3.41426	valid's rmse: 3.56206
[20]	train's rm

In [4]:
new_test = pd.read_pickle("../Feature/_featureEnineering_v1.1_stacking1_test.pickle")
new_train = pd.read_pickle("../Feature/_featureEnineering_v1.1_stacking1_train.pickle")

In [5]:
new_test = new_test[new_train.columns]

##### 第二层

In [8]:
X = new_train.drop(["month_rent"],axis=1).values
y = new_train["month_rent"].values
X_test = new_test.drop(["month_rent"],axis=1).values
cols = new_test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 40) (196539,) (56279, 40)


In [9]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=100,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
        
print('logloss:', baseloss, loss/5)


Fold 0
[1]	train's rmse: 5.98658	valid's rmse: 6.13355
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.70943	valid's rmse: 5.85142
[3]	train's rmse: 5.44688	valid's rmse: 5.58288
[4]	train's rmse: 5.19884	valid's rmse: 5.3295
[5]	train's rmse: 4.96276	valid's rmse: 5.08808
[6]	train's rmse: 4.74016	valid's rmse: 4.86174
[7]	train's rmse: 4.52927	valid's rmse: 4.64623
[8]	train's rmse: 4.33071	valid's rmse: 4.4429
[9]	train's rmse: 4.14379	valid's rmse: 4.252
[10]	train's rmse: 3.96701	valid's rmse: 4.07158
[11]	train's rmse: 3.79932	valid's rmse: 3.8999
[12]	train's rmse: 3.64124	valid's rmse: 3.73746
[13]	train's rmse: 3.49256	valid's rmse: 3.58414
[14]	train's rmse: 3.37639	valid's rmse: 3.4666
[15]	train's rmse: 3.27189	valid's rmse: 3.35962
[16]	train's rmse: 3.14458	valid's rmse: 3.22849
[17]	train's rmse: 3.02543	valid's rmse: 3.10561
[18]	train's rmse: 2.91352	valid's rmse: 2.9911
[19]	train's rmse: 2.80887	valid's rmse: 2.88336
[20]	train's rm

In [21]:
submission= pd.DataFrame(predict[0]*0.3 + predict[1]*0.1 + predict[2]*0.15+predict[3]*0.3+predict[4]*0.15)
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_v2.1.csv",index=False,encoding="utf-8",sep=",")

##### 重写函数

##### 增加total_floor 和floor的cv

In [107]:
def countFeature(df=None):
    data = copy.deepcopy(df)
    columns = ["month_rent","department","district","position","direction","log_total_floor","floor"]
    data = data[columns]
    for i in columns[1:]:
        tmp = data.groupby(i,as_index=False)["month_rent"].count().rename(columns = {"month_rent":"{0}_cnt".format(i)})
        data = data.merge(tmp,how="left",on=i)
    cnt_cols = list(filter(lambda x:"cnt" in x,data.columns))
    return data[cnt_cols]

In [108]:
def mergeFeature():
    save_path= "../Feature/_featureEnineering_v1.2.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
#         #反解密
#         data["total_floor"] = data["total_floor"].map(getTotalFloor)
#         data["rent_num"] = data["rent_num"].map(getRentNum) 
        #平均值编码1
        for i in ["department","position","direction","district","station"]:
            print(i)
            data = cvMeanEncoding(data,i)
            
        for i,j in (["department","district"],["station","position"]):
            data = goupbyMeanEncoding(data,i,j)
        
        #平均值编码2
        train = data[data["month_rent"]!=-1]
        test = data[data["month_rent"]==-1]
        meanEncoder = MeanEncoder(categorical_features=["department","position","direction","district","station"],target_type = "regression",n_splits=10)
        new_train = meanEncoder.fit_transform(train.drop("month_rent",axis=1),train["month_rent"])
        new_test = meanEncoder.transform(test.drop("month_rent",axis=1))
        #
        new_train["month_rent"] = train["month_rent"].values
        new_test["month_rent"] =-1
        data = pd.concat([new_train,new_test[new_train.columns]],axis=0,ignore_index=True)
        print(data.columns)
        drop_cols = ["rent_num","square","distance","total_floor"]
        data = data.drop(drop_cols,axis=1)
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [45]:
if __name__ == '__main__':
    mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
department
0
1
2
3
4
position
0
1
2
3
4
direction
0
1
2
3
4
district
0
1
2
3
4
station
0
1
2
3
4
department_district_mean_target
0
1
2
3
4
station_position_mean_target
0
1
2
3
4
Index(['bath_bed_living', 'department', 'department_cnt',
       'department_district_mean_target', 'department_mean_target',
       'direction', 'direction_cnt', 'direction_mean_target', 'distance',
       'district', 'district_cnt', 'district_mean_target', 'floor',
       'floor_cnt', 'is_east

In [109]:
data = pd.read_pickle("../Feature/_featureEnineering_v1.2.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 41) (196539,) (56279, 41)


In [110]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2500,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.20)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.03013	valid's rmse: 6.18016
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.79524	valid's rmse: 5.94334
[3]	train's rmse: 5.57326	valid's rmse: 5.72007
[4]	train's rmse: 5.3647	valid's rmse: 5.50761
[5]	train's rmse: 5.16851	valid's rmse: 5.30818
[6]	train's rmse: 4.98432	valid's rmse: 5.12164
[7]	train's rmse: 4.80916	valid's rmse: 4.94526
[8]	train's rmse: 4.6447	valid's rmse: 4.78041
[9]	train's rmse: 4.4904	valid's rmse: 4.62629
[10]	train's rmse: 4.34534	valid's rmse: 4.47991
[11]	train's rmse: 4.20931	valid's rmse: 4.34094
[12]	train's rmse: 4.08235	valid's rmse: 4.21699
[13]	train's rmse: 3.96225	valid's rmse: 4.09584
[14]	train's rmse: 3.85114	valid's rmse: 3.98504
[15]	train's rmse: 3.74629	valid's rmse: 3.88025
[16]	train's rmse: 3.64925	valid's rmse: 3.78282
[17]	train's rmse: 3.55747	valid's rmse: 3.69267
[18]	train's rmse: 3.46956	valid's rmse: 3.60476
[19]	train's rmse: 3.38738	valid's rmse: 3.52433
[20]	train'

In [111]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_线下1.50_v2.6.csv",index=False,encoding="utf-8",sep=",")

#### 添加鑫爷的特征

In [12]:
data = pd.read_pickle("../Feature/_featureEnineering_v1.2.pickle")

In [11]:
def get_cv(data):
    num_feature = data.drop(columns=['direction']).columns
    data['new_con'] = data['department'].astype(str)
    # , '地铁线路', '地铁站点', '区', '位置'
    for i in ['direction', 'position', 'floor']:
        data['new_con'] = data['new_con'].astype(str)+'_'+data[i].astype(str)
    data['new_con'] = data['new_con'].apply(lambda x: ' '.join(x.split('_')))

    # print(len(data))
    total_feature = sparse.csr_matrix((len(data), 0))
    cv = CountVectorizer(min_df=22)
    # print(data['new_con'])
    for feature in ['new_con']:
        data[feature] = data[feature].astype(str)
        # print(data[feature])
        cv.fit(data[feature])
        total_feature = sparse.hstack((total_feature, cv.transform(data[feature].astype(str))), 'csr', 'bool')
    print('CountVectorizer_over!')
    total_feature = sparse.hstack((sparse.csr_matrix(data[num_feature].astype('float32')), total_feature),
                                  'csr').astype('float32')
    print(total_feature)
    return total_feature
