In [140]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import copy
warnings.filterwarnings('ignore')
from matplotlib import style
style.use("ggplot")
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder 

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.feature_selection import chi2, SelectPercentile,f_classif
import lightgbm as lgb

In [379]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from itertools import product
 
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg({'mean': 'mean', 'beta': 'size'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new


In [288]:
def readData():
    train = pd.read_csv("../Data/train.csv")
    test = pd.read_csv("../Data/test.csv")
    print(train.shape,test.shape)
    columns = ["time","department","rent_num","floor","total_floor","square","direction","living_state","num_bedroom",
          "num_living_room","num_bath_room","rent_type","district","position","metro_line","station","distance","decoration","month_rent"]
    #------------------------------
    train.columns = columns
    test.columns = ["id"] + columns[:-1]
    #-------------------------------
    train = train.drop("time",axis=1).reset_index()
    train = train.rename(columns = {"index":"id"})
    test["month_rent"] = -1
    data = pd.concat([train,test[train.columns]],axis=0)
    data = data.reset_index(drop=True)
    return data

In [316]:
data = readData()

(196539, 19) (56279, 19)


In [317]:
data.isnull().sum(axis=0)

id                      0
department              0
rent_num             1023
floor                   0
total_floor             0
square                  0
direction               0
living_state       228197
num_bedroom             0
num_living_room         0
num_bath_room           0
rent_type          223617
district               41
position               41
metro_line         134546
station            134546
distance           134546
decoration         230119
month_rent              0
dtype: int64

In [325]:
def basicClean(df=None,is_del=True):
    data = copy.deepcopy(df)
    print("handle the null feature")
    if is_del:
        data = data.drop(["living_state","rent_type","decoration"],axis=1)
    else:
        pass
    #-----异常值清洗square
    data.loc[data["square"]>=0.1,"square"] = data.loc[data["square"]<0.1,"square"].mean()
    
    #--------空值处理-------------
    rent_median = data["rent_num"].median()
    print(rent_median)
    data["rent_num"] = data["rent_num"].fillna(data["rent_num"].median())
    data["distance"] = data["distance"].fillna(0)
    data["metro_line"] = data["metro_line"].fillna("none")
    data["station"] = data["station"].fillna("none")
    data["district"]  =data["district"].fillna("none")
    data["position"]  =data["position"].fillna("none")
    #-----log变换-----------
    log_cols = ["rent_num","square","distance","total_floor"]
    print(data.dtypes)
    for i in log_cols:
        data['log_'+i] = data[i].map(lambda x:np.log(x+0.0000001))
    #顺序变化
    enc_cols =["department","direction","position","station","metro_line","district"]
    for i in enc_cols:
        data[i] = pd.factorize(data[i])[0]
    return data  

In [326]:
def getDirection(df):
    '''
    将朝向拆开
    '''
    data = copy.deepcopy(df)
    for i,j in zip(['东南', '东', '西北', '西南', '北', '南', '西', '东北'],
     ["east_south","east","west_north","west_south","north","south","west","east_north"]):
        print(i)
        data["is_"+j] = data["direction"].map(lambda x:1 if i in x else 0)
    columns = list(filter(lambda x:"is_" in x,data.columns))
    return data[columns]

In [327]:
def cleanRoomNum(df):
    '''
    清洗客房、卧室、卫的数量
    '''
    #卫数
    data["num_bath_room"] = data["num_bath_room"].map(lambda x:3 if x not in [1,2] else x) #3代表除了卫生间有1、2个
    data["num_bedroom"] = data["num_bedroom"].map(lambda x: 5 if x not in [1,2,3,4] else x) #5代表5个卧室以上
    data["num_living_room"] = data["num_living_room"].map(lambda x: 3 if x not in [0,1,2] else x)
    
    #组合，即xx 房 xx厅 xx卫
    data["bath_bed_living"] = data["num_bath_room"].astype("str") + "_" +  data["num_bedroom"].astype("str") + "_" + data["num_living_room"].astype("str")
    data["bath_bed_living"] = pd.factorize(data["bath_bed_living"])[0]
    columns = ["num_bath_room","num_bedroom","num_living_room","bath_bed_living"]
    return data[columns]

##### 反解密

In [249]:
def getTotalFloor(total_floor):
    if str(total_floor)=='nan':
        return np.nan
    else:
        return round((total_floor - 0.0) / 0.018181818181818188,2)+ 1

In [250]:
def getRentNum(rent_num):
    if str(rent_num)=='nan':
        return np.nan
    else:
        return round((rent_num - 0.0078125)/0.00390625,2) + 1

In [257]:
def countFeature(df=None):
    data = copy.deepcopy(df)
    columns = ["month_rent","department","district","position","direction"]
    data = data[columns]
    for i in columns[1:]:
        tmp = data.groupby(i,as_index=False)["month_rent"].count().rename(columns = {"month_rent":"{0}_cnt".format(i)})
        data = data.merge(tmp,how="left",on=i)
    cnt_cols = list(filter(lambda x:"cnt" in x,data.columns))
    return data[cnt_cols]

In [335]:
def mergeFeature():
    save_path= "../Feature/_baseline_feat_v1.2.pickle"
    if os.path.exists(save_path):
        print(save_path,"已经存在")
    else:
        data = readData()
        direction = getDirection(data) #one-hot
        data = basicClean(data)
        RoomNum = cleanRoomNum(data)
        count_fea = countFeature(data)
        #合并特征
        data = data.drop(["id","num_bath_room","num_bedroom","num_living_room"],axis=1)
        data = pd.concat([data,direction],axis=1)
        data = pd.concat([data,RoomNum],axis=1)
        data = pd.concat([data,count_fea],axis=1)
#         #反解密
#         data["total_floor"] = data["total_floor"].map(getTotalFloor)
#         data["rent_num"] = data["rent_num"].map(getRentNum) 
        drop_cols = ["rent_num","square","distance","total_floor"]
        data = data.drop(drop_cols,axis=1)
        data.to_pickle(save_path)
        print(data.shape)
        print("保存在",save_path)

In [336]:
if __name__ == '__main__':
    mergeFeature()

(196539, 19) (56279, 19)
东南
东
西北
西南
北
南
西
东北
handle the null feature
0.08203125
id                   int64
department           int64
rent_num           float64
floor                int64
total_floor        float64
square             float64
direction           object
num_bedroom          int64
num_living_room      int64
num_bath_room        int64
district            object
position            object
metro_line          object
station             object
distance           float64
month_rent         float64
dtype: object
(252818, 27)
保存在 ../Feature/_baseline_feat_v1.2.pickle


In [375]:
data = pd.read_pickle("../Feature/_baseline_feat_v1.2.pickle")

In [376]:
data.dtypes

department           int64
floor                int64
direction            int64
district             int64
position             int64
metro_line           int64
station              int64
month_rent         float64
log_rent_num       float64
log_square         float64
log_distance       float64
log_total_floor    float64
is_east_south        int64
is_east              int64
is_west_north        int64
is_west_south        int64
is_north             int64
is_south             int64
is_west              int64
is_east_north        int64
num_bath_room        int64
num_bedroom          int64
num_living_room      int64
bath_bed_living      int64
depart_cnt           int64
district_cnt         int64
position_cnt         int64
dtype: object

##### 建模

In [341]:
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
X = train.drop(["month_rent"],axis=1).values
y = train["month_rent"].values
X_test = test.drop(["month_rent"],axis=1).values
cols = test.drop(["month_rent"],axis=1).columns

print(X.shape,y.shape,X_test.shape)

(196539, 26) (196539,) (56279, 26)


In [342]:
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
    model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
                              learning_rate=0.05, n_estimators=2000,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric='rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.12972	valid's rmse: 6.28328
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 6.07982	valid's rmse: 6.23566
[3]	train's rmse: 5.98858	valid's rmse: 6.14663
[4]	train's rmse: 5.95303	valid's rmse: 6.11184
[5]	train's rmse: 5.81723	valid's rmse: 5.97147
[6]	train's rmse: 5.6538	valid's rmse: 5.80161
[7]	train's rmse: 5.55141	valid's rmse: 5.70054
[8]	train's rmse: 5.49944	valid's rmse: 5.65108
[9]	train's rmse: 5.42354	valid's rmse: 5.57517
[10]	train's rmse: 5.33456	valid's rmse: 5.48748
[11]	train's rmse: 5.2404	valid's rmse: 5.39431
[12]	train's rmse: 5.15426	valid's rmse: 5.30949
[13]	train's rmse: 5.07268	valid's rmse: 5.22946
[14]	train's rmse: 5.00669	valid's rmse: 5.16618
[15]	train's rmse: 4.95115	valid's rmse: 5.11094
[16]	train's rmse: 4.83563	valid's rmse: 4.99214
[17]	train's rmse: 4.78507	valid's rmse: 4.94296
[18]	train's rmse: 4.73214	valid's rmse: 4.89203
[19]	train's rmse: 4.64161	valid's rmse: 4.79756
[20]	train

In [343]:
submission= pd.DataFrame(predict.mean(axis=0))

In [344]:
submission.shape,test.shape

((56279, 1), (56279, 27))

In [345]:
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_v1.5.csv",index=False,encoding="utf-8",sep=",")

In [346]:
submission["price"].mean()

8.058772466192028

In [386]:
train.nunique()

department         5547
floor                 3
direction            64
district             15
position            153
metro_line            6
station             119
month_rent          419
log_rent_num        154
log_square         7840
log_distance       1156
log_total_floor      55
is_east_south         2
is_east               2
is_west_north         2
is_west_south         2
is_north              2
is_south              2
is_west               2
is_east_north         2
num_bath_room         3
num_bedroom           5
num_living_room       4
bath_bed_living      56
depart_cnt          416
district_cnt         15
position_cnt        148
dtype: int64

##### 平均值编码

In [387]:
data = pd.read_pickle("../Feature/_baseline_feat_v1.2.pickle")
train = data[data["month_rent"]!=-1]
test = data[data["month_rent"]==-1]
meanEncoder = MeanEncoder(categorical_features=["department","position","direction","district","station"],target_type = "regression",n_splits=10)
new_train = meanEncoder.fit_transform(train.drop("month_rent",axis=1),train["month_rent"])
new_test = meanEncoder.transform(test.drop("month_rent",axis=1))

In [403]:
X = new_train.values
y = train["month_rent"].values
X_test = new_test.values
cols = new_train.columns
print(X.shape,y.shape,X_test.shape)

(196539, 31) (196539,) (56279, 31)


In [404]:
def rmse(y_true, predict):
    ture_y = copy.deepcopy(y_true)
    predict = copy.deepcopy(predict)
    ture_y = np.exp(ture_y)
    predict = np.exp(predict)
    score = np.sqrt(mean_squared_error(ture_y,predict))
    return 'rmse', score, True

In [408]:
from sklearn.cross_validation import StratifiedKFold
seed_ls = []
# 五折交叉训练，构造五个模型
skf=list(StratifiedKFold(y, n_folds=5, shuffle=True, random_state=1024))
baseloss = []
loss = 0
for i, (train_index, test_index) in enumerate(skf):
    print("Fold", i)
    #------------------------------------#
    # 模型部分
    
#     model = lgb.LGBMRegressor(objective='regression',num_leaves=60,
#                               learning_rate=0.05, n_estimators=2000,
#                               max_bin = 55, bagging_fraction = 0.8,
#                               bagging_freq = 5, feature_fraction = 0.2319,
#                               feature_fraction_seed=9, bagging_seed=9,
#                               min_data_in_leaf =6, min_sum_hessian_in_leaf = 11,random_seed=2018)
#logloss: [1.9440240545361742, 1.940272674000025, 1.9222839817175252, 1.901934152677352, 1.8703667334744163] 1.9157763192810986
    model = lgb.LGBMRegressor(objective='regression',num_leaves=125,
                              learning_rate=0.05, n_estimators=2000,boosting_type="gbdt",max_depth=-1,
                             seed=2018,num_thread=-1,max_bin=425,bagging_fraction=0.8,colsample_bytree=0.9,subsample=0.8,lambda_l2=0.30)
    #
    #------------------------------------#
    lgb_model = model.fit(X[train_index], y[train_index],
                          eval_names =['train','valid'],
                          eval_metric= 'rmse',
                          eval_set=[(X[train_index], y[train_index]), 
                                    (X[test_index], y[test_index])],early_stopping_rounds=100)
    baseloss.append(lgb_model.best_score_['valid']['rmse'])
    loss += lgb_model.best_score_['valid']['rmse']
    test_pred= lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration_)
    if i==0:
        predict = test_pred
    else:
        predict = np.vstack((predict,test_pred))
print('logloss:', baseloss, loss/5)

Fold 0
[1]	train's rmse: 6.02999	valid's rmse: 6.18043
Training until validation scores don't improve for 100 rounds.
[2]	train's rmse: 5.79548	valid's rmse: 5.94479
[3]	train's rmse: 5.57453	valid's rmse: 5.72124
[4]	train's rmse: 5.36701	valid's rmse: 5.51112
[5]	train's rmse: 5.16917	valid's rmse: 5.31192
[6]	train's rmse: 4.98638	valid's rmse: 5.12738
[7]	train's rmse: 4.81128	valid's rmse: 4.95026
[8]	train's rmse: 4.64767	valid's rmse: 4.7864
[9]	train's rmse: 4.49389	valid's rmse: 4.63416
[10]	train's rmse: 4.3579	valid's rmse: 4.49911
[11]	train's rmse: 4.22227	valid's rmse: 4.3625
[12]	train's rmse: 4.09894	valid's rmse: 4.23839
[13]	train's rmse: 3.98029	valid's rmse: 4.11949
[14]	train's rmse: 3.86733	valid's rmse: 4.00776
[15]	train's rmse: 3.76131	valid's rmse: 3.90383
[16]	train's rmse: 3.66203	valid's rmse: 3.80353
[17]	train's rmse: 3.57216	valid's rmse: 3.71409
[18]	train's rmse: 3.48458	valid's rmse: 3.6268
[19]	train's rmse: 3.4045	valid's rmse: 3.5469
[20]	train's r

In [409]:
submission= pd.DataFrame(predict.mean(axis=0))
test = pd.read_csv("../Data/test.csv")
submission["id"] = test["id"].values
submission.columns=["price","id"]
submission[["id","price"]].to_csv("../Result/_baseline_v1.7.csv",index=False,encoding="utf-8",sep=",")

In [392]:
submission["price"].mean()

8.064075162967665