In [1]:
import warnings
warnings.simplefilter('ignore')

import re
import gc

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 1000)

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec

import lightgbm as lgb

In [2]:
train = pd.read_csv('raw_data/train.csv')

print(train.shape)
train.head()

(59288, 26)


Unnamed: 0,数据ID,容纳人数,便利设施,洗手间数量,床的数量,床的类型,卧室数量,取消条款,所在城市,清洁费,首次评论日期,房主是否有个人资料图片,房主身份是否验证,房主回复率,何时成为房主,是否支持随即预订,最近评论日期,维度,经度,民宿周边,评论个数,房产类型,民宿评分,房型,邮编,价格
0,train_0,4,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",1.5,3.0,4,2.0,0,3,0,2015-05-07,t,t,,2015-02-25,0,2016-06-26,34.109039,-118.27339,Los Feliz,12,17,97.0,0,90027,64.918531
1,train_1,2,"{TV,""Wireless Internet"",Kitchen,""Free parking ...",1.0,1.0,4,1.0,2,4,1,2016-07-02,t,t,,2009-10-27,1,2016-07-31,40.812897,-73.919163,Mott Haven,6,0,87.0,0,10454,54.918531
2,train_2,4,"{TV,""Air conditioning"",Kitchen,Heating,""Smoke ...",1.0,2.0,4,0.0,2,4,1,2017-07-01,t,f,100%,2017-06-29,1,2017-07-31,40.737643,-73.953309,Greenpoint,4,0,80.0,0,11222,73.219281
3,train_3,2,{},1.0,1.0,4,1.0,0,5,1,,t,t,,2013-03-19,0,,37.759935,-122.420558,Mission District,0,0,,1,94110,64.093909
4,train_4,3,"{Internet,""Wireless Internet"",""Air conditionin...",1.0,1.0,4,1.0,1,4,1,2014-04-30,t,t,100%,2011-07-30,0,2016-05-22,40.683363,-73.94949,Bedford-Stuyvesant,16,0,99.0,0,11216,68.454901


In [3]:
test = pd.read_csv('raw_data/test.csv')

print(test.shape)
test.head()

(14823, 25)


Unnamed: 0,数据ID,容纳人数,便利设施,洗手间数量,床的数量,床的类型,卧室数量,取消条款,所在城市,清洁费,首次评论日期,房主是否有个人资料图片,房主身份是否验证,房主回复率,何时成为房主,是否支持随即预订,最近评论日期,维度,经度,民宿周边,评论个数,房产类型,民宿评分,房型,邮编
0,test_0,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",1.5,1.0,4,1.0,2,1,1,2015-05-25,t,t,100%,2015-05-20,1,2017-01-01,41.849684,-87.67627,Pilsen,17,17,97.0,1,60608
1,test_1,2,"{TV,Internet,""Wireless Internet"",""Air conditio...",2.0,1.0,4,1.0,2,3,1,2015-11-09,t,t,100%,2015-09-08,0,2015-11-15,34.068613,-118.246455,Echo Park,2,0,100.0,0,90012
2,test_2,5,"{TV,""Cable TV"",""Wireless Internet"",""Air condit...",1.0,3.0,4,2.0,1,4,1,2017-05-15,t,t,100%,2017-05-06,1,2017-09-25,40.701958,-73.917352,Bushwick,25,0,88.0,0,11237
3,test_3,6,"{""Cable TV"",Internet,""Wireless Internet"",""Air ...",1.0,3.0,4,1.0,2,4,1,2012-11-12,t,t,70%,2009-02-06,0,2017-07-29,40.742959,-73.99082,Flatiron District,12,0,82.0,0,10010
4,test_4,2,"{Internet,""Wireless Internet"",""Air conditionin...",1.0,1.0,4,1.0,0,3,1,2017-02-17,t,t,100%,2015-10-20,0,2017-03-25,34.046473,-117.734095,,2,17,100.0,1,91766


In [4]:
df_features = pd.concat([train, test])

print(df_features.shape)

(74111, 26)


In [5]:
# 数据填充和清洗

df_features['洗手间数量'].fillna(-1, inplace=True)
df_features['床的数量'].fillna(-1, inplace=True)
df_features['卧室数量'].fillna(-1, inplace=True)
df_features['房主是否有个人资料图片'].fillna('na', inplace=True)
df_features['房主身份是否验证'].fillna('na', inplace=True)        # 与上面特征是一样的, 可以去掉
df_features['房主回复率'].fillna('-1', inplace=True)
df_features['房主回复率'] = df_features['房主回复率'].astype(str).apply(lambda x: x.replace('%', ''))
df_features['房主回复率'] = df_features['房主回复率'].astype(int)
df_features['民宿周边'].fillna('na', inplace=True)
mean_score = df_features['民宿评分'].mean()
df_features['民宿评分'].fillna(mean_score, inplace=True)
df_features['邮编'].fillna('na', inplace=True)

In [6]:
for feat in ['房主是否有个人资料图片', '房主身份是否验证', '民宿周边', '邮编']:
    lbl = LabelEncoder()
    lbl.fit(df_features[feat])
    df_features[feat] = lbl.transform(df_features[feat])

In [7]:
def freq_enc(df, col):
    vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
    df[f'{col}_freq'] = df[col].map(vc)
    return df

for feat in ['容纳人数', '洗手间数量', '床的数量', '床的类型',
             '卧室数量', '取消条款', '所在城市', '清洁费', 
             '房主是否有个人资料图片', '房主回复率', '是否支持随即预订',
             '民宿周边', '房产类型', '房型', '邮编']:
    df_features = freq_enc(df_features, feat)

In [8]:
# # Target Encoding

# def stat(df, df_merge, group_by, agg):
#     group = df.groupby(group_by).agg(agg)

#     columns = []
#     for on, methods in agg.items():
#         for method in methods:
#             columns.append('{}_{}_{}'.format('_'.join(group_by), on, method))
#     group.columns = columns
#     group.reset_index(inplace=True)
#     df_merge = df_merge.merge(group, on=group_by, how='left')

#     del (group)
#     gc.collect()
#     return df_merge
    

# def statis_feat(df_know, df_unknow):
#     df_unknow = stat(df_know, df_unknow, ['所在城市'], {'价格': ['mean']})
# #     df_unknow = stat(df_know, df_unknow, ['邮编'], {'价格': ['mean', 'std', 'max']})

#     return df_unknow
    
    

# # 5折交叉
# df_train = df_features[~df_features['价格'].isnull()]
# df_train = df_train.reset_index(drop=True)
# df_test = df_features[df_features['价格'].isnull()]

# df_stas_feat = None
# kf = KFold(n_splits=5, random_state=2021, shuffle=True)
# for train_index, val_index in kf.split(df_train):
#     df_fold_train = df_train.iloc[train_index]
#     df_fold_val = df_train.iloc[val_index]

#     df_fold_val = statis_feat(df_fold_train, df_fold_val)
#     df_stas_feat = pd.concat([df_stas_feat, df_fold_val], axis=0)

#     del(df_fold_train)
#     del(df_fold_val)
#     gc.collect()

# df_test = statis_feat(df_train, df_test)
# df_features = pd.concat([df_stas_feat, df_test], axis=0)

# del(df_stas_feat)
# del(df_train)
# del(df_test)
# gc.collect()

In [9]:
# 数值交叉特征

df_features['人均床数量'] = df_features['容纳人数'] / (df_features['床的数量'] + 1e-3)
df_features['人均卧室量'] = df_features['容纳人数'] / (df_features['卧室数量'] + 1e-3)
df_features['卧室床均量'] = df_features['床的数量'] / (df_features['卧室数量'] + 1e-3)
df_features['经纬度平方根'] = (df_features['维度']*df_features['维度'] + df_features['经度']*df_features['经度'])**.5

df_features['城市最大维度'] = df_features.groupby(['所在城市'])['维度'].transform('max')
df_features['城市最小维度'] = df_features.groupby(['所在城市'])['维度'].transform('min')
df_features['城市维度跨度'] = df_features['城市最大维度'] - df_features['城市最小维度']
df_features['城市最大经度'] = df_features.groupby(['所在城市'])['经度'].transform('max')
df_features['城市最小经度'] = df_features.groupby(['所在城市'])['经度'].transform('min')
df_features['城市经度跨度'] = df_features['城市最大经度'] - df_features['城市最小经度']
df_features['城市面积'] = df_features['城市维度跨度'] * df_features['城市经度跨度']
df_features['城市总房数'] = df_features.groupby(['所在城市'])['数据ID'].transform('count')
df_features['城市房密度'] = df_features['城市面积'] / df_features['城市总房数']
df_features.drop(['城市最大维度', '城市最小维度', '城市最大经度', '城市最小经度', '城市总房数'], axis=1, inplace=True)

In [10]:
# 时间特征处理

df_features['首次评论日期'] = pd.to_datetime(df_features['首次评论日期']).values.astype(np.int64) // 10 ** 9
df_features['何时成为房主'] = pd.to_datetime(df_features['何时成为房主']).values.astype(np.int64) // 10 ** 9
df_features['最近评论日期'] = pd.to_datetime(df_features['最近评论日期']).values.astype(np.int64) // 10 ** 9

df_features['timestamp_diff1'] = df_features['首次评论日期'] - df_features['何时成为房主']
df_features['timestamp_diff2'] = df_features['最近评论日期'] - df_features['首次评论日期']
df_features['timestamp_diff3'] = df_features['最近评论日期'] - df_features['何时成为房主']

In [11]:
# 统计特征

def brute_force(df, features, groups):
    for method in tqdm(['max', 'min', 'mean', 'std']):
        for feature in features:
            for group in groups:
                df[f'{group}_{feature}_{method}'] = df.groupby(group)[feature].transform(method)
                
    return df

dense_feats = ['timestamp_diff1', 'timestamp_diff2', 'timestamp_diff3']
cate_feats  = ['房型']

df_features = brute_force(df_features, dense_feats, cate_feats)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [12]:
# TF-IDF + SVD encoding

n_components = 12

df_features['便利设施'] = df_features['便利设施'].apply(
    lambda x: x.replace('{', '').replace('}', '').replace('"', '').replace(':', '').replace(',', ' '))

X = list(df_features['便利设施'].values)
tfv = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
tfv.fit(X)
X_tfidf = tfv.transform(X)
svd = TruncatedSVD(n_components=n_components)
svd.fit(X_tfidf)
X_svd = svd.transform(X_tfidf)

for i in range(n_components):
    df_features[f'便利设施_tfidf_{i}'] = X_svd[:, i]
    
df_features.head()

Unnamed: 0,数据ID,容纳人数,便利设施,洗手间数量,床的数量,床的类型,卧室数量,取消条款,所在城市,清洁费,首次评论日期,房主是否有个人资料图片,房主身份是否验证,房主回复率,何时成为房主,是否支持随即预订,最近评论日期,维度,经度,民宿周边,评论个数,房产类型,民宿评分,房型,邮编,价格,容纳人数_freq,洗手间数量_freq,床的数量_freq,床的类型_freq,卧室数量_freq,取消条款_freq,所在城市_freq,清洁费_freq,房主是否有个人资料图片_freq,房主回复率_freq,是否支持随即预订_freq,民宿周边_freq,房产类型_freq,房型_freq,邮编_freq,人均床数量,人均卧室量,卧室床均量,经纬度平方根,城市维度跨度,城市经度跨度,城市面积,城市房密度,timestamp_diff1,timestamp_diff2,timestamp_diff3,房型_timestamp_diff1_max,房型_timestamp_diff2_max,房型_timestamp_diff3_max,房型_timestamp_diff1_min,房型_timestamp_diff2_min,房型_timestamp_diff3_min,房型_timestamp_diff1_mean,房型_timestamp_diff2_mean,房型_timestamp_diff3_mean,房型_timestamp_diff1_std,房型_timestamp_diff2_std,房型_timestamp_diff3_std,便利设施_tfidf_0,便利设施_tfidf_1,便利设施_tfidf_2,便利设施_tfidf_3,便利设施_tfidf_4,便利设施_tfidf_5,便利设施_tfidf_6,便利设施_tfidf_7,便利设施_tfidf_8,便利设施_tfidf_9,便利设施_tfidf_10,便利设施_tfidf_11
0,train_0,4,TV Cable TV Internet Wireless Internet Air con...,1.5,3.0,4,2.0,0,3,0,1430956800,2,2,-1,1424822400,0,1466899200,34.109039,-118.27339,323,12,17,97.0,0,454,64.918531,0.16281,0.051288,0.086924,0.971894,0.153162,0.304206,0.302964,0.265925,0.994414,0.246913,0.737542,0.002672,0.222787,0.557407,0.005411,1.332889,1.999,1.49925,123.093547,1.393795,1.255555,1.749986,7.8e-05,6134400,35942400,42076800,10729756037,10729756037,10730188037,-10730188037,0,-10730188037,-1961812000.0,35824310.0,-1925987000.0,4222271000.0,226707000.0,4237623000.0,0.485278,0.34506,-0.048676,-0.26694,-0.294044,0.162518,-0.115719,-0.066753,0.012276,0.114879,0.188774,0.141827
1,train_1,2,TV Wireless Internet Kitchen Free parking on p...,1.0,1.0,4,1.0,2,4,1,1467417600,2,2,-1,1256601600,1,1469923200,40.812897,-73.919163,371,6,0,87.0,0,150,54.918531,0.429815,0.783946,0.60914,0.971894,0.671749,0.436831,0.436494,0.734075,0.994414,0.246913,0.262458,0.000594,0.661211,0.557407,0.000337,1.998002,1.998002,0.999001,84.437759,0.40938,0.543143,0.222352,7e-06,210816000,2505600,213321600,10729756037,10729756037,10730188037,-10730188037,0,-10730188037,-1961812000.0,35824310.0,-1925987000.0,4222271000.0,226707000.0,4237623000.0,0.65459,0.014672,-0.260082,-0.125328,-0.065055,-0.054602,0.17541,-0.184152,0.017464,-0.022459,-0.174326,0.004266
2,train_2,4,TV Air conditioning Kitchen Heating Smoke dete...,1.0,2.0,4,0.0,2,4,1,1498867200,2,0,100,1498694400,1,1501459200,40.737643,-73.953309,238,4,0,80.0,0,231,73.219281,0.16281,0.783946,0.225392,0.971894,0.090607,0.436831,0.436494,0.734075,0.994414,0.583638,0.262458,0.009783,0.661211,0.557407,0.009958,1.999,4000.0,2000.0,84.431318,0.40938,0.543143,0.222352,7e-06,172800,2592000,2764800,10729756037,10729756037,10730188037,-10730188037,0,-10730188037,-1961812000.0,35824310.0,-1925987000.0,4222271000.0,226707000.0,4237623000.0,0.374496,-0.068823,-0.017293,-0.02166,0.205175,0.184736,0.317999,-0.11052,-0.015582,0.102508,0.10009,-0.081543
3,train_3,2,,1.0,1.0,4,1.0,0,5,1,-9223372037,2,2,-1,1363651200,0,-9223372037,37.759935,-122.420558,356,0,0,94.067365,1,739,64.093909,0.429815,0.783946,0.60914,0.971894,0.671749,0.304206,0.086816,0.734075,0.994414,0.246913,0.737542,0.010579,0.661211,0.413407,0.013331,1.998002,1.998002,0.999001,128.111692,0.121451,0.146278,0.017766,3e-06,-10587023237,0,-10587023237,10729064837,10730188037,10730188037,-10730447237,0,-10730447237,-2466029000.0,32908980.0,-2433120000.0,4600638000.0,261981700.0,4611573000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,train_4,3,Internet Wireless Internet Air conditioning Ki...,1.0,1.0,4,1.0,1,4,1,1398816000,2,2,100,1311984000,0,1463875200,40.683363,-73.94949,44,16,0,99.0,0,220,68.454901,0.105167,0.783946,0.60914,0.971894,0.671749,0.257222,0.436494,0.734075,0.994414,0.583638,0.737542,0.029226,0.661211,0.557407,0.008231,2.997003,2.997003,0.999001,84.401796,0.40938,0.543143,0.222352,7e-06,86832000,65059200,151891200,10729756037,10729756037,10730188037,-10730188037,0,-10730188037,-1961812000.0,35824310.0,-1925987000.0,4222271000.0,226707000.0,4237623000.0,0.451925,0.36969,-0.052787,-0.329882,-0.366512,0.120876,-0.102477,0.103181,-0.026274,-0.18992,-0.063034,-0.072947


In [13]:
# emb_size = 2
# # sentences = df_features['便利设施'].str.lower().values.tolist()
# sentences = df_features['便利设施'].values.tolist()

# words = []
# for i in range(len(sentences)):
#     sentences[i] = sentences[i].split()
#     words += sentences[i]
    
# words = list(set(words))

# model = Word2Vec(sentences, size=emb_size, window=3,
#                  min_count=1, sg=0, hs=1, seed=2021)

# emb_matrix_mean = []
# emb_matrix_max = []

# for seq in sentences:
#     vec = []
#     for w in seq:
#         if w in model:
#             vec.append(model[w])
#     if len(vec) > 0:
#         emb_matrix_mean.append(np.mean(vec, axis=0))
#         emb_matrix_max.append(np.max(vec, axis=0))
#     else:
#         emb_matrix_mean.append([0] * emb_size)
#         emb_matrix_max.append([0] * emb_size)

# df_emb_mean = pd.DataFrame(emb_matrix_mean)
# df_emb_mean.columns = ['便利设施_w2v_mean_{}'.format(
#     i) for i in range(emb_size)]

# df_emb_max = pd.DataFrame(emb_matrix_max)
# df_emb_max.columns = ['便利设施_w2v_max_{}'.format(
#     i) for i in range(emb_size)]

# for i in range(emb_size):
#     df_features[f'便利设施_w2v_mean_{i}'] = df_emb_mean[f'便利设施_w2v_mean_{i}']
#     df_features[f'便利设施_w2v_max_{i}'] = df_emb_max[f'便利设施_w2v_max_{i}']

# df_features.head()

In [14]:
df_features.drop(['房主身份是否验证', '便利设施', '首次评论日期', '何时成为房主',
                  '最近评论日期'], axis=1, inplace=True)
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74111 entries, 0 to 14822
Data columns (total 71 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   数据ID                     74111 non-null  object 
 1   容纳人数                     74111 non-null  int64  
 2   洗手间数量                    74111 non-null  float64
 3   床的数量                     74111 non-null  float64
 4   床的类型                     74111 non-null  int64  
 5   卧室数量                     74111 non-null  float64
 6   取消条款                     74111 non-null  int64  
 7   所在城市                     74111 non-null  int64  
 8   清洁费                      74111 non-null  int64  
 9   房主是否有个人资料图片              74111 non-null  int64  
 10  房主回复率                    74111 non-null  int64  
 11  是否支持随即预订                 74111 non-null  int64  
 12  维度                       74111 non-null  float64
 13  经度                       74111 non-null  float64
 14  民宿周边                  

In [15]:
df_test = df_features[df_features['价格'].isnull()].copy()
df_train = df_features[df_features['价格'].notnull()].copy()

print(df_train.shape, df_test.shape)

(59288, 71) (14823, 71)


In [16]:
bad_feats = ['房型_timestamp_diff3_std',
 '房型_timestamp_diff3_mean',
 '房型_timestamp_diff2_min',
 '房型_timestamp_diff1_std',
 '房型_timestamp_diff1_mean']

# bad_feats = []

In [17]:
ycol = '价格'
feature_names = list(
    filter(lambda x: x not in [ycol, '数据ID'] + bad_feats, df_train.columns))

model = lgb.LGBMRegressor(num_leaves=64,
                          max_depth=6,
                          learning_rate=0.1,
                          n_estimators=10000,
                          subsample=0.8,
                          feature_fraction=0.8,
                          reg_alpha=0.5,
                          reg_lambda=0.5,
                          random_state=2021,
                          importance_type='gain',
                          metric=None
                          )


oof = []
prediction = df_test[['数据ID']]
prediction[ycol] = 0
df_importance_list = []

kfold = KFold(n_splits=10, shuffle=True, random_state=2021)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='rmse',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = df_train.iloc[val_idx][['数据ID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction['价格'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 50 rounds
[500]	train's rmse: 4.12885	train's l2: 17.0474	valid's rmse: 5.56874	valid's l2: 31.0108
Early stopping, best iteration is:
[552]	train's rmse: 4.03646	train's l2: 16.293	valid's rmse: 5.56289	valid's l2: 30.9458


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[363]	train's rmse: 4.40172	train's l2: 19.3752	valid's rmse: 5.52486	valid's l2: 30.5241


Training until validation scores don't improve for 50 rounds
[500]	train's rmse: 4.11544	train's l2: 16.9368	valid's rmse: 5.41944	valid's l2: 29.3703
Early stopping, best iteration is:
[453]	train's rmse: 4.20266	train's l2: 17.6623	valid's rmse: 5.41659	valid's l2: 29.3395


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[430]	train's rmse: 4.27284	train's l2: 18.2571	valid's rmse: 5.34117	valid's l2: 28.5281


Training until validation scores don't improve for 50 rounds
Early 

In [18]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,房型,9227175.0
1,洗手间数量,2136790.0
2,容纳人数,1200462.0
3,房型_timestamp_diff1_max,1128067.0
4,经度,1030693.0
5,邮编,972777.8
6,卧室数量,923184.3
7,经纬度平方根,678408.3
8,维度,675195.9
9,洗手间数量_freq,671329.0


In [19]:
df_importance[df_importance.importance == 0]['column'].values.tolist()

[]

In [20]:
df_oof = pd.concat(oof)
rmse = mean_squared_error(df_oof[ycol], df_oof['pred'], squared=False)
print('rmse:', rmse)

rmse: 5.43123935804306


In [21]:
sub = prediction.copy(deep=True)
sub.to_csv(f'sub_{rmse}.csv', index=False, encoding='utf-8')