## 预处理部分

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

# 导入数据
train_data = pd.read_csv('train.csv',encoding="utf-8")
train_data=pd.DataFrame(train_data)
#-------------------预处理1---------------------
# 清除离群点
train_data=train_data[train_data.space<0.4]
# 纠偏
train_data["rent"] = np.log1p(train_data["rent"])

# ##提取小区平均租金
#按小区名分组
rent_mean = train_data.groupby('Cname')['rent'].mean()
Cname_rent_mean = pd.DataFrame(train_data.groupby('Cname')['rent'].mean()).mean()
dict_Cname = rent_mean.to_dict()
#按行政区分组
rent_mean = train_data.groupby('area')['rent'].mean()
area_rent_mean = pd.DataFrame(train_data.groupby('area')['rent'].mean()).mean()
dict_area = rent_mean.to_dict()
#按商圈分组
rent_mean = train_data.groupby('position')['rent'].mean()
position_rent_mean = pd.DataFrame(train_data.groupby('position')['rent'].mean()).mean()
dict_position = rent_mean.to_dict()
#-------------------合并---------------------
# 导入测试集，提取id
test_data = pd.read_csv('test.csv',encoding="utf-8")
test_data=pd.DataFrame(test_data)
test_ID=test_data["id"]
test_data.drop("id",axis=1,inplace=True)
# 训练集测试集合并
ntrain = train_data.shape[0]  #训练集样本数
ntest = test_data.shape[0]  #测试集样本数
y_train = train_data.rent.values # 提取训练集rent列
all_data = pd.concat((train_data, test_data)).reset_index(drop=True)
all_data.drop(['rent'], axis=1, inplace=True)
#-------------------预处理2---------------------
# 填补缺失值
# 小区租房数量（平均数）
mean_val = all_data["rent_quantity"].mean()
all_data["rent_quantity"] = all_data["rent_quantity"].fillna(mean_val)
# 区、位置（众数）
mode_area = all_data["area"].mode()
mode_position = all_data["position"].mode()
all_data["area"] = all_data["area"].fillna(int(mode_area))
all_data["position"] = all_data["position"].fillna(int(mode_position))
# 居室数量(相加、比例)
all_data['total_num']= all_data['bedroom_num']+all_data['hall_num']+all_data['toilet_num']
all_data['bedroom_ratio'] = all_data['bedroom_num']/all_data['total_num']
all_data['hall_ratio'] = all_data['hall_num']/all_data['total_num']
all_data['toilet_ratio'] = all_data['toilet_num']/all_data['total_num']
# 地铁
all_data["distance"] = all_data["distance"].fillna(2)
all_data["subway_station"] = all_data["subway_station"].fillna(-1)
all_data["subway_line"] = all_data["subway_line"].fillna(-1)

# 居住状态、装修情况
all_data['state']=all_data['state'].apply(lambda x:1 if x>0 else 0)#是否居住
all_data['decoration situation']=all_data['decoration situation'].apply(lambda x : 0 if x != x else x)

# 房屋朝向
all_data['east']=all_data['orientation'].apply(lambda x:1 if ("东" in x.split(' ')) else 0)
all_data['south']=all_data['orientation'].apply(lambda x:1 if ("南" in x.split(' ')) else 0)
all_data['west']=all_data['orientation'].apply(lambda x:1 if ("西" in x.split(' ')) else 0)
all_data['north']=all_data['orientation'].apply(lambda x:1 if ("北" in x.split(' ')) else 0)
all_data['north-east']=all_data['orientation'].apply(lambda x:1 if ("东北" in x.split(' ')) else 0)
all_data['south-east']=all_data['orientation'].apply(lambda x:1 if ("东南" in x.split(' ')) else 0)
all_data['north-west']=all_data['orientation'].apply(lambda x:1 if ("西北" in x.split(' ')) else 0)
all_data['south-west']=all_data['orientation'].apply(lambda x:1 if ("西南" in x.split(' ')) else 0)

# 添加对应平均租金
all_data['avgrent_Cname']=all_data['Cname'].apply(lambda x:dict_Cname[x] if(x in dict_Cname.keys()) else Cname_rent_mean)
all_data['avgrent_area']=all_data['area'].apply(lambda x:dict_area[x] if(x in dict_area.keys()) else area_rent_mean)
all_data['avgrent_position']=all_data['position'].apply(lambda x:dict_position[x] if(x in dict_position.keys()) else position_rent_mean)

#删除列，转换类型
all_data["avgrent_Cname"]=all_data["avgrent_Cname"].astype('float')
all_data.drop(["rent_style","orientation","bedroom_num","avgrent_area"],axis = 1,inplace=True)
colunms = ['time', 'Cname', 'state', 'area', 'position','east','south','west','north','north-east','south-east','north-west','south-west','subway_station','subway_line']
for i in colunms:
    all_data[i] = all_data[i].astype(str)
#print(all_data.isnull().sum())
#print(all_data.dtypes)
all_data = np.array(all_data)


In [2]:
#-------------------得到数据集---------------------
# 分开训练集测试集数据
train = all_data[:ntrain]
test = all_data[ntrain:]
# y_train 为训练集目标

### 建模

In [3]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV  # 网格搜索和随机搜索
from heamy.dataset import Dataset

from heamy.dataset import Dataset
from heamy.estimator import  Classifier
from heamy.pipeline import ModelsPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [5]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=600,
                              learning_rate=0.05, n_estimators=5000,
                              max_bin = 150, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 1,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

model_lgb.fit(train, y_train)
#预测
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test))
#结果
print(rmsle(y_train, lgb_train_pred))
print(lgb_pred)

0.03725892229100735
[ 4.23882337  6.06402787 13.10662523 ...  7.63340063 12.79513437
  4.67257814]


In [6]:
sub = pd.DataFrame()
sub['id'] = test_ID
sub['price'] = lgb_pred
sub.to_csv(r'submission1.csv',index=False)

### 随机森林筛选变量

#这部分先不管！！！！！！！！！！！！

In [143]:
clf = RandomForestClassifier()
temp_x = train
clf.fit(temp_x, y_train.astype('int'))
importance = clf.feature_importances_
indices = np.argsort(importance)[::-1]
features = temp_x.columns
for f in range(train.shape[1]):
    print(("%2d) %-*s %f" % (f + 1, 30, features[f], importance[indices[f]])))

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

## gridsearch

In [16]:
param_test = {
        'max_depth': range(5,15,2),
        'num_leaves': range(10,200,5),
    }
estimator = lgb.LGBMRegressor(objective='regression',num_leaves=110,max_depth = 13,
                              min_child_weight = 1,learning_rate=0.05, n_estimators=800,
                              max_bin = 150, bagging_fraction = 0.8,
                              subsample = 0.8,colsample_bytree=0.8,nthread = 7,
                              bagging_freq = 5, feature_fraction = 1,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
gsearch = GridSearchCV( estimator , param_grid = param_test, scoring='roc_auc', cv=5 )
gsearch.fit(train, y_train)
gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_

ValueError: continuous format is not supported

### 模型融合

In [19]:
#创建数据集
dataset = Dataset(train,y_train,test)#对无标签训练集进行预测时将X_test替换为X_n
model_xgb = Classifier(dataset=dataset, estimator=XGBClassifier,parameters={
    'colsample_bytree':0.4603,'gamma':0.0468,'learning_rate':0.05,'max_depth':3,'min_child_weight':0.05,
                                                                              'reg_alpha':0.4640,
                                                                              'n_estimators':2200,
                                                                              'objective':'binary:logistic',
                                                                               'seed': 32,
                                                                              'gamma':0.4,
                                                                              'colsample_bytree':0.75,'random_state':7,
                                                                              'subsample':0.5213,}, name='xgb')

model_lgb = Classifier(dataset=dataset, estimator=lgb.LGBMClassifier,parameters={
                                                                                'reg_lambda':0.002,
                                                                                'max_bin':150,
                                                                                'max_depth':13,
                                                                                'min_child_weight':1,
                                                                                'learning_rate':0.05,
                                                                                'num_leaves':600,
                                                                                'bagging_seed':9,
                                                                                'n_estimators':900,
                                                                                'boosting_type':'gbdt',
                                                                                'reg_alpha':0.001,
                                                                                'colsample_bytree':0.5,
                                                                                'min_child_samples':24,}, name='lgb')


NameError: name 'Dataset' is not defined

In [None]:
pipeline = ModelsPipeline(model_lgb,model_xgb)
stack_ds = pipeline.stack(k=10,seed=111)
stacker = Classifier(dataset=stack_ds, estimator=LogisticRegression)
stacker.validate(k=10,scorer=roc_auc_score)
results = stacker.predict()

### xgboost

In [11]:
# 建模
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_xgb.fit(train, y_train)
#预测
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
#结果
print(rmsle(y_train, xgb_train_pred))
print(xgb_train_pred)
print(xgb_pred)


0.199776984384152
[1.8357805 2.4599302 2.1262748 ... 1.4831477 1.2339164 2.33926  ]
[ 4.7318835  6.0349975 14.961876  ...  6.0965757 10.82643    4.381811 ]


In [27]:
#输出
sub = pd.DataFrame()
sub['id'] = test_ID
sub['price'] = xgb_pred
sub.to_csv(r'C:\Users\zyy\Desktop\submission1.csv',index=False)

### GBoost

In [5]:
#结果
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
GBoost.fit(train, y_train)
# 预测
gb_train_pred = GBoost.predict(train)
gb_pred = np.expm1(GBoost.predict(test))
#结果
print(rmsle(y_train, gb_train_pred))
print(gb_train_pred)
print(gb_pred)
#输出
sub = pd.DataFrame()
sub['id'] = test_ID
sub['price'] = gb_pred
sub.to_csv(r'C:\Users\zyy\Desktop\GBoost.csv',index=False)

NameError: name 'xgb_train_pred' is not defined